diff mbox series

[RFC,v1,1/7] fs: Add inode_get_ino() and implement get_ino() for NFS

Message ID 20241010152649.849254-1-mic@digikod.net (mailing list archive)
State New
Headers show
Series [RFC,v1,1/7] fs: Add inode_get_ino() and implement get_ino() for NFS | expand

Commit Message

Mickaël Salaün Oct. 10, 2024, 3:26 p.m. UTC
When a filesystem manages its own inode numbers, like NFS's fileid shown
to user space with getattr(), other part of the kernel may still expose
the private inode->ino through kernel logs and audit.

Another issue is on 32-bit architectures, on which ino_t is 32 bits,
whereas the user space's view of an inode number can still be 64 bits.

Add a new inode_get_ino() helper calling the new struct
inode_operations' get_ino() when set, to get the user space's view of an
inode number.  inode_get_ino() is called by generic_fillattr().

Implement get_ino() for NFS.

Cc: Trond Myklebust <trondmy@kernel.org>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---

I'm not sure about nfs_namespace_getattr(), please review carefully.

I guess there are other filesystems exposing inode numbers different
than inode->i_ino, and they should be patched too.
---
 fs/nfs/inode.c     | 6 ++++--
 fs/nfs/internal.h  | 1 +
 fs/nfs/namespace.c | 2 ++
 fs/stat.c          | 2 +-
 include/linux/fs.h | 9 +++++++++
 5 files changed, 17 insertions(+), 3 deletions(-)

Comments

Anna Schumaker Oct. 10, 2024, 6:07 p.m. UTC | #1
Hi Mickaël,

On 10/10/24 11:26 AM, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid shown
> to user space with getattr(), other part of the kernel may still expose
> the private inode->ino through kernel logs and audit.
> 
> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64 bits.
> 
> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of an
> inode number.  inode_get_ino() is called by generic_fillattr().
> 
> Implement get_ino() for NFS.
> 
> Cc: Trond Myklebust <trondmy@kernel.org>
> Cc: Anna Schumaker <anna@kernel.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> ---
> 
> I'm not sure about nfs_namespace_getattr(), please review carefully.
> 
> I guess there are other filesystems exposing inode numbers different
> than inode->i_ino, and they should be patched too.
> ---
>  fs/nfs/inode.c     | 6 ++++--
>  fs/nfs/internal.h  | 1 +
>  fs/nfs/namespace.c | 2 ++
>  fs/stat.c          | 2 +-
>  include/linux/fs.h | 9 +++++++++
>  5 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 542c7d97b235..5dfc176b6d92 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
>  
>  /**
>   * nfs_compat_user_ino64 - returns the user-visible inode number
> - * @fileid: 64-bit fileid
> + * @inode: inode pointer
>   *
>   * This function returns a 32-bit inode number if the boot parameter
>   * nfs.enable_ino64 is zero.
>   */
> -u64 nfs_compat_user_ino64(u64 fileid)
> +u64 nfs_compat_user_ino64(const struct *inode)
                             ^^^^^^^^^^^^^^^^^^^
This should be "const struct inode *inode"

>  {
>  #ifdef CONFIG_COMPAT
>  	compat_ulong_t ino;
>  #else	
>  	unsigned long ino;
>  #endif
> +	u64 fileid = NFS_FILEID(inode);
>  
>  	if (enable_ino64)
>  		return fileid;
> @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
>  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
>  	return ino;
>  }
> +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
>  
>  int nfs_drop_inode(struct inode *inode)
>  {
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 430733e3eff2..f5555a71a733 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode *inode);
>  extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
>  extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
>  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
> +extern u64 nfs_compat_user_ino64(const struct *inode);

Why add this here when it's already in include/linux/nfs_fs.h? Can you update that declaration instead?

Also, there is a caller for nfs_compat_user_ino64() in fs/nfs/dir.c that needs to be updated. Can you double check that you have CONFIG_NFS_FS=m (or 'y') in your kernel .config? These are all issues my compiler caught when I applied your patch.

Thanks,
Anna

>  
>  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
>  /* localio.c */
> diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> index e7494cdd957e..d9b1e0606833 100644
> --- a/fs/nfs/namespace.c
> +++ b/fs/nfs/namespace.c
> @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  const struct inode_operations nfs_mountpoint_inode_operations = {
>  	.getattr	= nfs_getattr,
>  	.setattr	= nfs_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  const struct inode_operations nfs_referral_inode_operations = {
>  	.getattr	= nfs_namespace_getattr,
>  	.setattr	= nfs_namespace_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  static void nfs_expire_automounts(struct work_struct *work)
> diff --git a/fs/stat.c b/fs/stat.c
> index 41e598376d7e..05636919f94b 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
>  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
>  
>  	stat->dev = inode->i_sb->s_dev;
> -	stat->ino = inode->i_ino;
> +	stat->ino = inode_get_ino(inode);
>  	stat->mode = inode->i_mode;
>  	stat->nlink = inode->i_nlink;
>  	stat->uid = vfsuid_into_kuid(vfsuid);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e3c603d01337..0eba09a21cf7 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2165,6 +2165,7 @@ struct inode_operations {
>  			    struct dentry *dentry, struct fileattr *fa);
>  	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
>  	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
> +	u64 (*get_ino)(const struct inode *inode);
>  } ____cacheline_aligned;
>  
>  static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
>  	return file->f_op->mmap(file, vma);
>  }
>  
> +static inline u64 inode_get_ino(struct inode *inode)
> +{
> +	if (unlikely(inode->i_op->get_ino))
> +		return inode->i_op->get_ino(inode);
> +
> +	return inode->i_ino;
> +}
> +
>  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
>  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
>  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
Trond Myklebust Oct. 10, 2024, 7:28 p.m. UTC | #2
On Thu, 2024-10-10 at 17:26 +0200, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid
> shown
> to user space with getattr(), other part of the kernel may still
> expose
> the private inode->ino through kernel logs and audit.
> 
> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64
> bits.
> 
> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of
> an
> inode number.  inode_get_ino() is called by generic_fillattr().
> 
> Implement get_ino() for NFS.
> 
> Cc: Trond Myklebust <trondmy@kernel.org>
> Cc: Anna Schumaker <anna@kernel.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> ---
> 
> I'm not sure about nfs_namespace_getattr(), please review carefully.
> 
> I guess there are other filesystems exposing inode numbers different
> than inode->i_ino, and they should be patched too.
> ---
>  fs/nfs/inode.c     | 6 ++++--
>  fs/nfs/internal.h  | 1 +
>  fs/nfs/namespace.c | 2 ++
>  fs/stat.c          | 2 +-
>  include/linux/fs.h | 9 +++++++++
>  5 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 542c7d97b235..5dfc176b6d92 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
>  
>  /**
>   * nfs_compat_user_ino64 - returns the user-visible inode number
> - * @fileid: 64-bit fileid
> + * @inode: inode pointer
>   *
>   * This function returns a 32-bit inode number if the boot parameter
>   * nfs.enable_ino64 is zero.
>   */
> -u64 nfs_compat_user_ino64(u64 fileid)
> +u64 nfs_compat_user_ino64(const struct *inode)
>  {
>  #ifdef CONFIG_COMPAT
>  	compat_ulong_t ino;
>  #else	
>  	unsigned long ino;
>  #endif
> +	u64 fileid = NFS_FILEID(inode);
>  
>  	if (enable_ino64)
>  		return fileid;
> @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
>  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
>  	return ino;
>  }
> +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
>  
>  int nfs_drop_inode(struct inode *inode)
>  {
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 430733e3eff2..f5555a71a733 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode
> *inode);
>  extern void nfs_set_cache_invalid(struct inode *inode, unsigned long
> flags);
>  extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
>  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int
> mode);
> +extern u64 nfs_compat_user_ino64(const struct *inode);
>  
>  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
>  /* localio.c */
> diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> index e7494cdd957e..d9b1e0606833 100644
> --- a/fs/nfs/namespace.c
> +++ b/fs/nfs/namespace.c
> @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap,
> struct dentry *dentry,
>  const struct inode_operations nfs_mountpoint_inode_operations = {
>  	.getattr	= nfs_getattr,
>  	.setattr	= nfs_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  const struct inode_operations nfs_referral_inode_operations = {
>  	.getattr	= nfs_namespace_getattr,
>  	.setattr	= nfs_namespace_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  static void nfs_expire_automounts(struct work_struct *work)
> diff --git a/fs/stat.c b/fs/stat.c
> index 41e598376d7e..05636919f94b 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32
> request_mask,
>  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
>  
>  	stat->dev = inode->i_sb->s_dev;
> -	stat->ino = inode->i_ino;
> +	stat->ino = inode_get_ino(inode);
>  	stat->mode = inode->i_mode;
>  	stat->nlink = inode->i_nlink;
>  	stat->uid = vfsuid_into_kuid(vfsuid);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e3c603d01337..0eba09a21cf7 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2165,6 +2165,7 @@ struct inode_operations {
>  			    struct dentry *dentry, struct fileattr
> *fa);
>  	int (*fileattr_get)(struct dentry *dentry, struct fileattr
> *fa);
>  	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
> +	u64 (*get_ino)(const struct inode *inode);
>  } ____cacheline_aligned;
>  
>  static inline int call_mmap(struct file *file, struct vm_area_struct
> *vma)
> @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file *file,
> struct vm_area_struct *vma)
>  	return file->f_op->mmap(file, vma);
>  }
>  
> +static inline u64 inode_get_ino(struct inode *inode)
> +{
> +	if (unlikely(inode->i_op->get_ino))
> +		return inode->i_op->get_ino(inode);
> +
> +	return inode->i_ino;
> +}
> +
>  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t
> *);
>  extern ssize_t vfs_write(struct file *, const char __user *, size_t,
> loff_t *);
>  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct
> file *,

There should be no need to add this callback to generic_fillattr().

generic_fillattr() is a helper function for use by the filesystems
themselves. It should never be called from any outside functions, as
the inode number would be far from the only attribute that will be
incorrect.
Tetsuo Handa Oct. 11, 2024, 10:12 a.m. UTC | #3
On 2024/10/11 0:26, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid shown
> to user space with getattr(), other part of the kernel may still expose
> the private inode->ino through kernel logs and audit.

I can't catch what you are trying to do. What is wrong with that?

> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64 bits.

Currently, ino_t is 32bits on 32-bit architectures, isn't it?
Why do you need to use 64bits on 32-bit architectures?

> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of an
> inode number.  inode_get_ino() is called by generic_fillattr().

What does the user space's view of an inode number mean?
What does the kernel space's view of an inode number mean?
Mickaël Salaün Oct. 11, 2024, 10:14 a.m. UTC | #4
On Thu, Oct 10, 2024 at 02:07:52PM -0400, Anna Schumaker wrote:
> Hi Mickaël,
> 
> On 10/10/24 11:26 AM, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid shown
> > to user space with getattr(), other part of the kernel may still expose
> > the private inode->ino through kernel logs and audit.
> > 
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64 bits.
> > 
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> > 
> > Implement get_ino() for NFS.
> > 
> > Cc: Trond Myklebust <trondmy@kernel.org>
> > Cc: Anna Schumaker <anna@kernel.org>
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Cc: Christian Brauner <brauner@kernel.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > ---
> > 
> > I'm not sure about nfs_namespace_getattr(), please review carefully.
> > 
> > I guess there are other filesystems exposing inode numbers different
> > than inode->i_ino, and they should be patched too.
> > ---
> >  fs/nfs/inode.c     | 6 ++++--
> >  fs/nfs/internal.h  | 1 +
> >  fs/nfs/namespace.c | 2 ++
> >  fs/stat.c          | 2 +-
> >  include/linux/fs.h | 9 +++++++++
> >  5 files changed, 17 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> > index 542c7d97b235..5dfc176b6d92 100644
> > --- a/fs/nfs/inode.c
> > +++ b/fs/nfs/inode.c
> > @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
> >  
> >  /**
> >   * nfs_compat_user_ino64 - returns the user-visible inode number
> > - * @fileid: 64-bit fileid
> > + * @inode: inode pointer
> >   *
> >   * This function returns a 32-bit inode number if the boot parameter
> >   * nfs.enable_ino64 is zero.
> >   */
> > -u64 nfs_compat_user_ino64(u64 fileid)
> > +u64 nfs_compat_user_ino64(const struct *inode)
>                              ^^^^^^^^^^^^^^^^^^^
> This should be "const struct inode *inode"

Indeed...

> 
> >  {
> >  #ifdef CONFIG_COMPAT
> >  	compat_ulong_t ino;
> >  #else	
> >  	unsigned long ino;
> >  #endif
> > +	u64 fileid = NFS_FILEID(inode);
> >  
> >  	if (enable_ino64)
> >  		return fileid;
> > @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
> >  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
> >  	return ino;
> >  }
> > +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
> >  
> >  int nfs_drop_inode(struct inode *inode)
> >  {
> > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > index 430733e3eff2..f5555a71a733 100644
> > --- a/fs/nfs/internal.h
> > +++ b/fs/nfs/internal.h
> > @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode *inode);
> >  extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
> >  extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
> >  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
> > +extern u64 nfs_compat_user_ino64(const struct *inode);
> 
> Why add this here when it's already in include/linux/nfs_fs.h? Can you update that declaration instead?
> 
> Also, there is a caller for nfs_compat_user_ino64() in fs/nfs/dir.c that needs to be updated. Can you double check that you have CONFIG_NFS_FS=m (or 'y') in your kernel .config? These are all issues my compiler caught when I applied your patch.

Oh, I enabled CONFIG_NFSD_V4 and CONFIG_NFS_COMMON but not
CONFIG_NFS_FS, that explains these uncaught errors... Thanks!

About the nfs_do_filldir(), the nfs_compat_user_ino64() call is indeed
not correct with this patch, but I'm wondering why not use ent->ino
instead?

> 
> Thanks,
> Anna
> 
> >  
> >  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
> >  /* localio.c */
> > diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> > index e7494cdd957e..d9b1e0606833 100644
> > --- a/fs/nfs/namespace.c
> > +++ b/fs/nfs/namespace.c
> > @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
> >  const struct inode_operations nfs_mountpoint_inode_operations = {
> >  	.getattr	= nfs_getattr,
> >  	.setattr	= nfs_setattr,
> > +	.get_ino	= nfs_compat_user_ino64,
> >  };
> >  
> >  const struct inode_operations nfs_referral_inode_operations = {
> >  	.getattr	= nfs_namespace_getattr,
> >  	.setattr	= nfs_namespace_setattr,
> > +	.get_ino	= nfs_compat_user_ino64,

Is it correct to use nfs_compat_user_ino64() for both of these inode
operation structs?  With this patch, generic_fileattr() initialize the
stat struct with get_ino(), which might not what we want according to
these nfs_mountpoint and nfs_referral structs.


> >  };
> >  
> >  static void nfs_expire_automounts(struct work_struct *work)
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 41e598376d7e..05636919f94b 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
> >  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
> >  
> >  	stat->dev = inode->i_sb->s_dev;
> > -	stat->ino = inode->i_ino;
> > +	stat->ino = inode_get_ino(inode);
> >  	stat->mode = inode->i_mode;
> >  	stat->nlink = inode->i_nlink;
> >  	stat->uid = vfsuid_into_kuid(vfsuid);
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e3c603d01337..0eba09a21cf7 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2165,6 +2165,7 @@ struct inode_operations {
> >  			    struct dentry *dentry, struct fileattr *fa);
> >  	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
> >  	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
> > +	u64 (*get_ino)(const struct inode *inode);
> >  } ____cacheline_aligned;
> >  
> >  static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> > @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> >  	return file->f_op->mmap(file, vma);
> >  }
> >  
> > +static inline u64 inode_get_ino(struct inode *inode)
> > +{
> > +	if (unlikely(inode->i_op->get_ino))
> > +		return inode->i_op->get_ino(inode);
> > +
> > +	return inode->i_ino;
> > +}
> > +
> >  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> >  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
> >  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> 
>
Mickaël Salaün Oct. 11, 2024, 10:15 a.m. UTC | #5
On Thu, Oct 10, 2024 at 03:28:12PM -0400, Trond Myklebust wrote:
> On Thu, 2024-10-10 at 17:26 +0200, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid
> > shown
> > to user space with getattr(), other part of the kernel may still
> > expose
> > the private inode->ino through kernel logs and audit.
> > 
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64
> > bits.
> > 
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of
> > an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> > 
> > Implement get_ino() for NFS.
> > 
> > Cc: Trond Myklebust <trondmy@kernel.org>
> > Cc: Anna Schumaker <anna@kernel.org>
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Cc: Christian Brauner <brauner@kernel.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > ---
> > 
> > I'm not sure about nfs_namespace_getattr(), please review carefully.
> > 
> > I guess there are other filesystems exposing inode numbers different
> > than inode->i_ino, and they should be patched too.
> > ---
> >  fs/nfs/inode.c     | 6 ++++--
> >  fs/nfs/internal.h  | 1 +
> >  fs/nfs/namespace.c | 2 ++
> >  fs/stat.c          | 2 +-
> >  include/linux/fs.h | 9 +++++++++
> >  5 files changed, 17 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> > index 542c7d97b235..5dfc176b6d92 100644
> > --- a/fs/nfs/inode.c
> > +++ b/fs/nfs/inode.c
> > @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
> >  
> >  /**
> >   * nfs_compat_user_ino64 - returns the user-visible inode number
> > - * @fileid: 64-bit fileid
> > + * @inode: inode pointer
> >   *
> >   * This function returns a 32-bit inode number if the boot parameter
> >   * nfs.enable_ino64 is zero.
> >   */
> > -u64 nfs_compat_user_ino64(u64 fileid)
> > +u64 nfs_compat_user_ino64(const struct *inode)
> >  {
> >  #ifdef CONFIG_COMPAT
> >  	compat_ulong_t ino;
> >  #else	
> >  	unsigned long ino;
> >  #endif
> > +	u64 fileid = NFS_FILEID(inode);
> >  
> >  	if (enable_ino64)
> >  		return fileid;
> > @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
> >  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
> >  	return ino;
> >  }
> > +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
> >  
> >  int nfs_drop_inode(struct inode *inode)
> >  {
> > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > index 430733e3eff2..f5555a71a733 100644
> > --- a/fs/nfs/internal.h
> > +++ b/fs/nfs/internal.h
> > @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode
> > *inode);
> >  extern void nfs_set_cache_invalid(struct inode *inode, unsigned long
> > flags);
> >  extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
> >  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int
> > mode);
> > +extern u64 nfs_compat_user_ino64(const struct *inode);
> >  
> >  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
> >  /* localio.c */
> > diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> > index e7494cdd957e..d9b1e0606833 100644
> > --- a/fs/nfs/namespace.c
> > +++ b/fs/nfs/namespace.c
> > @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap,
> > struct dentry *dentry,
> >  const struct inode_operations nfs_mountpoint_inode_operations = {
> >  	.getattr	= nfs_getattr,
> >  	.setattr	= nfs_setattr,
> > +	.get_ino	= nfs_compat_user_ino64,
> >  };
> >  
> >  const struct inode_operations nfs_referral_inode_operations = {
> >  	.getattr	= nfs_namespace_getattr,
> >  	.setattr	= nfs_namespace_setattr,
> > +	.get_ino	= nfs_compat_user_ino64,
> >  };
> >  
> >  static void nfs_expire_automounts(struct work_struct *work)
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 41e598376d7e..05636919f94b 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32
> > request_mask,
> >  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
> >  
> >  	stat->dev = inode->i_sb->s_dev;
> > -	stat->ino = inode->i_ino;
> > +	stat->ino = inode_get_ino(inode);
> >  	stat->mode = inode->i_mode;
> >  	stat->nlink = inode->i_nlink;
> >  	stat->uid = vfsuid_into_kuid(vfsuid);
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e3c603d01337..0eba09a21cf7 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2165,6 +2165,7 @@ struct inode_operations {
> >  			    struct dentry *dentry, struct fileattr
> > *fa);
> >  	int (*fileattr_get)(struct dentry *dentry, struct fileattr
> > *fa);
> >  	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
> > +	u64 (*get_ino)(const struct inode *inode);
> >  } ____cacheline_aligned;
> >  
> >  static inline int call_mmap(struct file *file, struct vm_area_struct
> > *vma)
> > @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file *file,
> > struct vm_area_struct *vma)
> >  	return file->f_op->mmap(file, vma);
> >  }
> >  
> > +static inline u64 inode_get_ino(struct inode *inode)
> > +{
> > +	if (unlikely(inode->i_op->get_ino))
> > +		return inode->i_op->get_ino(inode);
> > +
> > +	return inode->i_ino;
> > +}
> > +
> >  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t
> > *);
> >  extern ssize_t vfs_write(struct file *, const char __user *, size_t,
> > loff_t *);
> >  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct
> > file *,
> 
> There should be no need to add this callback to generic_fillattr().
> 
> generic_fillattr() is a helper function for use by the filesystems
> themselves. It should never be called from any outside functions, as
> the inode number would be far from the only attribute that will be
> incorrect.

This change will not impact filesystems except the ones that implement the new
get_ino() operation, and I suspect NFS is not or will not be the only one.  We
need to investigate on other filesystems but I wanted to get a first feedback
before.  Using get_ino() in generic_fillattr() should guarantee a consistent
getattr() wrt inode numbers.  I forgot to remove the now-useless call to
nfs_compat_user_ino64() in nfs_getattr() for this to make sense:

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 542c7d97b235..78a9e907c905 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -981,7 +983,6 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
        stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
-       stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
        stat->change_cookie = inode_peek_iversion_raw(inode);
        stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
        if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED)

Al, Christian, what do you think about this generic_fillattr() change?

> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
>
Tetsuo Handa Oct. 11, 2024, 10:54 a.m. UTC | #6
On 2024/10/11 19:12, Tetsuo Handa wrote:
> On 2024/10/11 0:26, Mickaël Salaün wrote:
>> When a filesystem manages its own inode numbers, like NFS's fileid shown
>> to user space with getattr(), other part of the kernel may still expose
>> the private inode->ino through kernel logs and audit.
> 
> I can't catch what you are trying to do. What is wrong with that?
> 
>> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
>> whereas the user space's view of an inode number can still be 64 bits.
> 
> Currently, ino_t is 32bits on 32-bit architectures, isn't it?
> Why do you need to use 64bits on 32-bit architectures?

Changing from 32bits to 64bits for communicating with userspace programs
breaks userspace programs using "ino_t" (or "unsigned long") for handling
inode numbers, doesn't it? Attempt to change from %lu to %llu will not be
acceptable unless the upper 32bits are guaranteed to be 0 on 32-bit
architectures.

Since syslogd/auditd are not the only programs that parse kernel logs and
audit logs, updating only syslogd/auditd is not sufficient. We must not break
existing userspace programs, and thus we can't change the format string.

> 
>> Add a new inode_get_ino() helper calling the new struct
>> inode_operations' get_ino() when set, to get the user space's view of an
>> inode number.  inode_get_ino() is called by generic_fillattr().
> 
> What does the user space's view of an inode number mean?
> What does the kernel space's view of an inode number mean?
>
Mickaël Salaün Oct. 11, 2024, 11:04 a.m. UTC | #7
On Fri, Oct 11, 2024 at 07:12:17PM +0900, Tetsuo Handa wrote:
> On 2024/10/11 0:26, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid shown
> > to user space with getattr(), other part of the kernel may still expose
> > the private inode->ino through kernel logs and audit.
> 
> I can't catch what you are trying to do. What is wrong with that?

My understanding is that tomoyo_get_attributes() is used to log or
expose access requests to user space, including inode numbers.  Is that
correct?  If yes, then the inode numbers might not reflect what user
space sees with stat(2).

> 
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64 bits.
> 
> Currently, ino_t is 32bits on 32-bit architectures, isn't it?
> Why do you need to use 64bits on 32-bit architectures?

ino_t is indeed 32 bits on 32-bit architectures, but user space can
still get a 64-bit stat->ino value.

> 
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> 
> What does the user space's view of an inode number mean?

It's the value user space gets with stat(2).

> What does the kernel space's view of an inode number mean?

It's struct inode->i_ino and ino_t variables.
Mickaël Salaün Oct. 11, 2024, 11:10 a.m. UTC | #8
On Fri, Oct 11, 2024 at 07:54:58PM +0900, Tetsuo Handa wrote:
> On 2024/10/11 19:12, Tetsuo Handa wrote:
> > On 2024/10/11 0:26, Mickaël Salaün wrote:

> >> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> >> whereas the user space's view of an inode number can still be 64 bits.
> > 
> > Currently, ino_t is 32bits on 32-bit architectures, isn't it?
> > Why do you need to use 64bits on 32-bit architectures?
> 
> Changing from 32bits to 64bits for communicating with userspace programs
> breaks userspace programs using "ino_t" (or "unsigned long") for handling
> inode numbers, doesn't it? Attempt to change from %lu to %llu will not be
> acceptable unless the upper 32bits are guaranteed to be 0 on 32-bit
> architectures.
> 
> Since syslogd/auditd are not the only programs that parse kernel logs and
> audit logs, updating only syslogd/auditd is not sufficient. We must not break
> existing userspace programs, and thus we can't change the format string.

This might only break user space that wrongfully uses 32-bit variables
to store 64-bit values.  According to the UAPI, inode numbers should be
64 bits.
Trond Myklebust Oct. 11, 2024, 12:22 p.m. UTC | #9
On Fri, 2024-10-11 at 12:15 +0200, Mickaël Salaün wrote:
> On Thu, Oct 10, 2024 at 03:28:12PM -0400, Trond Myklebust wrote:
> > On Thu, 2024-10-10 at 17:26 +0200, Mickaël Salaün wrote:
> > > When a filesystem manages its own inode numbers, like NFS's
> > > fileid
> > > shown
> > > to user space with getattr(), other part of the kernel may still
> > > expose
> > > the private inode->ino through kernel logs and audit.
> > > 
> > > Another issue is on 32-bit architectures, on which ino_t is 32
> > > bits,
> > > whereas the user space's view of an inode number can still be 64
> > > bits.
> > > 
> > > Add a new inode_get_ino() helper calling the new struct
> > > inode_operations' get_ino() when set, to get the user space's
> > > view of
> > > an
> > > inode number.  inode_get_ino() is called by generic_fillattr().
> > > 
> > > Implement get_ino() for NFS.
> > > 
> > > Cc: Trond Myklebust <trondmy@kernel.org>
> > > Cc: Anna Schumaker <anna@kernel.org>
> > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > Cc: Christian Brauner <brauner@kernel.org>
> > > Cc: Jan Kara <jack@suse.cz>
> > > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > > ---
> > > 
> > > I'm not sure about nfs_namespace_getattr(), please review
> > > carefully.
> > > 
> > > I guess there are other filesystems exposing inode numbers
> > > different
> > > than inode->i_ino, and they should be patched too.
> > > ---
> > >  fs/nfs/inode.c     | 6 ++++--
> > >  fs/nfs/internal.h  | 1 +
> > >  fs/nfs/namespace.c | 2 ++
> > >  fs/stat.c          | 2 +-
> > >  include/linux/fs.h | 9 +++++++++
> > >  5 files changed, 17 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> > > index 542c7d97b235..5dfc176b6d92 100644
> > > --- a/fs/nfs/inode.c
> > > +++ b/fs/nfs/inode.c
> > > @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
> > >  
> > >  /**
> > >   * nfs_compat_user_ino64 - returns the user-visible inode number
> > > - * @fileid: 64-bit fileid
> > > + * @inode: inode pointer
> > >   *
> > >   * This function returns a 32-bit inode number if the boot
> > > parameter
> > >   * nfs.enable_ino64 is zero.
> > >   */
> > > -u64 nfs_compat_user_ino64(u64 fileid)
> > > +u64 nfs_compat_user_ino64(const struct *inode)
> > >  {
> > >  #ifdef CONFIG_COMPAT
> > >  	compat_ulong_t ino;
> > >  #else	
> > >  	unsigned long ino;
> > >  #endif
> > > +	u64 fileid = NFS_FILEID(inode);
> > >  
> > >  	if (enable_ino64)
> > >  		return fileid;
> > > @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
> > >  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) *
> > > 8;
> > >  	return ino;
> > >  }
> > > +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
> > >  
> > >  int nfs_drop_inode(struct inode *inode)
> > >  {
> > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > > index 430733e3eff2..f5555a71a733 100644
> > > --- a/fs/nfs/internal.h
> > > +++ b/fs/nfs/internal.h
> > > @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode
> > > *inode);
> > >  extern void nfs_set_cache_invalid(struct inode *inode, unsigned
> > > long
> > > flags);
> > >  extern bool nfs_check_cache_invalid(struct inode *, unsigned
> > > long);
> > >  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int
> > > mode);
> > > +extern u64 nfs_compat_user_ino64(const struct *inode);
> > >  
> > >  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
> > >  /* localio.c */
> > > diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> > > index e7494cdd957e..d9b1e0606833 100644
> > > --- a/fs/nfs/namespace.c
> > > +++ b/fs/nfs/namespace.c
> > > @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap
> > > *idmap,
> > > struct dentry *dentry,
> > >  const struct inode_operations nfs_mountpoint_inode_operations =
> > > {
> > >  	.getattr	= nfs_getattr,
> > >  	.setattr	= nfs_setattr,
> > > +	.get_ino	= nfs_compat_user_ino64,
> > >  };
> > >  
> > >  const struct inode_operations nfs_referral_inode_operations = {
> > >  	.getattr	= nfs_namespace_getattr,
> > >  	.setattr	= nfs_namespace_setattr,
> > > +	.get_ino	= nfs_compat_user_ino64,
> > >  };
> > >  
> > >  static void nfs_expire_automounts(struct work_struct *work)
> > > diff --git a/fs/stat.c b/fs/stat.c
> > > index 41e598376d7e..05636919f94b 100644
> > > --- a/fs/stat.c
> > > +++ b/fs/stat.c
> > > @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap,
> > > u32
> > > request_mask,
> > >  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
> > >  
> > >  	stat->dev = inode->i_sb->s_dev;
> > > -	stat->ino = inode->i_ino;
> > > +	stat->ino = inode_get_ino(inode);
> > >  	stat->mode = inode->i_mode;
> > >  	stat->nlink = inode->i_nlink;
> > >  	stat->uid = vfsuid_into_kuid(vfsuid);
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index e3c603d01337..0eba09a21cf7 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -2165,6 +2165,7 @@ struct inode_operations {
> > >  			    struct dentry *dentry, struct
> > > fileattr
> > > *fa);
> > >  	int (*fileattr_get)(struct dentry *dentry, struct
> > > fileattr
> > > *fa);
> > >  	struct offset_ctx *(*get_offset_ctx)(struct inode
> > > *inode);
> > > +	u64 (*get_ino)(const struct inode *inode);
> > >  } ____cacheline_aligned;
> > >  
> > >  static inline int call_mmap(struct file *file, struct
> > > vm_area_struct
> > > *vma)
> > > @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file
> > > *file,
> > > struct vm_area_struct *vma)
> > >  	return file->f_op->mmap(file, vma);
> > >  }
> > >  
> > > +static inline u64 inode_get_ino(struct inode *inode)
> > > +{
> > > +	if (unlikely(inode->i_op->get_ino))
> > > +		return inode->i_op->get_ino(inode);
> > > +
> > > +	return inode->i_ino;
> > > +}
> > > +
> > >  extern ssize_t vfs_read(struct file *, char __user *, size_t,
> > > loff_t
> > > *);
> > >  extern ssize_t vfs_write(struct file *, const char __user *,
> > > size_t,
> > > loff_t *);
> > >  extern ssize_t vfs_copy_file_range(struct file *, loff_t ,
> > > struct
> > > file *,
> > 
> > There should be no need to add this callback to generic_fillattr().
> > 
> > generic_fillattr() is a helper function for use by the filesystems
> > themselves. It should never be called from any outside functions,
> > as
> > the inode number would be far from the only attribute that will be
> > incorrect.
> 
> This change will not impact filesystems except the ones that
> implement the new
> get_ino() operation, and I suspect NFS is not or will not be the only
> one.  We
> need to investigate on other filesystems but I wanted to get a first
> feedback
> before.  Using get_ino() in generic_fillattr() should guarantee a
> consistent
> getattr() wrt inode numbers.  I forgot to remove the now-useless call
> to
> nfs_compat_user_ino64() in nfs_getattr() for this to make sense:

You're missing my point. From the point of view of NFS, all you're
doing is to replace a relatively fast direct call to
nfs_compat_user_ino64() with a much slower callback. There is no
benefit at all to anyone in doing so.

Yes, other filesystems may also want to replace this and/or other
fields in the "struct kstat" that they return, but none of them should
have a problem with doing that after the actual call to
generic_fillattr().
Christoph Hellwig Oct. 11, 2024, 12:30 p.m. UTC | #10
On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid shown
> to user space with getattr(), other part of the kernel may still expose
> the private inode->ino through kernel logs and audit.
> 
> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64 bits.
> 
> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of an
> inode number.  inode_get_ino() is called by generic_fillattr().
> 
> Implement get_ino() for NFS.

The proper interface for that is ->getattr, and you should use that for
all file systems (through the proper vfs wrappers).

And yes, it's really time to move to a 64-bit i_ino, but that's a
separate discussion.
Mickaël Salaün Oct. 11, 2024, 12:38 p.m. UTC | #11
On Fri, Oct 11, 2024 at 08:22:51AM -0400, Trond Myklebust wrote:
> On Fri, 2024-10-11 at 12:15 +0200, Mickaël Salaün wrote:
> > On Thu, Oct 10, 2024 at 03:28:12PM -0400, Trond Myklebust wrote:
> > > On Thu, 2024-10-10 at 17:26 +0200, Mickaël Salaün wrote:
> > > > When a filesystem manages its own inode numbers, like NFS's
> > > > fileid
> > > > shown
> > > > to user space with getattr(), other part of the kernel may still
> > > > expose
> > > > the private inode->ino through kernel logs and audit.
> > > > 
> > > > Another issue is on 32-bit architectures, on which ino_t is 32
> > > > bits,
> > > > whereas the user space's view of an inode number can still be 64
> > > > bits.
> > > > 
> > > > Add a new inode_get_ino() helper calling the new struct
> > > > inode_operations' get_ino() when set, to get the user space's
> > > > view of
> > > > an
> > > > inode number.  inode_get_ino() is called by generic_fillattr().
> > > > 
> > > > Implement get_ino() for NFS.
> > > > 
> > > > Cc: Trond Myklebust <trondmy@kernel.org>
> > > > Cc: Anna Schumaker <anna@kernel.org>
> > > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > > Cc: Christian Brauner <brauner@kernel.org>
> > > > Cc: Jan Kara <jack@suse.cz>
> > > > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > > > ---
> > > > 
> > > > I'm not sure about nfs_namespace_getattr(), please review
> > > > carefully.
> > > > 
> > > > I guess there are other filesystems exposing inode numbers
> > > > different
> > > > than inode->i_ino, and they should be patched too.
> > > > ---
> > > >  fs/nfs/inode.c     | 6 ++++--
> > > >  fs/nfs/internal.h  | 1 +
> > > >  fs/nfs/namespace.c | 2 ++
> > > >  fs/stat.c          | 2 +-
> > > >  include/linux/fs.h | 9 +++++++++
> > > >  5 files changed, 17 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> > > > index 542c7d97b235..5dfc176b6d92 100644
> > > > --- a/fs/nfs/inode.c
> > > > +++ b/fs/nfs/inode.c
> > > > @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
> > > >  
> > > >  /**
> > > >   * nfs_compat_user_ino64 - returns the user-visible inode number
> > > > - * @fileid: 64-bit fileid
> > > > + * @inode: inode pointer
> > > >   *
> > > >   * This function returns a 32-bit inode number if the boot
> > > > parameter
> > > >   * nfs.enable_ino64 is zero.
> > > >   */
> > > > -u64 nfs_compat_user_ino64(u64 fileid)
> > > > +u64 nfs_compat_user_ino64(const struct *inode)
> > > >  {
> > > >  #ifdef CONFIG_COMPAT
> > > >  	compat_ulong_t ino;
> > > >  #else	
> > > >  	unsigned long ino;
> > > >  #endif
> > > > +	u64 fileid = NFS_FILEID(inode);
> > > >  
> > > >  	if (enable_ino64)
> > > >  		return fileid;
> > > > @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
> > > >  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) *
> > > > 8;
> > > >  	return ino;
> > > >  }
> > > > +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
> > > >  
> > > >  int nfs_drop_inode(struct inode *inode)
> > > >  {
> > > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > > > index 430733e3eff2..f5555a71a733 100644
> > > > --- a/fs/nfs/internal.h
> > > > +++ b/fs/nfs/internal.h
> > > > @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode
> > > > *inode);
> > > >  extern void nfs_set_cache_invalid(struct inode *inode, unsigned
> > > > long
> > > > flags);
> > > >  extern bool nfs_check_cache_invalid(struct inode *, unsigned
> > > > long);
> > > >  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int
> > > > mode);
> > > > +extern u64 nfs_compat_user_ino64(const struct *inode);
> > > >  
> > > >  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
> > > >  /* localio.c */
> > > > diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> > > > index e7494cdd957e..d9b1e0606833 100644
> > > > --- a/fs/nfs/namespace.c
> > > > +++ b/fs/nfs/namespace.c
> > > > @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap
> > > > *idmap,
> > > > struct dentry *dentry,
> > > >  const struct inode_operations nfs_mountpoint_inode_operations =
> > > > {
> > > >  	.getattr	= nfs_getattr,
> > > >  	.setattr	= nfs_setattr,
> > > > +	.get_ino	= nfs_compat_user_ino64,
> > > >  };
> > > >  
> > > >  const struct inode_operations nfs_referral_inode_operations = {
> > > >  	.getattr	= nfs_namespace_getattr,
> > > >  	.setattr	= nfs_namespace_setattr,
> > > > +	.get_ino	= nfs_compat_user_ino64,
> > > >  };
> > > >  
> > > >  static void nfs_expire_automounts(struct work_struct *work)
> > > > diff --git a/fs/stat.c b/fs/stat.c
> > > > index 41e598376d7e..05636919f94b 100644
> > > > --- a/fs/stat.c
> > > > +++ b/fs/stat.c
> > > > @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap,
> > > > u32
> > > > request_mask,
> > > >  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
> > > >  
> > > >  	stat->dev = inode->i_sb->s_dev;
> > > > -	stat->ino = inode->i_ino;
> > > > +	stat->ino = inode_get_ino(inode);
> > > >  	stat->mode = inode->i_mode;
> > > >  	stat->nlink = inode->i_nlink;
> > > >  	stat->uid = vfsuid_into_kuid(vfsuid);
> > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > index e3c603d01337..0eba09a21cf7 100644
> > > > --- a/include/linux/fs.h
> > > > +++ b/include/linux/fs.h
> > > > @@ -2165,6 +2165,7 @@ struct inode_operations {
> > > >  			    struct dentry *dentry, struct
> > > > fileattr
> > > > *fa);
> > > >  	int (*fileattr_get)(struct dentry *dentry, struct
> > > > fileattr
> > > > *fa);
> > > >  	struct offset_ctx *(*get_offset_ctx)(struct inode
> > > > *inode);
> > > > +	u64 (*get_ino)(const struct inode *inode);
> > > >  } ____cacheline_aligned;
> > > >  
> > > >  static inline int call_mmap(struct file *file, struct
> > > > vm_area_struct
> > > > *vma)
> > > > @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file
> > > > *file,
> > > > struct vm_area_struct *vma)
> > > >  	return file->f_op->mmap(file, vma);
> > > >  }
> > > >  
> > > > +static inline u64 inode_get_ino(struct inode *inode)
> > > > +{
> > > > +	if (unlikely(inode->i_op->get_ino))
> > > > +		return inode->i_op->get_ino(inode);
> > > > +
> > > > +	return inode->i_ino;
> > > > +}
> > > > +
> > > >  extern ssize_t vfs_read(struct file *, char __user *, size_t,
> > > > loff_t
> > > > *);
> > > >  extern ssize_t vfs_write(struct file *, const char __user *,
> > > > size_t,
> > > > loff_t *);
> > > >  extern ssize_t vfs_copy_file_range(struct file *, loff_t ,
> > > > struct
> > > > file *,
> > > 
> > > There should be no need to add this callback to generic_fillattr().
> > > 
> > > generic_fillattr() is a helper function for use by the filesystems
> > > themselves. It should never be called from any outside functions,
> > > as
> > > the inode number would be far from the only attribute that will be
> > > incorrect.
> > 
> > This change will not impact filesystems except the ones that
> > implement the new
> > get_ino() operation, and I suspect NFS is not or will not be the only
> > one.  We
> > need to investigate on other filesystems but I wanted to get a first
> > feedback
> > before.  Using get_ino() in generic_fillattr() should guarantee a
> > consistent
> > getattr() wrt inode numbers.  I forgot to remove the now-useless call
> > to
> > nfs_compat_user_ino64() in nfs_getattr() for this to make sense:
> 
> You're missing my point. From the point of view of NFS, all you're
> doing is to replace a relatively fast direct call to
> nfs_compat_user_ino64() with a much slower callback. There is no
> benefit at all to anyone in doing so.
> 
> Yes, other filesystems may also want to replace this and/or other
> fields in the "struct kstat" that they return, but none of them should
> have a problem with doing that after the actual call to
> generic_fillattr().

OK, I'll remove this part then.

> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
>
Mickaël Salaün Oct. 11, 2024, 12:43 p.m. UTC | #12
On Fri, Oct 11, 2024 at 02:38:29PM +0200, Mickaël Salaün wrote:
> On Fri, Oct 11, 2024 at 08:22:51AM -0400, Trond Myklebust wrote:
> > On Fri, 2024-10-11 at 12:15 +0200, Mickaël Salaün wrote:
> > > On Thu, Oct 10, 2024 at 03:28:12PM -0400, Trond Myklebust wrote:
> > > > On Thu, 2024-10-10 at 17:26 +0200, Mickaël Salaün wrote:
> > > > > When a filesystem manages its own inode numbers, like NFS's
> > > > > fileid
> > > > > shown
> > > > > to user space with getattr(), other part of the kernel may still
> > > > > expose
> > > > > the private inode->ino through kernel logs and audit.
> > > > > 
> > > > > Another issue is on 32-bit architectures, on which ino_t is 32
> > > > > bits,
> > > > > whereas the user space's view of an inode number can still be 64
> > > > > bits.
> > > > > 
> > > > > Add a new inode_get_ino() helper calling the new struct
> > > > > inode_operations' get_ino() when set, to get the user space's
> > > > > view of
> > > > > an
> > > > > inode number.  inode_get_ino() is called by generic_fillattr().
> > > > > 
> > > > > Implement get_ino() for NFS.
> > > > > 
> > > > > Cc: Trond Myklebust <trondmy@kernel.org>
> > > > > Cc: Anna Schumaker <anna@kernel.org>
> > > > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > > > Cc: Christian Brauner <brauner@kernel.org>
> > > > > Cc: Jan Kara <jack@suse.cz>
> > > > > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > > > > ---
> > > > > 
> > > > > I'm not sure about nfs_namespace_getattr(), please review
> > > > > carefully.
> > > > > 
> > > > > I guess there are other filesystems exposing inode numbers
> > > > > different
> > > > > than inode->i_ino, and they should be patched too.
> > > > > ---
> > > > >  fs/nfs/inode.c     | 6 ++++--
> > > > >  fs/nfs/internal.h  | 1 +
> > > > >  fs/nfs/namespace.c | 2 ++
> > > > >  fs/stat.c          | 2 +-
> > > > >  include/linux/fs.h | 9 +++++++++
> > > > >  5 files changed, 17 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> > > > > index 542c7d97b235..5dfc176b6d92 100644
> > > > > --- a/fs/nfs/inode.c
> > > > > +++ b/fs/nfs/inode.c
> > > > > @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
> > > > >  
> > > > >  /**
> > > > >   * nfs_compat_user_ino64 - returns the user-visible inode number
> > > > > - * @fileid: 64-bit fileid
> > > > > + * @inode: inode pointer
> > > > >   *
> > > > >   * This function returns a 32-bit inode number if the boot
> > > > > parameter
> > > > >   * nfs.enable_ino64 is zero.
> > > > >   */
> > > > > -u64 nfs_compat_user_ino64(u64 fileid)
> > > > > +u64 nfs_compat_user_ino64(const struct *inode)
> > > > >  {
> > > > >  #ifdef CONFIG_COMPAT
> > > > >  	compat_ulong_t ino;
> > > > >  #else	
> > > > >  	unsigned long ino;
> > > > >  #endif
> > > > > +	u64 fileid = NFS_FILEID(inode);
> > > > >  
> > > > >  	if (enable_ino64)
> > > > >  		return fileid;
> > > > > @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
> > > > >  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) *
> > > > > 8;
> > > > >  	return ino;
> > > > >  }
> > > > > +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
> > > > >  
> > > > >  int nfs_drop_inode(struct inode *inode)
> > > > >  {
> > > > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > > > > index 430733e3eff2..f5555a71a733 100644
> > > > > --- a/fs/nfs/internal.h
> > > > > +++ b/fs/nfs/internal.h
> > > > > @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode
> > > > > *inode);
> > > > >  extern void nfs_set_cache_invalid(struct inode *inode, unsigned
> > > > > long
> > > > > flags);
> > > > >  extern bool nfs_check_cache_invalid(struct inode *, unsigned
> > > > > long);
> > > > >  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int
> > > > > mode);
> > > > > +extern u64 nfs_compat_user_ino64(const struct *inode);
> > > > >  
> > > > >  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
> > > > >  /* localio.c */
> > > > > diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> > > > > index e7494cdd957e..d9b1e0606833 100644
> > > > > --- a/fs/nfs/namespace.c
> > > > > +++ b/fs/nfs/namespace.c
> > > > > @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap
> > > > > *idmap,
> > > > > struct dentry *dentry,
> > > > >  const struct inode_operations nfs_mountpoint_inode_operations =
> > > > > {
> > > > >  	.getattr	= nfs_getattr,
> > > > >  	.setattr	= nfs_setattr,
> > > > > +	.get_ino	= nfs_compat_user_ino64,
> > > > >  };
> > > > >  
> > > > >  const struct inode_operations nfs_referral_inode_operations = {
> > > > >  	.getattr	= nfs_namespace_getattr,
> > > > >  	.setattr	= nfs_namespace_setattr,
> > > > > +	.get_ino	= nfs_compat_user_ino64,
> > > > >  };
> > > > >  
> > > > >  static void nfs_expire_automounts(struct work_struct *work)
> > > > > diff --git a/fs/stat.c b/fs/stat.c
> > > > > index 41e598376d7e..05636919f94b 100644
> > > > > --- a/fs/stat.c
> > > > > +++ b/fs/stat.c
> > > > > @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap,
> > > > > u32
> > > > > request_mask,
> > > > >  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
> > > > >  
> > > > >  	stat->dev = inode->i_sb->s_dev;
> > > > > -	stat->ino = inode->i_ino;
> > > > > +	stat->ino = inode_get_ino(inode);
> > > > >  	stat->mode = inode->i_mode;
> > > > >  	stat->nlink = inode->i_nlink;
> > > > >  	stat->uid = vfsuid_into_kuid(vfsuid);
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index e3c603d01337..0eba09a21cf7 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -2165,6 +2165,7 @@ struct inode_operations {
> > > > >  			    struct dentry *dentry, struct
> > > > > fileattr
> > > > > *fa);
> > > > >  	int (*fileattr_get)(struct dentry *dentry, struct
> > > > > fileattr
> > > > > *fa);
> > > > >  	struct offset_ctx *(*get_offset_ctx)(struct inode
> > > > > *inode);
> > > > > +	u64 (*get_ino)(const struct inode *inode);
> > > > >  } ____cacheline_aligned;
> > > > >  
> > > > >  static inline int call_mmap(struct file *file, struct
> > > > > vm_area_struct
> > > > > *vma)
> > > > > @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file
> > > > > *file,
> > > > > struct vm_area_struct *vma)
> > > > >  	return file->f_op->mmap(file, vma);
> > > > >  }
> > > > >  
> > > > > +static inline u64 inode_get_ino(struct inode *inode)
> > > > > +{
> > > > > +	if (unlikely(inode->i_op->get_ino))
> > > > > +		return inode->i_op->get_ino(inode);
> > > > > +
> > > > > +	return inode->i_ino;
> > > > > +}
> > > > > +
> > > > >  extern ssize_t vfs_read(struct file *, char __user *, size_t,
> > > > > loff_t
> > > > > *);
> > > > >  extern ssize_t vfs_write(struct file *, const char __user *,
> > > > > size_t,
> > > > > loff_t *);
> > > > >  extern ssize_t vfs_copy_file_range(struct file *, loff_t ,
> > > > > struct
> > > > > file *,
> > > > 
> > > > There should be no need to add this callback to generic_fillattr().
> > > > 
> > > > generic_fillattr() is a helper function for use by the filesystems
> > > > themselves. It should never be called from any outside functions,
> > > > as
> > > > the inode number would be far from the only attribute that will be
> > > > incorrect.
> > > 
> > > This change will not impact filesystems except the ones that
> > > implement the new
> > > get_ino() operation, and I suspect NFS is not or will not be the only
> > > one.  We
> > > need to investigate on other filesystems but I wanted to get a first
> > > feedback
> > > before.  Using get_ino() in generic_fillattr() should guarantee a
> > > consistent
> > > getattr() wrt inode numbers.  I forgot to remove the now-useless call
> > > to
> > > nfs_compat_user_ino64() in nfs_getattr() for this to make sense:
> > 
> > You're missing my point. From the point of view of NFS, all you're
> > doing is to replace a relatively fast direct call to
> > nfs_compat_user_ino64() with a much slower callback. There is no
> > benefit at all to anyone in doing so.
> > 
> > Yes, other filesystems may also want to replace this and/or other
> > fields in the "struct kstat" that they return, but none of them should
> > have a problem with doing that after the actual call to
> > generic_fillattr().
> 
> OK, I'll remove this part then.

My concern is about maintaining consistency.  If this get_ino()
operation is not visible to filesystem developers, I think there is a
good chance for desynchronization between the getattr() implementation
and get get_ino().  Anyway, I guess the performance argument wins.

> 
> > 
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> > 
> >
Mickaël Salaün Oct. 11, 2024, 12:47 p.m. UTC | #13
On Fri, Oct 11, 2024 at 05:30:12AM -0700, Christoph Hellwig wrote:
> On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid shown
> > to user space with getattr(), other part of the kernel may still expose
> > the private inode->ino through kernel logs and audit.
> > 
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64 bits.
> > 
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> > 
> > Implement get_ino() for NFS.
> 
> The proper interface for that is ->getattr, and you should use that for
> all file systems (through the proper vfs wrappers).

How to get the inode number with ->getattr and only a struct inode?

> 
> And yes, it's really time to move to a 64-bit i_ino, but that's a
> separate discussion.
Christoph Hellwig Oct. 11, 2024, 12:54 p.m. UTC | #14
On Fri, Oct 11, 2024 at 02:47:14PM +0200, Mickaël Salaün wrote:
> How to get the inode number with ->getattr and only a struct inode?

You get a struct kstat and extract it from that.
Mickaël Salaün Oct. 11, 2024, 1:20 p.m. UTC | #15
On Fri, Oct 11, 2024 at 05:54:37AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 11, 2024 at 02:47:14PM +0200, Mickaël Salaün wrote:
> > How to get the inode number with ->getattr and only a struct inode?
> 
> You get a struct kstat and extract it from that.

Yes, but how do you call getattr() without a path?
Christoph Hellwig Oct. 11, 2024, 1:23 p.m. UTC | #16
On Fri, Oct 11, 2024 at 03:20:30PM +0200, Mickaël Salaün wrote:
> On Fri, Oct 11, 2024 at 05:54:37AM -0700, Christoph Hellwig wrote:
> > On Fri, Oct 11, 2024 at 02:47:14PM +0200, Mickaël Salaün wrote:
> > > How to get the inode number with ->getattr and only a struct inode?
> > 
> > You get a struct kstat and extract it from that.
> 
> Yes, but how do you call getattr() without a path?

You don't because inode numbers are irrelevant without the path.
Mickaël Salaün Oct. 11, 2024, 1:52 p.m. UTC | #17
On Fri, Oct 11, 2024 at 06:23:48AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 11, 2024 at 03:20:30PM +0200, Mickaël Salaün wrote:
> > On Fri, Oct 11, 2024 at 05:54:37AM -0700, Christoph Hellwig wrote:
> > > On Fri, Oct 11, 2024 at 02:47:14PM +0200, Mickaël Salaün wrote:
> > > > How to get the inode number with ->getattr and only a struct inode?
> > > 
> > > You get a struct kstat and extract it from that.
> > 
> > Yes, but how do you call getattr() without a path?
> 
> You don't because inode numbers are irrelevant without the path.

They are for kernel messages and audit logs.  Please take a look at the
use cases with the other patches.
Tetsuo Handa Oct. 11, 2024, 2:27 p.m. UTC | #18
On 2024/10/11 20:04, Mickaël Salaün wrote:
> On Fri, Oct 11, 2024 at 07:12:17PM +0900, Tetsuo Handa wrote:
>> On 2024/10/11 0:26, Mickaël Salaün wrote:
>>> When a filesystem manages its own inode numbers, like NFS's fileid shown
>>> to user space with getattr(), other part of the kernel may still expose
>>> the private inode->ino through kernel logs and audit.
>>
>> I can't catch what you are trying to do. What is wrong with that?
> 
> My understanding is that tomoyo_get_attributes() is used to log or
> expose access requests to user space, including inode numbers.  Is that
> correct?  If yes, then the inode numbers might not reflect what user
> space sees with stat(2).

Several questions because I've never seen inode number beyond UINT_MAX...

Since "struct inode"->i_ino is "unsigned long" (which is 32bits on 32-bit
architectures), despite stat(2) is ready to receive inode number as 64bits,
filesystems (except NFS) did not use inode numbers beyond UINT_MAX until now
so that fs/stat.c will not hit -EOVERFLOW condition, and that resulted in
misuse of %lu for e.g. audit logs?

But NFS was already using inode numbers beyond UINT_MAX, and e.g. audit logs
had been recording incorrect values when NFS is used?

Or, some filesystems are already using inode numbers beyond UINT_MAX but the
capacity limitation on 32-bit architectures practically prevented users from
creating/mounting filesystems with so many inodes enough to require inode
numbers going beyond UINT_MAX?



You are trying to fix out-of-sync between stat(2) and e.g. audit logs
rather than introducing new feature, aren't you?

Then, what you are trying to do is OK, but TOMOYO side needs more changes.
Since TOMOYO is currently handling any numeric values (e.g. uid, gid, device
major/minor number, inode number, ioctl's cmd number) as "unsigned long",
most of "unsigned long" usage in TOMOYO needs to be updated to use "u64"
because you are about to change inode number values to always-64bits.
Christoph Hellwig Oct. 11, 2024, 2:39 p.m. UTC | #19
On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > Yes, but how do you call getattr() without a path?
> > 
> > You don't because inode numbers are irrelevant without the path.
> 
> They are for kernel messages and audit logs.  Please take a look at the
> use cases with the other patches.

It still is useless.  E.g. btrfs has duplicate inode numbers due to
subvolumes.

If you want a better pretty but not useful value just work on making
i_ino 64-bits wide, which is long overdue.
Christoph Hellwig Oct. 11, 2024, 3:13 p.m. UTC | #20
On Fri, Oct 11, 2024 at 11:27:45PM +0900, Tetsuo Handa wrote:
> Or, some filesystems are already using inode numbers beyond UINT_MAX but the
> capacity limitation on 32-bit architectures practically prevented users from
> creating/mounting filesystems with so many inodes enough to require inode
> numbers going beyond UINT_MAX?

Plenty of file systems use 64-bit inode numbers.  XFS and btrfs for
example if you care about commonly used local file systems.
Mickaël Salaün Oct. 11, 2024, 3:26 p.m. UTC | #21
On Fri, Oct 11, 2024 at 11:27:45PM +0900, Tetsuo Handa wrote:
> On 2024/10/11 20:04, Mickaël Salaün wrote:
> > On Fri, Oct 11, 2024 at 07:12:17PM +0900, Tetsuo Handa wrote:
> >> On 2024/10/11 0:26, Mickaël Salaün wrote:
> >>> When a filesystem manages its own inode numbers, like NFS's fileid shown
> >>> to user space with getattr(), other part of the kernel may still expose
> >>> the private inode->ino through kernel logs and audit.
> >>
> >> I can't catch what you are trying to do. What is wrong with that?
> > 
> > My understanding is that tomoyo_get_attributes() is used to log or
> > expose access requests to user space, including inode numbers.  Is that
> > correct?  If yes, then the inode numbers might not reflect what user
> > space sees with stat(2).
> 
> Several questions because I've never seen inode number beyond UINT_MAX...
> 
> Since "struct inode"->i_ino is "unsigned long" (which is 32bits on 32-bit
> architectures), despite stat(2) is ready to receive inode number as 64bits,
> filesystems (except NFS) did not use inode numbers beyond UINT_MAX until now
> so that fs/stat.c will not hit -EOVERFLOW condition, and that resulted in
> misuse of %lu for e.g. audit logs?

Yes, I think other filesystems (e.g. tmpfs) only enable 64-bit inodes on
64-bit architectures.

> 
> But NFS was already using inode numbers beyond UINT_MAX, and e.g. audit logs
> had been recording incorrect values when NFS is used?

Correct, all the logs with NFS inodes are wrong.

> 
> Or, some filesystems are already using inode numbers beyond UINT_MAX but the
> capacity limitation on 32-bit architectures practically prevented users from
> creating/mounting filesystems with so many inodes enough to require inode
> numbers going beyond UINT_MAX?

I think so but I didn't take a look at all other filesystems.

> 
> 
> 
> You are trying to fix out-of-sync between stat(2) and e.g. audit logs
> rather than introducing new feature, aren't you?

Yes

> 
> Then, what you are trying to do is OK, but TOMOYO side needs more changes.
> Since TOMOYO is currently handling any numeric values (e.g. uid, gid, device
> major/minor number, inode number, ioctl's cmd number) as "unsigned long",
> most of "unsigned long" usage in TOMOYO needs to be updated to use "u64"
> because you are about to change inode number values to always-64bits.
> 

OK, could you please send a full patch in reply to this email?  I'll
include it in the next patch series.
Mickaël Salaün Oct. 11, 2024, 3:30 p.m. UTC | #22
On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > > Yes, but how do you call getattr() without a path?
> > > 
> > > You don't because inode numbers are irrelevant without the path.
> > 
> > They are for kernel messages and audit logs.  Please take a look at the
> > use cases with the other patches.
> 
> It still is useless.  E.g. btrfs has duplicate inode numbers due to
> subvolumes.

At least it reflects what users see.

> 
> If you want a better pretty but not useful value just work on making
> i_ino 64-bits wide, which is long overdue.

That would require too much work for me, and this would be a pain to
backport to all stable kernels.
Christoph Hellwig Oct. 11, 2024, 3:34 p.m. UTC | #23
On Fri, Oct 11, 2024 at 05:30:59PM +0200, Mickaël Salaün wrote:
> > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > subvolumes.
> 
> At least it reflects what users see.

Users generally don't see inode numbers.

> > If you want a better pretty but not useful value just work on making
> > i_ino 64-bits wide, which is long overdue.
> 
> That would require too much work for me, and this would be a pain to
> backport to all stable kernels.

Well, if doing the right thing is too hard we can easily do nothing.

In case it wan't clear, this thread has been a very explicit:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Jeff Layton Oct. 13, 2024, 10:17 a.m. UTC | #24
On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
> On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> > On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > > > Yes, but how do you call getattr() without a path?
> > > > 
> > > > You don't because inode numbers are irrelevant without the path.
> > > 
> > > They are for kernel messages and audit logs.  Please take a look at the
> > > use cases with the other patches.
> > 
> > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > subvolumes.
> 
> At least it reflects what users see.
> 
> > 
> > If you want a better pretty but not useful value just work on making
> > i_ino 64-bits wide, which is long overdue.
> 
> That would require too much work for me, and this would be a pain to
> backport to all stable kernels.
> 

Would it though? Adding this new inode operation seems sub-optimal.

Inode numbers are static information. Once an inode number is set on an
inode it basically never changes.  This patchset will turn all of those
direct inode->i_ino fetches into a pointer chase for the new inode
operation, which will then almost always just result in a direct fetch.

A better solution here would be to make inode->i_ino a u64, and just
fix up all of the places that touch it to expect that. Then, just
ensure that all of the filesystems set it properly when instantiating
new inodes. Even on 32-bit arch, you likely wouldn't need a seqcount
loop or anything to fetch this since the chance of a torn read there is
basically zero.

If there are places where we need to convert i_ino down to 32-bits,
then we can just use some scheme like nfs_fattr_to_ino_t(), or just
cast i_ino to a u32.

Yes, it'd be a larger patchset, but that seems like it would be a
better solution.
Burn Alting Oct. 14, 2024, 8:40 a.m. UTC | #25
On 13/10/24 21:17, Jeff Layton wrote:
> On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
>> On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
>>> On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
>>>>>> Yes, but how do you call getattr() without a path?
>>>>>
>>>>> You don't because inode numbers are irrelevant without the path.
>>>>
>>>> They are for kernel messages and audit logs.  Please take a look at the
>>>> use cases with the other patches.
>>>
>>> It still is useless.  E.g. btrfs has duplicate inode numbers due to
>>> subvolumes.
>>
>> At least it reflects what users see.
>>
>>>
>>> If you want a better pretty but not useful value just work on making
>>> i_ino 64-bits wide, which is long overdue.
>>
>> That would require too much work for me, and this would be a pain to
>> backport to all stable kernels.
>>
> 
> Would it though? Adding this new inode operation seems sub-optimal.
> 
> Inode numbers are static information. Once an inode number is set on an
> inode it basically never changes.  This patchset will turn all of those
> direct inode->i_ino fetches into a pointer chase for the new inode
> operation, which will then almost always just result in a direct fetch.
> 
> A better solution here would be to make inode->i_ino a u64, and just
> fix up all of the places that touch it to expect that. Then, just
> ensure that all of the filesystems set it properly when instantiating
> new inodes. Even on 32-bit arch, you likely wouldn't need a seqcount
> loop or anything to fetch this since the chance of a torn read there is
> basically zero.
> 
> If there are places where we need to convert i_ino down to 32-bits,
> then we can just use some scheme like nfs_fattr_to_ino_t(), or just
> cast i_ino to a u32.
> 
> Yes, it'd be a larger patchset, but that seems like it would be a
> better solution.
As someone who lives in the analytical user space of Linux audit,  I 
take it that for large (say >200TB) file systems, the inode value 
reported in audit PATH records is no longer forensically defensible and 
it's use as a correlation item is of questionable value now?
Christoph Hellwig Oct. 14, 2024, 9:02 a.m. UTC | #26
On Mon, Oct 14, 2024 at 07:40:37PM +1100, Burn Alting wrote:
> As someone who lives in the analytical user space of Linux audit,  I take it
> that for large (say >200TB) file systems, the inode value reported in audit
> PATH records is no longer forensically defensible and it's use as a
> correlation item is of questionable value now?

What do you mean with forensically defensible?

A 64-bit inode number is supposed to be unique.  Some file systems
(most notably btrfs, but probably also various non-native file system)
break and this, and get away with lots of userspace hacks papering
over it.  If you are on a 32-bit system and not using the LFS APIs
stat will fail with -EOVERFLOW.  Some file systems have options to
never generate > 32bit inode numbers.  None of that is directly
related to file system size, although at least for XFS file system
size is one relevant variable, but 200TB is in no way relevant.
Jeff Layton Oct. 14, 2024, 10:22 a.m. UTC | #27
On Mon, 2024-10-14 at 17:44 +1100, Burn Alting wrote:
>  
> 
> 
> 
> 
>  
> 
> 
> 
>  
> 
> 
> 
> On 13/10/24 21:17, Jeff Layton wrote:
>  
> 
> 
> 
>  
> 
> 
> 
> >  
> > 
> > 
> > 
> > On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
> >  
> > 
> > 
> > 
> > >  
> > > 
> > > 
> > > 
> > > On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> > >  
> > > 
> > > 
> > > 
> > > >  
> > > > 
> > > > 
> > > > 
> > > > On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > >  
> > > > 
> > > > 
> > > > 
> > > > >  
> > > > > 
> > > > > 
> > > > > 
> > > > > >  
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > >  
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > Yes, but how do you call getattr() without a path?
> > > > > > >  
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > >  
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > You don't because inode numbers are irrelevant without the path.
> > > > > >  
> > > > > > 
> > > > > > 
> > > > > > 
> > > > >  
> > > > > 
> > > > > 
> > > > > 
> > > > > They are for kernel messages and audit logs.  Please take a look at the
> > > > > use cases with the other patches.
> > > > >  
> > > > > 
> > > > > 
> > > > > 
> > > >  
> > > > 
> > > > 
> > > > 
> > > > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > > > subvolumes.
> > > >  
> > > > 
> > > > 
> > > > 
> > >  
> > > 
> > > 
> > > 
> > > At least it reflects what users see.
> > > 
> > >  
> > > 
> > > 
> > > 
> > > >  
> > > > 
> > > > 
> > > > 
> > > > If you want a better pretty but not useful value just work on making
> > > > i_ino 64-bits wide, which is long overdue.
> > > >  
> > > > 
> > > > 
> > > > 
> > >  
> > > 
> > > 
> > > 
> > > That would require too much work for me, and this would be a pain to
> > > backport to all stable kernels.
> > > 
> > >  
> > > 
> > > 
> > > 
> >  
> > 
> > 
> > 
> > Would it though? Adding this new inode operation seems sub-optimal.
> > 
> > Inode numbers are static information. Once an inode number is set on an
> > inode it basically never changes.  This patchset will turn all of those
> > direct inode->i_ino fetches into a pointer chase for the new inode
> > operation, which will then almost always just result in a direct fetch.
> > 
> > A better solution here would be to make inode->i_ino a u64, and just
> > fix up all of the places that touch it to expect that. Then, just
> > ensure that all of the filesystems set it properly when instantiating
> > new inodes. Even on 32-bit arch, you likely wouldn't need a seqcount
> > loop or anything to fetch this since the chance of a torn read there is
> > basically zero.
> > 
> > If there are places where we need to convert i_ino down to 32-bits,
> > then we can just use some scheme like nfs_fattr_to_ino_t(), or just
> > cast i_ino to a u32.
> > 
> > Yes, it'd be a larger patchset, but that seems like it would be a
> > better solution.
> >  
> > 
> > 
> > 
>  
> 
> 
> 
> As someone who lives in the analytical user space of Linux audit,  I take it that for large (say >200TB) file systems, the inode reported in audit PATH records is no longer forensically defensible and it's use as a correlation item is of questionable value now?
>  
> 
> 
> 

It's been a while since I worked in audit, but if audit is only
reporting 32-bit inode numbers in its records, then that could be
ambiguous. It's easily possible on larger filesystems to generate more
than 2^32 inodes.
Burn Alting Oct. 14, 2024, 12:12 p.m. UTC | #28
On 14/10/24 20:02, Christoph Hellwig wrote:
> On Mon, Oct 14, 2024 at 07:40:37PM +1100, Burn Alting wrote:
>> As someone who lives in the analytical user space of Linux audit,  I take it
>> that for large (say >200TB) file systems, the inode value reported in audit
>> PATH records is no longer forensically defensible and it's use as a
>> correlation item is of questionable value now?
> 
> What do you mean with forensically defensible?

If the auditd system only maintains a 32 bit variable for an inode 
value, when it emits an inode number, then how does one categorically 
state/defend that the inode value in the audit event is the actual one 
on the file system. The PATH record will offer one value (32 bits) but 
the returned inode value from a stat will return another (the actual 64 
bit value). Basically auditd would not be recording the correct value.

> 
> A 64-bit inode number is supposed to be unique.  Some file systems
> (most notably btrfs, but probably also various non-native file system)
> break and this, and get away with lots of userspace hacks papering
> over it.  If you are on a 32-bit system and not using the LFS APIs
> stat will fail with -EOVERFLOW.  Some file systems have options to
> never generate > 32bit inode numbers.  None of that is directly
> related to file system size, although at least for XFS file system
> size is one relevant variable, but 200TB is in no way relevant.

My reference to the filesystem size was a quick and dirty estimate of 
when one may see more than 2^32 inodes on a single filesystem. What I 
failed to state (my apologies) is that this presumed an xfs filesystem 
with default values when creating the file system. (A quick check on an 
single 240TB xfs filesystem advised more than 5000000000 inodes were 
available).
Christoph Hellwig Oct. 14, 2024, 12:17 p.m. UTC | #29
On Mon, Oct 14, 2024 at 11:12:25PM +1100, Burn Alting wrote:
> > > PATH records is no longer forensically defensible and it's use as a
> > > correlation item is of questionable value now?
> > 
> > What do you mean with forensically defensible?
> 
> If the auditd system only maintains a 32 bit variable for an inode value,
> when it emits an inode number, then how does one categorically state/defend
> that the inode value in the audit event is the actual one on the file
> system. The PATH record will offer one value (32 bits) but the returned
> inode value from a stat will return another (the actual 64 bit value).
> Basically auditd would not be recording the correct value.

Does auditd only track 32-bit inodes?  If yes, it is fundamentally
broken.

> My reference to the filesystem size was a quick and dirty estimate of when
> one may see more than 2^32 inodes on a single filesystem. What I failed to
> state (my apologies) is that this presumed an xfs filesystem with default
> values when creating the file system. (A quick check on an single 240TB xfs
> filesystem advised more than 5000000000 inodes were available).

For XFS inode number encoding is sparse, with part of it encoding the
allocation group it resides in.  For btrfs for example the inode number
is simply a 64-bit monotonically increasing counter per subvolume
where freed inode numbers never get reused.
Mickaël Salaün Oct. 14, 2024, 1:13 p.m. UTC | #30
On Mon, Oct 14, 2024 at 05:17:53AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 14, 2024 at 11:12:25PM +1100, Burn Alting wrote:
> > > > PATH records is no longer forensically defensible and it's use as a
> > > > correlation item is of questionable value now?
> > > 
> > > What do you mean with forensically defensible?
> > 
> > If the auditd system only maintains a 32 bit variable for an inode value,
> > when it emits an inode number, then how does one categorically state/defend
> > that the inode value in the audit event is the actual one on the file
> > system. The PATH record will offer one value (32 bits) but the returned
> > inode value from a stat will return another (the actual 64 bit value).
> > Basically auditd would not be recording the correct value.
> 
> Does auditd only track 32-bit inodes?  If yes, it is fundamentally
> broken.

auditd logs 32-bit inodes on 32-bit architecture, whereas it should
always log 64-bit inodes.  The goal of this patch series is to fix this
this issue for auditd and other kernel logs (and to backport these
fixes).
Christian Brauner Oct. 14, 2024, 2:35 p.m. UTC | #31
On Fri, Oct 11, 2024 at 08:34:35AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 11, 2024 at 05:30:59PM +0200, Mickaël Salaün wrote:
> > > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > > subvolumes.
> > 
> > At least it reflects what users see.
> 
> Users generally don't see inode numbers.
> 
> > > If you want a better pretty but not useful value just work on making
> > > i_ino 64-bits wide, which is long overdue.
> > 
> > That would require too much work for me, and this would be a pain to
> > backport to all stable kernels.
> 
> Well, if doing the right thing is too hard we can easily do nothing.
> 
> In case it wan't clear, this thread has been a very explicit:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

This must be typo and you want a NAK here, right?
Christoph Hellwig Oct. 14, 2024, 2:36 p.m. UTC | #32
On Mon, Oct 14, 2024 at 04:35:24PM +0200, Christian Brauner wrote:
> On Fri, Oct 11, 2024 at 08:34:35AM -0700, Christoph Hellwig wrote:
> > On Fri, Oct 11, 2024 at 05:30:59PM +0200, Mickaël Salaün wrote:
> > > > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > > > subvolumes.
> > > 
> > > At least it reflects what users see.
> > 
> > Users generally don't see inode numbers.
> > 
> > > > If you want a better pretty but not useful value just work on making
> > > > i_ino 64-bits wide, which is long overdue.
> > > 
> > > That would require too much work for me, and this would be a pain to
> > > backport to all stable kernels.
> > 
> > Well, if doing the right thing is too hard we can easily do nothing.
> > 
> > In case it wan't clear, this thread has been a very explicit:
> > 
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> This must be typo and you want a NAK here, right?

Yes :)
Christian Brauner Oct. 14, 2024, 2:45 p.m. UTC | #33
On Sun, Oct 13, 2024 at 06:17:43AM -0400, Jeff Layton wrote:
> On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
> > On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> > > On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > > > > Yes, but how do you call getattr() without a path?
> > > > > 
> > > > > You don't because inode numbers are irrelevant without the path.
> > > > 
> > > > They are for kernel messages and audit logs.  Please take a look at the
> > > > use cases with the other patches.
> > > 
> > > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > > subvolumes.
> > 
> > At least it reflects what users see.
> > 
> > > 
> > > If you want a better pretty but not useful value just work on making
> > > i_ino 64-bits wide, which is long overdue.
> > 
> > That would require too much work for me, and this would be a pain to
> > backport to all stable kernels.
> > 
> 
> Would it though? Adding this new inode operation seems sub-optimal.

I agree.

> Inode numbers are static information. Once an inode number is set on an
> inode it basically never changes.  This patchset will turn all of those
> direct inode->i_ino fetches into a pointer chase for the new inode
> operation, which will then almost always just result in a direct fetch.

Yup.

> A better solution here would be to make inode->i_ino a u64, and just
> fix up all of the places that touch it to expect that. Then, just

I would like us to try and see to make this happen. I really dislike
that struct inode is full of non-explicity types.
Christian Brauner Oct. 14, 2024, 2:47 p.m. UTC | #34
On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid shown
> to user space with getattr(), other part of the kernel may still expose
> the private inode->ino through kernel logs and audit.
> 
> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64 bits.
> 
> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of an
> inode number.  inode_get_ino() is called by generic_fillattr().

I mean, you have to admit that this is a pretty blatant hack and that's
not worthy of a separate inode method, let alone the potential
performance implication that multiple people already brought up.
Mickaël Salaün Oct. 14, 2024, 3:27 p.m. UTC | #35
On Mon, Oct 14, 2024 at 04:45:00PM +0200, Christian Brauner wrote:
> On Sun, Oct 13, 2024 at 06:17:43AM -0400, Jeff Layton wrote:
> > On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
> > > On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> > > > On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:
> > > > > > > Yes, but how do you call getattr() without a path?
> > > > > > 
> > > > > > You don't because inode numbers are irrelevant without the path.
> > > > > 
> > > > > They are for kernel messages and audit logs.  Please take a look at the
> > > > > use cases with the other patches.
> > > > 
> > > > It still is useless.  E.g. btrfs has duplicate inode numbers due to
> > > > subvolumes.
> > > 
> > > At least it reflects what users see.
> > > 
> > > > 
> > > > If you want a better pretty but not useful value just work on making
> > > > i_ino 64-bits wide, which is long overdue.
> > > 
> > > That would require too much work for me, and this would be a pain to
> > > backport to all stable kernels.
> > > 
> > 
> > Would it though? Adding this new inode operation seems sub-optimal.
> 
> I agree.

Of course it would be better to fix the root of the issue.  Who is up
for the challenge?

> 
> > Inode numbers are static information. Once an inode number is set on an
> > inode it basically never changes.  This patchset will turn all of those
> > direct inode->i_ino fetches into a pointer chase for the new inode
> > operation, which will then almost always just result in a direct fetch.
> 
> Yup.
> 
> > A better solution here would be to make inode->i_ino a u64, and just
> > fix up all of the places that touch it to expect that. Then, just
> 
> I would like us to try and see to make this happen. I really dislike
> that struct inode is full of non-explicity types.

Also, it would be OK to backport it, right?
Mickaël Salaün Oct. 14, 2024, 5:51 p.m. UTC | #36
On Mon, Oct 14, 2024 at 04:47:17PM +0200, Christian Brauner wrote:
> On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid shown
> > to user space with getattr(), other part of the kernel may still expose
> > the private inode->ino through kernel logs and audit.
> > 
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64 bits.
> > 
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> 
> I mean, you have to admit that this is a pretty blatant hack and that's
> not worthy of a separate inode method, let alone the potential
> performance implication that multiple people already brought up.

My initial approach was to use u64 for struct inode's i_ino, but I
realized it had too much implications on potentially all filesystems and
other parts of the kernel.  I'm sure they are much more legitimate folks
to handle this transition.  I'd be curious to know how such i_ino type
change could be backported though.
Paul Moore Oct. 16, 2024, 12:15 a.m. UTC | #37
On Mon, Oct 14, 2024 at 10:45 AM Christian Brauner <brauner@kernel.org> wrote:
> On Sun, Oct 13, 2024 at 06:17:43AM -0400, Jeff Layton wrote:
> > On Fri, 2024-10-11 at 17:30 +0200, Mickaël Salaün wrote:
> > > On Fri, Oct 11, 2024 at 07:39:33AM -0700, Christoph Hellwig wrote:
> > > > On Fri, Oct 11, 2024 at 03:52:42PM +0200, Mickaël Salaün wrote:

...

> > A better solution here would be to make inode->i_ino a u64, and just
> > fix up all of the places that touch it to expect that. Then, just
>
> I would like us to try and see to make this happen. I really dislike
> that struct inode is full of non-explicity types.

Presumably this would include moving all of the filesystems to use
inode->i_ino instead of their own private file ID number, e.g. the NFS
issue pointed out in the original patch?  If not, I think most of us
in the LSM/audit space still have a need for a filesystem agnostic way
to determine the inode number from an inode struct.
Christian Brauner Oct. 16, 2024, 2:23 p.m. UTC | #38
On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> When a filesystem manages its own inode numbers, like NFS's fileid shown
> to user space with getattr(), other part of the kernel may still expose
> the private inode->ino through kernel logs and audit.
> 
> Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> whereas the user space's view of an inode number can still be 64 bits.
> 
> Add a new inode_get_ino() helper calling the new struct
> inode_operations' get_ino() when set, to get the user space's view of an
> inode number.  inode_get_ino() is called by generic_fillattr().
> 
> Implement get_ino() for NFS.
> 
> Cc: Trond Myklebust <trondmy@kernel.org>
> Cc: Anna Schumaker <anna@kernel.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> ---
> 
> I'm not sure about nfs_namespace_getattr(), please review carefully.
> 
> I guess there are other filesystems exposing inode numbers different
> than inode->i_ino, and they should be patched too.

What are the other filesystems that are presumably affected by this that
would need an inode accessor?

If this is just about NFS then just add a helper function that audit and
whatever can call if they need to know the real inode number without
forcing a new get_inode() method onto struct inode_operations.

I'm against adding a new inode_operations method unless we really have
to because it means that we grow a nasty interface that filesystem can
start using and we absolutely don't want them to.

And I don't buy that is suddenly rush hour for this. Seemingly no one
noticed this in the past idk how many years.

> ---
>  fs/nfs/inode.c     | 6 ++++--
>  fs/nfs/internal.h  | 1 +
>  fs/nfs/namespace.c | 2 ++
>  fs/stat.c          | 2 +-
>  include/linux/fs.h | 9 +++++++++
>  5 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 542c7d97b235..5dfc176b6d92 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -83,18 +83,19 @@ EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
>  
>  /**
>   * nfs_compat_user_ino64 - returns the user-visible inode number
> - * @fileid: 64-bit fileid
> + * @inode: inode pointer
>   *
>   * This function returns a 32-bit inode number if the boot parameter
>   * nfs.enable_ino64 is zero.
>   */
> -u64 nfs_compat_user_ino64(u64 fileid)
> +u64 nfs_compat_user_ino64(const struct *inode)
>  {
>  #ifdef CONFIG_COMPAT
>  	compat_ulong_t ino;
>  #else	
>  	unsigned long ino;
>  #endif
> +	u64 fileid = NFS_FILEID(inode);
>  
>  	if (enable_ino64)
>  		return fileid;
> @@ -103,6 +104,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
>  		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
>  	return ino;
>  }
> +EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
>  
>  int nfs_drop_inode(struct inode *inode)
>  {
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 430733e3eff2..f5555a71a733 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -451,6 +451,7 @@ extern void nfs_zap_acl_cache(struct inode *inode);
>  extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
>  extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
>  extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
> +extern u64 nfs_compat_user_ino64(const struct *inode);
>  
>  #if IS_ENABLED(CONFIG_NFS_LOCALIO)
>  /* localio.c */
> diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
> index e7494cdd957e..d9b1e0606833 100644
> --- a/fs/nfs/namespace.c
> +++ b/fs/nfs/namespace.c
> @@ -232,11 +232,13 @@ nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  const struct inode_operations nfs_mountpoint_inode_operations = {
>  	.getattr	= nfs_getattr,
>  	.setattr	= nfs_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  const struct inode_operations nfs_referral_inode_operations = {
>  	.getattr	= nfs_namespace_getattr,
>  	.setattr	= nfs_namespace_setattr,
> +	.get_ino	= nfs_compat_user_ino64,
>  };
>  
>  static void nfs_expire_automounts(struct work_struct *work)
> diff --git a/fs/stat.c b/fs/stat.c
> index 41e598376d7e..05636919f94b 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -50,7 +50,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
>  	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
>  
>  	stat->dev = inode->i_sb->s_dev;
> -	stat->ino = inode->i_ino;
> +	stat->ino = inode_get_ino(inode);
>  	stat->mode = inode->i_mode;
>  	stat->nlink = inode->i_nlink;
>  	stat->uid = vfsuid_into_kuid(vfsuid);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e3c603d01337..0eba09a21cf7 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2165,6 +2165,7 @@ struct inode_operations {
>  			    struct dentry *dentry, struct fileattr *fa);
>  	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
>  	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
> +	u64 (*get_ino)(const struct inode *inode);
>  } ____cacheline_aligned;
>  
>  static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> @@ -2172,6 +2173,14 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
>  	return file->f_op->mmap(file, vma);
>  }
>  
> +static inline u64 inode_get_ino(struct inode *inode)
> +{
> +	if (unlikely(inode->i_op->get_ino))
> +		return inode->i_op->get_ino(inode);
> +
> +	return inode->i_ino;
> +}
> +
>  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
>  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
>  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> -- 
> 2.46.1
>
Paul Moore Oct. 16, 2024, 11:05 p.m. UTC | #39
On Wed, Oct 16, 2024 at 10:23 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> > When a filesystem manages its own inode numbers, like NFS's fileid shown
> > to user space with getattr(), other part of the kernel may still expose
> > the private inode->ino through kernel logs and audit.
> >
> > Another issue is on 32-bit architectures, on which ino_t is 32 bits,
> > whereas the user space's view of an inode number can still be 64 bits.
> >
> > Add a new inode_get_ino() helper calling the new struct
> > inode_operations' get_ino() when set, to get the user space's view of an
> > inode number.  inode_get_ino() is called by generic_fillattr().
> >
> > Implement get_ino() for NFS.
> >
> > Cc: Trond Myklebust <trondmy@kernel.org>
> > Cc: Anna Schumaker <anna@kernel.org>
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Cc: Christian Brauner <brauner@kernel.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > ---
> >
> > I'm not sure about nfs_namespace_getattr(), please review carefully.
> >
> > I guess there are other filesystems exposing inode numbers different
> > than inode->i_ino, and they should be patched too.
>
> What are the other filesystems that are presumably affected by this that
> would need an inode accessor?

I don't want to speak for Mickaël, but my reading of the patchset was
that he was suspecting that other filesystems had the same issue
(privately maintained inode numbers) and was posting this as a RFC
partly for clarity on this from the VFS developers such as yourself.

> If this is just about NFS then just add a helper function that audit and
> whatever can call if they need to know the real inode number without
> forcing a new get_inode() method onto struct inode_operations.

If this really is just limited to NFS, or perhaps NFS and a small
number of filesystems, then a a helper function is a reasonable
solution.  I think Mickaël was worried that private inode numbers
would be more common, in which case a get_ino() method makes a bit
more sense.

> And I don't buy that is suddenly rush hour for this.

I don't think Mickaël ever characterized this as a "rush hour" issue
and I know I didn't.  It definitely caught us by surprise to learn
that inode->i_no wasn't always maintained, and we want to find a
solution, but I'm not hearing anyone screaming for a solution
"yesterday".

> Seemingly no one noticed this in the past idk how many years.

Yet the issue has been noticed and we would like to find a solution,
one that is acceptable both to the VFS and LSM folks.

Can we start with compiling a list of filesystems that maintain their
inode numbers outside of inode->i_no?  NFS is obviously first on that
list, are there others that the VFS devs can add?
Trond Myklebust Oct. 17, 2024, 2:30 p.m. UTC | #40
On Wed, 2024-10-16 at 19:05 -0400, Paul Moore wrote:
> On Wed, Oct 16, 2024 at 10:23 AM Christian Brauner
> <brauner@kernel.org> wrote:
> > 
> > On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> > > When a filesystem manages its own inode numbers, like NFS's
> > > fileid shown
> > > to user space with getattr(), other part of the kernel may still
> > > expose
> > > the private inode->ino through kernel logs and audit.
> > > 
> > > Another issue is on 32-bit architectures, on which ino_t is 32
> > > bits,
> > > whereas the user space's view of an inode number can still be 64
> > > bits.
> > > 
> > > Add a new inode_get_ino() helper calling the new struct
> > > inode_operations' get_ino() when set, to get the user space's
> > > view of an
> > > inode number.  inode_get_ino() is called by generic_fillattr().
> > > 
> > > Implement get_ino() for NFS.
> > > 
> > > Cc: Trond Myklebust <trondmy@kernel.org>
> > > Cc: Anna Schumaker <anna@kernel.org>
> > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > Cc: Christian Brauner <brauner@kernel.org>
> > > Cc: Jan Kara <jack@suse.cz>
> > > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > > ---
> > > 
> > > I'm not sure about nfs_namespace_getattr(), please review
> > > carefully.
> > > 
> > > I guess there are other filesystems exposing inode numbers
> > > different
> > > than inode->i_ino, and they should be patched too.
> > 
> > What are the other filesystems that are presumably affected by this
> > that
> > would need an inode accessor?
> 
> I don't want to speak for Mickaël, but my reading of the patchset was
> that he was suspecting that other filesystems had the same issue
> (privately maintained inode numbers) and was posting this as a RFC
> partly for clarity on this from the VFS developers such as yourself.
> 
> > If this is just about NFS then just add a helper function that
> > audit and
> > whatever can call if they need to know the real inode number
> > without
> > forcing a new get_inode() method onto struct inode_operations.
> 
> If this really is just limited to NFS, or perhaps NFS and a small
> number of filesystems, then a a helper function is a reasonable
> solution.  I think Mickaël was worried that private inode numbers
> would be more common, in which case a get_ino() method makes a bit
> more sense.
> 
> > And I don't buy that is suddenly rush hour for this.
> 
> I don't think Mickaël ever characterized this as a "rush hour" issue
> and I know I didn't.  It definitely caught us by surprise to learn
> that inode->i_no wasn't always maintained, and we want to find a
> solution, but I'm not hearing anyone screaming for a solution
> "yesterday".
> 
> > Seemingly no one noticed this in the past idk how many years.
> 
> Yet the issue has been noticed and we would like to find a solution,
> one that is acceptable both to the VFS and LSM folks.
> 
> Can we start with compiling a list of filesystems that maintain their
> inode numbers outside of inode->i_no?  NFS is obviously first on that
> list, are there others that the VFS devs can add?
> 

Pretty much any filesystem that uses 64-bit inode numbers has the same
problem: if the application calls stat(), 32-bit glibc will happily
convert that into a call to fstatat64() and then cry foul if the kernel
dares return an inode number that doesn't fit in 32 bits.
Paul Moore Oct. 17, 2024, 2:54 p.m. UTC | #41
On Thu, Oct 17, 2024 at 10:30 AM Trond Myklebust
<trondmy@hammerspace.com> wrote:
> On Wed, 2024-10-16 at 19:05 -0400, Paul Moore wrote:
> > On Wed, Oct 16, 2024 at 10:23 AM Christian Brauner
> > <brauner@kernel.org> wrote:
> > >
> > > On Thu, Oct 10, 2024 at 05:26:41PM +0200, Mickaël Salaün wrote:
> > > > When a filesystem manages its own inode numbers, like NFS's
> > > > fileid shown
> > > > to user space with getattr(), other part of the kernel may still
> > > > expose
> > > > the private inode->ino through kernel logs and audit.
> > > >
> > > > Another issue is on 32-bit architectures, on which ino_t is 32
> > > > bits,
> > > > whereas the user space's view of an inode number can still be 64
> > > > bits.
> > > >
> > > > Add a new inode_get_ino() helper calling the new struct
> > > > inode_operations' get_ino() when set, to get the user space's
> > > > view of an
> > > > inode number.  inode_get_ino() is called by generic_fillattr().
> > > >
> > > > Implement get_ino() for NFS.
> > > >
> > > > Cc: Trond Myklebust <trondmy@kernel.org>
> > > > Cc: Anna Schumaker <anna@kernel.org>
> > > > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > > > Cc: Christian Brauner <brauner@kernel.org>
> > > > Cc: Jan Kara <jack@suse.cz>
> > > > Signed-off-by: Mickaël Salaün <mic@digikod.net>
> > > > ---
> > > >
> > > > I'm not sure about nfs_namespace_getattr(), please review
> > > > carefully.
> > > >
> > > > I guess there are other filesystems exposing inode numbers
> > > > different
> > > > than inode->i_ino, and they should be patched too.
> > >
> > > What are the other filesystems that are presumably affected by this
> > > that
> > > would need an inode accessor?
> >
> > I don't want to speak for Mickaël, but my reading of the patchset was
> > that he was suspecting that other filesystems had the same issue
> > (privately maintained inode numbers) and was posting this as a RFC
> > partly for clarity on this from the VFS developers such as yourself.
> >
> > > If this is just about NFS then just add a helper function that
> > > audit and
> > > whatever can call if they need to know the real inode number
> > > without
> > > forcing a new get_inode() method onto struct inode_operations.
> >
> > If this really is just limited to NFS, or perhaps NFS and a small
> > number of filesystems, then a a helper function is a reasonable
> > solution.  I think Mickaël was worried that private inode numbers
> > would be more common, in which case a get_ino() method makes a bit
> > more sense.
> >
> > > And I don't buy that is suddenly rush hour for this.
> >
> > I don't think Mickaël ever characterized this as a "rush hour" issue
> > and I know I didn't.  It definitely caught us by surprise to learn
> > that inode->i_no wasn't always maintained, and we want to find a
> > solution, but I'm not hearing anyone screaming for a solution
> > "yesterday".
> >
> > > Seemingly no one noticed this in the past idk how many years.
> >
> > Yet the issue has been noticed and we would like to find a solution,
> > one that is acceptable both to the VFS and LSM folks.
> >
> > Can we start with compiling a list of filesystems that maintain their
> > inode numbers outside of inode->i_no?  NFS is obviously first on that
> > list, are there others that the VFS devs can add?
> >
>
> Pretty much any filesystem that uses 64-bit inode numbers has the same
> problem: if the application calls stat(), 32-bit glibc will happily
> convert that into a call to fstatat64() and then cry foul if the kernel
> dares return an inode number that doesn't fit in 32 bits.

Okay, good to know, but I was hoping that there we could come up with
an explicit list of filesystems that maintain their own private inode
numbers outside of inode-i_ino.
Christoph Hellwig Oct. 17, 2024, 2:56 p.m. UTC | #42
On Wed, Oct 16, 2024 at 04:23:02PM +0200, Christian Brauner wrote:
> And I don't buy that is suddenly rush hour for this. Seemingly no one
> noticed this in the past idk how many years.

Asuming this showed up with the initial import of kernel/audit.c that
would be more than 20 years.
Christoph Hellwig Oct. 17, 2024, 2:58 p.m. UTC | #43
On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> Okay, good to know, but I was hoping that there we could come up with
> an explicit list of filesystems that maintain their own private inode
> numbers outside of inode-i_ino.

Anything using iget5_locked is a good start.  Add to that file systems
implementing their own inode cache (at least xfs and bcachefs).
Paul Moore Oct. 17, 2024, 3:15 p.m. UTC | #44
On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > Okay, good to know, but I was hoping that there we could come up with
> > an explicit list of filesystems that maintain their own private inode
> > numbers outside of inode-i_ino.
>
> Anything using iget5_locked is a good start.  Add to that file systems
> implementing their own inode cache (at least xfs and bcachefs).

Also good to know, thanks.  However, at this point the lack of a clear
answer is making me wonder a bit more about inode numbers in the view
of VFS developers; do you folks care about inode numbers?  I'm not
asking to start an argument, it's a genuine question so I can get a
better understanding about the durability and sustainability of
inode->i_no.  If all of you (the VFS folks) aren't concerned about
inode numbers, I suspect we are going to have similar issues in the
future and we (the LSM folks) likely need to move away from reporting
inode numbers as they aren't reliably maintained by the VFS layer.
Christoph Hellwig Oct. 17, 2024, 3:25 p.m. UTC | #45
On Thu, Oct 17, 2024 at 11:15:49AM -0400, Paul Moore wrote:
> Also good to know, thanks.  However, at this point the lack of a clear
> answer is making me wonder a bit more about inode numbers in the view
> of VFS developers; do you folks care about inode numbers?

The VFS itself does not care much about inode numbers.  The Posix API
does, although btrfs ignores that and seems to get away with that
(mostly because applications put in btrfs-specific hacks).  Various
other non-native file systems that don't support real inodes numbers
also get away with that, but usually the applications used on those
file systems are very limited.
Jan Kara Oct. 17, 2024, 4:43 p.m. UTC | #46
On Thu 17-10-24 08:25:19, Christoph Hellwig wrote:
> On Thu, Oct 17, 2024 at 11:15:49AM -0400, Paul Moore wrote:
> > Also good to know, thanks.  However, at this point the lack of a clear
> > answer is making me wonder a bit more about inode numbers in the view
> > of VFS developers; do you folks care about inode numbers?
> 
> The VFS itself does not care much about inode numbers.  The Posix API
> does, although btrfs ignores that and seems to get away with that
> (mostly because applications put in btrfs-specific hacks).

Well, btrfs plays tricks with *device* numbers, right? Exactly so that
st_ino + st_dev actually stay unique for each file. Whether it matters for
audit I don't dare to say :). Bcachefs does not care and returns non-unique
inode numbers.

								Honza
Jeff Layton Oct. 17, 2024, 5:05 p.m. UTC | #47
On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig <hch@infradead.org> wrote:
> > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > Okay, good to know, but I was hoping that there we could come up with
> > > an explicit list of filesystems that maintain their own private inode
> > > numbers outside of inode-i_ino.
> > 
> > Anything using iget5_locked is a good start.  Add to that file systems
> > implementing their own inode cache (at least xfs and bcachefs).
> 
> Also good to know, thanks.  However, at this point the lack of a clear
> answer is making me wonder a bit more about inode numbers in the view
> of VFS developers; do you folks care about inode numbers?  I'm not
> asking to start an argument, it's a genuine question so I can get a
> better understanding about the durability and sustainability of
> inode->i_no.  If all of you (the VFS folks) aren't concerned about
> inode numbers, I suspect we are going to have similar issues in the
> future and we (the LSM folks) likely need to move away from reporting
> inode numbers as they aren't reliably maintained by the VFS layer.
> 

Like Christoph said, the kernel doesn't care much about inode numbers.

People care about them though, and sometimes we have things in the
kernel that report them in some fashion (tracepoints, procfiles, audit
events, etc.). Having those match what the userland stat() st_ino field
tells you is ideal, and for the most part that's the way it works.

The main exception is when people use 32-bit interfaces (somewhat rare
these days), or they have a 32-bit kernel with a filesystem that has a
64-bit inode number space (NFS being one of those). The NFS client has
basically hacked around this for years by tracking its own fileid field
in its inode. That's really a waste though. That could be converted
over to use i_ino instead if it were always wide enough.

It'd be better to stop with these sort of hacks and just fix this the
right way once and for all, by making i_ino 64 bits everywhere.

A lot of the changes can probably be automated via coccinelle. I'd
probably start by turning all of the direct i_ino accesses into static
inline wrapper function calls. The hard part will be parceling out that
work into digestable chunks. If you can avoid "flag day" changes, then
that's ideal.  You'd want a patch per subsystem so you can collect
ACKs. 

The hardest part will probably be the format string changes. I'm not
sure you can easily use coccinelle for that, so that may need to be
done by hand or scripted with python or something.
Trond Myklebust Oct. 17, 2024, 5:09 p.m. UTC | #48
On Thu, 2024-10-17 at 13:05 -0400, Jeff Layton wrote:
> On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> > On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig
> > <hch@infradead.org> wrote:
> > > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > > Okay, good to know, but I was hoping that there we could come
> > > > up with
> > > > an explicit list of filesystems that maintain their own private
> > > > inode
> > > > numbers outside of inode-i_ino.
> > > 
> > > Anything using iget5_locked is a good start.  Add to that file
> > > systems
> > > implementing their own inode cache (at least xfs and bcachefs).
> > 
> > Also good to know, thanks.  However, at this point the lack of a
> > clear
> > answer is making me wonder a bit more about inode numbers in the
> > view
> > of VFS developers; do you folks care about inode numbers?  I'm not
> > asking to start an argument, it's a genuine question so I can get a
> > better understanding about the durability and sustainability of
> > inode->i_no.  If all of you (the VFS folks) aren't concerned about
> > inode numbers, I suspect we are going to have similar issues in the
> > future and we (the LSM folks) likely need to move away from
> > reporting
> > inode numbers as they aren't reliably maintained by the VFS layer.
> > 
> 
> Like Christoph said, the kernel doesn't care much about inode
> numbers.
> 
> People care about them though, and sometimes we have things in the
> kernel that report them in some fashion (tracepoints, procfiles,
> audit
> events, etc.). Having those match what the userland stat() st_ino
> field
> tells you is ideal, and for the most part that's the way it works.
> 
> The main exception is when people use 32-bit interfaces (somewhat
> rare
> these days), or they have a 32-bit kernel with a filesystem that has
> a
> 64-bit inode number space (NFS being one of those). The NFS client
> has
> basically hacked around this for years by tracking its own fileid
> field
> in its inode. That's really a waste though. That could be converted
> over to use i_ino instead if it were always wide enough.
> 
> It'd be better to stop with these sort of hacks and just fix this the
> right way once and for all, by making i_ino 64 bits everywhere.

Nope.

That won't fix glibc, which is the main problem NFS has to work around.
Jeff Layton Oct. 17, 2024, 5:59 p.m. UTC | #49
On Thu, 2024-10-17 at 17:09 +0000, Trond Myklebust wrote:
> On Thu, 2024-10-17 at 13:05 -0400, Jeff Layton wrote:
> > On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> > > On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig
> > > <hch@infradead.org> wrote:
> > > > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > > > Okay, good to know, but I was hoping that there we could come
> > > > > up with
> > > > > an explicit list of filesystems that maintain their own private
> > > > > inode
> > > > > numbers outside of inode-i_ino.
> > > > 
> > > > Anything using iget5_locked is a good start.  Add to that file
> > > > systems
> > > > implementing their own inode cache (at least xfs and bcachefs).
> > > 
> > > Also good to know, thanks.  However, at this point the lack of a
> > > clear
> > > answer is making me wonder a bit more about inode numbers in the
> > > view
> > > of VFS developers; do you folks care about inode numbers?  I'm not
> > > asking to start an argument, it's a genuine question so I can get a
> > > better understanding about the durability and sustainability of
> > > inode->i_no.  If all of you (the VFS folks) aren't concerned about
> > > inode numbers, I suspect we are going to have similar issues in the
> > > future and we (the LSM folks) likely need to move away from
> > > reporting
> > > inode numbers as they aren't reliably maintained by the VFS layer.
> > > 
> > 
> > Like Christoph said, the kernel doesn't care much about inode
> > numbers.
> > 
> > People care about them though, and sometimes we have things in the
> > kernel that report them in some fashion (tracepoints, procfiles,
> > audit
> > events, etc.). Having those match what the userland stat() st_ino
> > field
> > tells you is ideal, and for the most part that's the way it works.
> > 
> > The main exception is when people use 32-bit interfaces (somewhat
> > rare
> > these days), or they have a 32-bit kernel with a filesystem that has
> > a
> > 64-bit inode number space (NFS being one of those). The NFS client
> > has
> > basically hacked around this for years by tracking its own fileid
> > field
> > in its inode. That's really a waste though. That could be converted
> > over to use i_ino instead if it were always wide enough.
> > 
> > It'd be better to stop with these sort of hacks and just fix this the
> > right way once and for all, by making i_ino 64 bits everywhere.
> 
> Nope.
> 
> That won't fix glibc, which is the main problem NFS has to work around.
> 

True, but that's really a separate problem.

It also doesn't inform how we track inode numbers inside the kernel.
Inode numbers have been 64 bits for years on "real" filesystems. If we
were designing this today, i_ino would be a u64, and we'd only hash
that down to 32 bits when necessary.
Paul Moore Oct. 17, 2024, 8:21 p.m. UTC | #50
On Thu, Oct 17, 2024 at 1:05 PM Jeff Layton <jlayton@kernel.org> wrote:
> On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> > On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig <hch@infradead.org> wrote:
> > > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > > Okay, good to know, but I was hoping that there we could come up with
> > > > an explicit list of filesystems that maintain their own private inode
> > > > numbers outside of inode-i_ino.
> > >
> > > Anything using iget5_locked is a good start.  Add to that file systems
> > > implementing their own inode cache (at least xfs and bcachefs).
> >
> > Also good to know, thanks.  However, at this point the lack of a clear
> > answer is making me wonder a bit more about inode numbers in the view
> > of VFS developers; do you folks care about inode numbers?  I'm not
> > asking to start an argument, it's a genuine question so I can get a
> > better understanding about the durability and sustainability of
> > inode->i_no.  If all of you (the VFS folks) aren't concerned about
> > inode numbers, I suspect we are going to have similar issues in the
> > future and we (the LSM folks) likely need to move away from reporting
> > inode numbers as they aren't reliably maintained by the VFS layer.
> >
>
> Like Christoph said, the kernel doesn't care much about inode numbers.
>
> People care about them though, and sometimes we have things in the
> kernel that report them in some fashion (tracepoints, procfiles, audit
> events, etc.). Having those match what the userland stat() st_ino field
> tells you is ideal, and for the most part that's the way it works.
>
> The main exception is when people use 32-bit interfaces (somewhat rare
> these days), or they have a 32-bit kernel with a filesystem that has a
> 64-bit inode number space (NFS being one of those). The NFS client has
> basically hacked around this for years by tracking its own fileid field
> in its inode.

When I asked if the VFS dev cared about inode numbers this is more of
what I was wondering about.  Regardless of if the kernel itself uses
inode numbers for anything, it does appear that users do care about
inode numbers to some extent, and I wanted to know if the VFS devs
viewed the inode numbers as a first order UAPI interface/thing, or if
it was of lesser importance and not something the kernel was going to
provide much of a guarantee around.  Once again, I'm not asking this
to start a war, I'm just trying to get some perspective from the VFS
dev side of things.

> A lot of the changes can probably be automated via coccinelle. I'd
> probably start by turning all of the direct i_ino accesses into static
> inline wrapper function calls. The hard part will be parceling out that
> work into digestable chunks. If you can avoid "flag day" changes, then
> that's ideal.  You'd want a patch per subsystem so you can collect
> ACKs.
>
> The hardest part will probably be the format string changes. I'm not
> sure you can easily use coccinelle for that, so that may need to be
> done by hand or scripted with python or something.

Out of curiosity, is this on anyone's roadmap?  I've already got
enough work in security land to keep myself occupied until I'm hit by
that mythical bus, so I can't volunteer in good conscience, but I (and
many others in security land) would be grateful for a single,
consistent way to fetch inode numbers :)
Trond Myklebust Oct. 17, 2024, 9:06 p.m. UTC | #51
On Thu, 2024-10-17 at 13:59 -0400, Jeff Layton wrote:
> On Thu, 2024-10-17 at 17:09 +0000, Trond Myklebust wrote:
> > On Thu, 2024-10-17 at 13:05 -0400, Jeff Layton wrote:
> > > On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> > > > On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig
> > > > <hch@infradead.org> wrote:
> > > > > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > > > > Okay, good to know, but I was hoping that there we could
> > > > > > come
> > > > > > up with
> > > > > > an explicit list of filesystems that maintain their own
> > > > > > private
> > > > > > inode
> > > > > > numbers outside of inode-i_ino.
> > > > > 
> > > > > Anything using iget5_locked is a good start.  Add to that
> > > > > file
> > > > > systems
> > > > > implementing their own inode cache (at least xfs and
> > > > > bcachefs).
> > > > 
> > > > Also good to know, thanks.  However, at this point the lack of
> > > > a
> > > > clear
> > > > answer is making me wonder a bit more about inode numbers in
> > > > the
> > > > view
> > > > of VFS developers; do you folks care about inode numbers?  I'm
> > > > not
> > > > asking to start an argument, it's a genuine question so I can
> > > > get a
> > > > better understanding about the durability and sustainability of
> > > > inode->i_no.  If all of you (the VFS folks) aren't concerned
> > > > about
> > > > inode numbers, I suspect we are going to have similar issues in
> > > > the
> > > > future and we (the LSM folks) likely need to move away from
> > > > reporting
> > > > inode numbers as they aren't reliably maintained by the VFS
> > > > layer.
> > > > 
> > > 
> > > Like Christoph said, the kernel doesn't care much about inode
> > > numbers.
> > > 
> > > People care about them though, and sometimes we have things in
> > > the
> > > kernel that report them in some fashion (tracepoints, procfiles,
> > > audit
> > > events, etc.). Having those match what the userland stat() st_ino
> > > field
> > > tells you is ideal, and for the most part that's the way it
> > > works.
> > > 
> > > The main exception is when people use 32-bit interfaces (somewhat
> > > rare
> > > these days), or they have a 32-bit kernel with a filesystem that
> > > has
> > > a
> > > 64-bit inode number space (NFS being one of those). The NFS
> > > client
> > > has
> > > basically hacked around this for years by tracking its own fileid
> > > field
> > > in its inode. That's really a waste though. That could be
> > > converted
> > > over to use i_ino instead if it were always wide enough.
> > > 
> > > It'd be better to stop with these sort of hacks and just fix this
> > > the
> > > right way once and for all, by making i_ino 64 bits everywhere.
> > 
> > Nope.
> > 
> > That won't fix glibc, which is the main problem NFS has to work
> > around.
> > 
> 
> True, but that's really a separate problem.


Currently, the problem where the kernel needs to use one inode number
in iget5() and a different one when replying to stat() is limited to
the set of 64-bit kernels that can operate in 32-bit userland
compability mode. So mainly on x86_64 kernels that are set up to run in
i386 userland compatibility mode.

If you now decree that all kernels will use 64-bit inode numbers
internally, then you've suddenly expanded the problem to encompass all
the remaining 32-bit kernels. In order to avoid stat() returning
EOVERFLOW to the applications, they too will have to start generating
separate 32-bit inode numbers.

> 
> It also doesn't inform how we track inode numbers inside the kernel.
> Inode numbers have been 64 bits for years on "real" filesystems. If
> we
> were designing this today, i_ino would be a u64, and we'd only hash
> that down to 32 bits when necessary.

"I'm doing a (free) operating system (just a hobby, won't be big and
professional like gnu) for 386(486) AT clones."

History is a bitch...
Christoph Hellwig Oct. 18, 2024, 5:15 a.m. UTC | #52
On Thu, Oct 17, 2024 at 06:43:38PM +0200, Jan Kara wrote:
> On Thu 17-10-24 08:25:19, Christoph Hellwig wrote:
> > On Thu, Oct 17, 2024 at 11:15:49AM -0400, Paul Moore wrote:
> > > Also good to know, thanks.  However, at this point the lack of a clear
> > > answer is making me wonder a bit more about inode numbers in the view
> > > of VFS developers; do you folks care about inode numbers?
> > 
> > The VFS itself does not care much about inode numbers.  The Posix API
> > does, although btrfs ignores that and seems to get away with that
> > (mostly because applications put in btrfs-specific hacks).
> 
> Well, btrfs plays tricks with *device* numbers, right? Exactly so that
> st_ino + st_dev actually stay unique for each file. Whether it matters for
> audit I don't dare to say :). Bcachefs does not care and returns non-unique
> inode numbers.

But st_ino + st_dev is the only thing Posix and thus historically Linux
has guaranteed to applications.  So if st_dev is unique, but you need
an unknown scope in which it is unique it might as well not be for
that purpose.  And I think for any kind of audit report that is true
as well.
Christoph Hellwig Oct. 18, 2024, 5:18 a.m. UTC | #53
On Thu, Oct 17, 2024 at 05:09:17PM +0000, Trond Myklebust wrote:
> > It'd be better to stop with these sort of hacks and just fix this the
> > right way once and for all, by making i_ino 64 bits everywhere.
> 
> Nope.
> 
> That won't fix glibc, which is the main problem NFS has to work around.

There's no Problem in glibc here, it's mostly the Linux syscall layer
being stupid for the 32-bit non-LFS interface.

That being said with my XFS hat on we've not really had issue with
32-bit non-LFS code for a long time.  Not saying it doesn't exist,
but it's probably limited to retro computing by now.  The way we
support it is with the inode32 option that never allocates larger
inode numbers.  I think something similar should work for NFS where
an option is required for the ino mangling.
Jan Kara Oct. 18, 2024, 12:25 p.m. UTC | #54
On Thu 17-10-24 16:21:34, Paul Moore wrote:
> On Thu, Oct 17, 2024 at 1:05 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2024-10-17 at 11:15 -0400, Paul Moore wrote:
> > > On Thu, Oct 17, 2024 at 10:58 AM Christoph Hellwig <hch@infradead.org> wrote:
> > > > On Thu, Oct 17, 2024 at 10:54:12AM -0400, Paul Moore wrote:
> > > > > Okay, good to know, but I was hoping that there we could come up with
> > > > > an explicit list of filesystems that maintain their own private inode
> > > > > numbers outside of inode-i_ino.
> > > >
> > > > Anything using iget5_locked is a good start.  Add to that file systems
> > > > implementing their own inode cache (at least xfs and bcachefs).
> > >
> > > Also good to know, thanks.  However, at this point the lack of a clear
> > > answer is making me wonder a bit more about inode numbers in the view
> > > of VFS developers; do you folks care about inode numbers?  I'm not
> > > asking to start an argument, it's a genuine question so I can get a
> > > better understanding about the durability and sustainability of
> > > inode->i_no.  If all of you (the VFS folks) aren't concerned about
> > > inode numbers, I suspect we are going to have similar issues in the
> > > future and we (the LSM folks) likely need to move away from reporting
> > > inode numbers as they aren't reliably maintained by the VFS layer.
> > >
> >
> > Like Christoph said, the kernel doesn't care much about inode numbers.
> >
> > People care about them though, and sometimes we have things in the
> > kernel that report them in some fashion (tracepoints, procfiles, audit
> > events, etc.). Having those match what the userland stat() st_ino field
> > tells you is ideal, and for the most part that's the way it works.
> >
> > The main exception is when people use 32-bit interfaces (somewhat rare
> > these days), or they have a 32-bit kernel with a filesystem that has a
> > 64-bit inode number space (NFS being one of those). The NFS client has
> > basically hacked around this for years by tracking its own fileid field
> > in its inode.
> 
> When I asked if the VFS dev cared about inode numbers this is more of
> what I was wondering about.  Regardless of if the kernel itself uses
> inode numbers for anything, it does appear that users do care about
> inode numbers to some extent, and I wanted to know if the VFS devs
> viewed the inode numbers as a first order UAPI interface/thing, or if
> it was of lesser importance and not something the kernel was going to
> provide much of a guarantee around.  Once again, I'm not asking this
> to start a war, I'm just trying to get some perspective from the VFS
> dev side of things.

Well, we do care to not break our users. So our opinion about "first order
UAPI" doesn't matter that much. If userspace is using it, we have to
avoid breaking it. And there definitely is userspace depending on st_ino +
st_dev being unique identifier of a file / directory so we want to maintain
that as much as possible (at least as long as there's userspace depending
on it which I don't see changing in the near future).

That being said historically people have learned NFS has its quirks,
similarly as btrfs needing occasionally a special treatment and adapted to
it, bcachefs is new enough that userspace didn't notice yet, that's going
to be interesting.

There's another aspect that even 64-bits start to be expensive to pack
things into for some filesystems (either due to external protocol
constraints such as for AFS or due to the combination of features such as
subvolumes, snapshotting, etc.). Going to 128-bits for everybody seems
like a waste so at last LSF summit we've discussed about starting to push
file handles (output of name_to_handle_at(2)) as a replacement of st_ino
for file/dir identifier in a filesystem. For the kernel this would be
convenient because each filesystem can pack there what it needs. But
userspace guys were not thrilled by this (mainly due to the complexities of
dynamically sized identifier and passing it around). So this transition
isn't currently getting much traction and we'll see how things evolve.

								Honza
diff mbox series

Patch

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 542c7d97b235..5dfc176b6d92 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -83,18 +83,19 @@  EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
 /**
  * nfs_compat_user_ino64 - returns the user-visible inode number
- * @fileid: 64-bit fileid
+ * @inode: inode pointer
  *
  * This function returns a 32-bit inode number if the boot parameter
  * nfs.enable_ino64 is zero.
  */
-u64 nfs_compat_user_ino64(u64 fileid)
+u64 nfs_compat_user_ino64(const struct *inode)
 {
 #ifdef CONFIG_COMPAT
 	compat_ulong_t ino;
 #else	
 	unsigned long ino;
 #endif
+	u64 fileid = NFS_FILEID(inode);
 
 	if (enable_ino64)
 		return fileid;
@@ -103,6 +104,7 @@  u64 nfs_compat_user_ino64(u64 fileid)
 		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
 	return ino;
 }
+EXPORT_SYMBOL_GPL(nfs_compat_user_ino64);
 
 int nfs_drop_inode(struct inode *inode)
 {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 430733e3eff2..f5555a71a733 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -451,6 +451,7 @@  extern void nfs_zap_acl_cache(struct inode *inode);
 extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
 extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern u64 nfs_compat_user_ino64(const struct *inode);
 
 #if IS_ENABLED(CONFIG_NFS_LOCALIO)
 /* localio.c */
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e7494cdd957e..d9b1e0606833 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -232,11 +232,13 @@  nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 const struct inode_operations nfs_mountpoint_inode_operations = {
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
+	.get_ino	= nfs_compat_user_ino64,
 };
 
 const struct inode_operations nfs_referral_inode_operations = {
 	.getattr	= nfs_namespace_getattr,
 	.setattr	= nfs_namespace_setattr,
+	.get_ino	= nfs_compat_user_ino64,
 };
 
 static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/stat.c b/fs/stat.c
index 41e598376d7e..05636919f94b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -50,7 +50,7 @@  void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
 	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
 
 	stat->dev = inode->i_sb->s_dev;
-	stat->ino = inode->i_ino;
+	stat->ino = inode_get_ino(inode);
 	stat->mode = inode->i_mode;
 	stat->nlink = inode->i_nlink;
 	stat->uid = vfsuid_into_kuid(vfsuid);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337..0eba09a21cf7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2165,6 +2165,7 @@  struct inode_operations {
 			    struct dentry *dentry, struct fileattr *fa);
 	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
 	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
+	u64 (*get_ino)(const struct inode *inode);
 } ____cacheline_aligned;
 
 static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
@@ -2172,6 +2173,14 @@  static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }
 
+static inline u64 inode_get_ino(struct inode *inode)
+{
+	if (unlikely(inode->i_op->get_ino))
+		return inode->i_op->get_ino(inode);
+
+	return inode->i_ino;
+}
+
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,