diff mbox series

[RFC] xfs: initialise attr fork on inode create

Message ID 20201202232724.1730114-1-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series [RFC] xfs: initialise attr fork on inode create | expand

Commit Message

Dave Chinner Dec. 2, 2020, 11:27 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

When we allocate a new inode, we often need to add an attribute to
the inode as part of the create. This can happen as a result of
needing to add default ACLs or security labels before the inode is
made visible to userspace.

This is highly inefficient right now. We do the create transaction
to allocate the inode, then we do an "add attr fork" transaction to
modify the just created empty inode to set the inode fork offset to
allow attributes to be stored, then we go and do the attribute
creation.

This means 3 transactions instead of 1 to allocate an inode, and
this greatly increases the load on the CIL commit code, resulting in
excessive contention on the CIL spin locks and performance
degradation:

 18.99%  [kernel]                [k] __pv_queued_spin_lock_slowpath
  3.57%  [kernel]                [k] do_raw_spin_lock
  2.51%  [kernel]                [k] __raw_callee_save___pv_queued_spin_unlock
  2.48%  [kernel]                [k] memcpy
  2.34%  [kernel]                [k] xfs_log_commit_cil

The typical profile resulting from running fsmark on a selinux enabled
filesytem is adds this overhead to the create path:

  - 15.30% xfs_init_security
     - 15.23% security_inode_init_security
	- 13.05% xfs_initxattrs
	   - 12.94% xfs_attr_set
	      - 6.75% xfs_bmap_add_attrfork
		 - 5.51% xfs_trans_commit
		    - 5.48% __xfs_trans_commit
		       - 5.35% xfs_log_commit_cil
			  - 3.86% _raw_spin_lock
			     - do_raw_spin_lock
				  __pv_queued_spin_lock_slowpath
		 - 0.70% xfs_trans_alloc
		      0.52% xfs_trans_reserve
	      - 5.41% xfs_attr_set_args
		 - 5.39% xfs_attr_set_shortform.constprop.0
		    - 4.46% xfs_trans_commit
		       - 4.46% __xfs_trans_commit
			  - 4.33% xfs_log_commit_cil
			     - 2.74% _raw_spin_lock
				- do_raw_spin_lock
				     __pv_queued_spin_lock_slowpath
			       0.60% xfs_inode_item_format
		      0.90% xfs_attr_try_sf_addname
	- 1.99% selinux_inode_init_security
	   - 1.02% security_sid_to_context_force
	      - 1.00% security_sid_to_context_core
		 - 0.92% sidtab_entry_to_string
		    - 0.90% sidtab_sid2str_get
			 0.59% sidtab_sid2str_put.part.0
	   - 0.82% selinux_determine_inode_label
	      - 0.77% security_transition_sid
		   0.70% security_compute_sid.part.0

And fsmark creation rate performance drops by ~25%. The key point to
note here is that half the additional overhead comes from adding the
attribute fork to the newly created inode. That's crazy, considering
we can do this same thing at inode create time with a couple of
lines of code and no extra overhead.

So, if we know we are going to add an attribute immediately after
creating the inode, let's just initialise the attribute fork inside
the create transaction and chop that whole chunk of code out of
the create fast path. This completely removes the performance
drop caused by enabling SELinux, and the profile looks like:

     - 8.99% xfs_init_security
         - 9.00% security_inode_init_security
            - 6.43% xfs_initxattrs
               - 6.37% xfs_attr_set
                  - 5.45% xfs_attr_set_args
                     - 5.42% xfs_attr_set_shortform.constprop.0
                        - 4.51% xfs_trans_commit
                           - 4.54% __xfs_trans_commit
                              - 4.59% xfs_log_commit_cil
                                 - 2.67% _raw_spin_lock
                                    - 3.28% do_raw_spin_lock
                                         3.08% __pv_queued_spin_lock_slowpath
                                   0.66% xfs_inode_item_format
                        - 0.90% xfs_attr_try_sf_addname
                  - 0.60% xfs_trans_alloc
            - 2.35% selinux_inode_init_security
               - 1.25% security_sid_to_context_force
                  - 1.21% security_sid_to_context_core
                     - 1.19% sidtab_entry_to_string
                        - 1.20% sidtab_sid2str_get
                           - 0.86% sidtab_sid2str_put.part.0
                              - 0.62% _raw_spin_lock_irqsave
                                 - 0.77% do_raw_spin_lock
                                      __pv_queued_spin_lock_slowpath
               - 0.84% selinux_determine_inode_label
                  - 0.83% security_transition_sid
                       0.86% security_compute_sid.part.0

Which indicates the XFS overhead of creating the selinux xattr has
been halved. This doesn't fix the CIL lock contention problem, just
means it's not a limiting factor for this workload. Lock contention
in the security subsystems is going to be an issue soon, though...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_inode_fork.c | 20 +++++++++++++++-----
 fs/xfs/libxfs/xfs_inode_fork.h |  1 +
 fs/xfs/xfs_inode.c             | 24 ++++++++++++++++++++----
 fs/xfs/xfs_inode.h             |  5 +++--
 fs/xfs/xfs_iops.c              | 10 +++++++++-
 fs/xfs/xfs_qm.c                |  2 +-
 fs/xfs/xfs_symlink.c           |  2 +-
 7 files changed, 50 insertions(+), 14 deletions(-)

Comments

Christoph Hellwig Dec. 3, 2020, 8:40 a.m. UTC | #1
This looks pretty sensible, and pretty simple.  Why the RFC?

This looks good to me modulo a few tiny nitpicks below:

> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1414ab79eacf..75b44b82ad1f 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -126,6 +126,7 @@ xfs_cleanup_inode(
>  	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
>  }
>  
> +
>  STATIC int
>  xfs_generic_create(
>  	struct inode	*dir,

Nit: this adds a spuurious empty line.

> @@ -161,7 +162,14 @@ xfs_generic_create(
>  		goto out_free_acl;
>  
>  	if (!tmpfile) {
> -		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
> +		bool need_xattr = false;
> +
> +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
> +		    default_acl || acl)
> +			need_xattr = true;
> +
> +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
> +					need_xattr, &ip);

It might be wort to factor the condition into a little helper.  Also
I think we also have security labels for O_TMPFILE inodes, so it might
be worth plugging into that path as well.
Dave Chinner Dec. 3, 2020, 9:44 p.m. UTC | #2
On Thu, Dec 03, 2020 at 08:40:12AM +0000, Christoph Hellwig wrote:
> This looks pretty sensible, and pretty simple.  Why the RFC?
> 
> This looks good to me modulo a few tiny nitpicks below:
> 
> > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > index 1414ab79eacf..75b44b82ad1f 100644
> > --- a/fs/xfs/xfs_iops.c
> > +++ b/fs/xfs/xfs_iops.c
> > @@ -126,6 +126,7 @@ xfs_cleanup_inode(
> >  	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
> >  }
> >  
> > +
> >  STATIC int
> >  xfs_generic_create(
> >  	struct inode	*dir,
> 
> Nit: this adds a spuurious empty line.

Fixed.

> > @@ -161,7 +162,14 @@ xfs_generic_create(
> >  		goto out_free_acl;
> >  
> >  	if (!tmpfile) {
> > -		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
> > +		bool need_xattr = false;
> > +
> > +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
> > +		    default_acl || acl)
> > +			need_xattr = true;
> > +
> > +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
> > +					need_xattr, &ip);
> 
> It might be wort to factor the condition into a little helper.  Also
> I think we also have security labels for O_TMPFILE inodes, so it might
> be worth plugging into that path as well.

Yeah, a helper is a good idea - I just wanted to get some feedback
first on whether it's a good idea to peek directly at
i_sb->s_security or whether there is some other way of knowing ahead
of time that a security xattr is going to be created. I couldn't
find one, but that doesn't mean such an interface doesn't exist in
all the twisty passages of the LSM layers...

You didn't shout and run screaming, so that's a positive :)

Cheers,

Dave.
Christoph Hellwig Dec. 4, 2020, 7:54 a.m. UTC | #3
On Fri, Dec 04, 2020 at 08:44:26AM +1100, Dave Chinner wrote:
> > > +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
> > > +		    default_acl || acl)
> > > +			need_xattr = true;
> > > +
> > > +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
> > > +					need_xattr, &ip);
> > 
> > It might be wort to factor the condition into a little helper.  Also
> > I think we also have security labels for O_TMPFILE inodes, so it might
> > be worth plugging into that path as well.
> 
> Yeah, a helper is a good idea - I just wanted to get some feedback
> first on whether it's a good idea to peek directly at
> i_sb->s_security or whether there is some other way of knowing ahead
> of time that a security xattr is going to be created. I couldn't
> find one, but that doesn't mean such an interface doesn't exist in
> all the twisty passages of the LSM layers...

I've added the relevant list, maybe someone there has an opinion.
Brian Foster Dec. 4, 2020, 12:31 p.m. UTC | #4
On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When we allocate a new inode, we often need to add an attribute to
> the inode as part of the create. This can happen as a result of
> needing to add default ACLs or security labels before the inode is
> made visible to userspace.
> 
> This is highly inefficient right now. We do the create transaction
> to allocate the inode, then we do an "add attr fork" transaction to
> modify the just created empty inode to set the inode fork offset to
> allow attributes to be stored, then we go and do the attribute
> creation.
> 
> This means 3 transactions instead of 1 to allocate an inode, and
> this greatly increases the load on the CIL commit code, resulting in
> excessive contention on the CIL spin locks and performance
> degradation:
> 
>  18.99%  [kernel]                [k] __pv_queued_spin_lock_slowpath
>   3.57%  [kernel]                [k] do_raw_spin_lock
>   2.51%  [kernel]                [k] __raw_callee_save___pv_queued_spin_unlock
>   2.48%  [kernel]                [k] memcpy
>   2.34%  [kernel]                [k] xfs_log_commit_cil
> 
> The typical profile resulting from running fsmark on a selinux enabled
> filesytem is adds this overhead to the create path:
> 
...
> 
> And fsmark creation rate performance drops by ~25%. The key point to
> note here is that half the additional overhead comes from adding the
> attribute fork to the newly created inode. That's crazy, considering
> we can do this same thing at inode create time with a couple of
> lines of code and no extra overhead.
> 
> So, if we know we are going to add an attribute immediately after
> creating the inode, let's just initialise the attribute fork inside
> the create transaction and chop that whole chunk of code out of
> the create fast path. This completely removes the performance
> drop caused by enabling SELinux, and the profile looks like:
> 
...
> 
> Which indicates the XFS overhead of creating the selinux xattr has
> been halved. This doesn't fix the CIL lock contention problem, just
> means it's not a limiting factor for this workload. Lock contention
> in the security subsystems is going to be an issue soon, though...
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_inode_fork.c | 20 +++++++++++++++-----
>  fs/xfs/libxfs/xfs_inode_fork.h |  1 +
>  fs/xfs/xfs_inode.c             | 24 ++++++++++++++++++++----
>  fs/xfs/xfs_inode.h             |  5 +++--
>  fs/xfs/xfs_iops.c              | 10 +++++++++-
>  fs/xfs/xfs_qm.c                |  2 +-
>  fs/xfs/xfs_symlink.c           |  2 +-
>  7 files changed, 50 insertions(+), 14 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -918,6 +919,18 @@ xfs_ialloc(
>  		ASSERT(0);
>  	}
>  
> +	/*
> +	 * If we need to create attributes immediately after allocating the
> +	 * inode, initialise an empty attribute fork right now. We use the
> +	 * default fork offset for attributes here as we don't know exactly what
> +	 * size or how many attributes we might be adding. We can do this safely
> +	 * here because we know the data fork is completely empty right now.
> +	 */
> +	if (init_attrs) {
> +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> +	}
> +

Seems reasonable in principle, but why not refactor
xfs_bmap_add_attrfork() such that the internals (i.e. everything within
the transaction/ilock code) can be properly reused in both contexts
rather than open-coding (and thus duplicating) a somewhat stripped down
version? At a glance, it looks like there are some subtle differences in
the initial setup of the attr fork for a device node inode, for example.

Brian

>  	/*
>  	 * Log the new values stuffed into the inode.
>  	 */
> @@ -951,6 +964,7 @@ xfs_dir_ialloc(
>  	xfs_nlink_t	nlink,
>  	dev_t		rdev,
>  	prid_t		prid,		/* project id */
> +	bool		init_attrs,
>  	xfs_inode_t	**ipp)		/* pointer to inode; it will be
>  					   locked. */
>  {
> @@ -980,7 +994,7 @@ xfs_dir_ialloc(
>  	 * the inode(s) that we've just allocated.
>  	 */
>  	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
> -			&ip);
> +			init_attrs, &ip);
>  
>  	/*
>  	 * Return an error if we were unable to allocate a new inode.
> @@ -1050,7 +1064,7 @@ xfs_dir_ialloc(
>  		 * this call should always succeed.
>  		 */
>  		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
> -				  &ialloc_context, &ip);
> +				  &ialloc_context, init_attrs, &ip);
>  
>  		/*
>  		 * If we get an error at this point, return to the caller
> @@ -1112,6 +1126,7 @@ xfs_create(
>  	struct xfs_name		*name,
>  	umode_t			mode,
>  	dev_t			rdev,
> +	bool			init_attrs,
>  	xfs_inode_t		**ipp)
>  {
>  	int			is_dir = S_ISDIR(mode);
> @@ -1182,7 +1197,8 @@ xfs_create(
>  	 * entry pointing to them, but a directory also the "." entry
>  	 * pointing to itself.
>  	 */
> -	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
> +	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid,
> +				init_attrs, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> @@ -1304,7 +1320,7 @@ xfs_create_tmpfile(
>  	if (error)
>  		goto out_trans_cancel;
>  
> -	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
> +	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, false, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 751a3d1d7d84..670dfab334de 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -370,7 +370,8 @@ void		xfs_inactive(struct xfs_inode *ip);
>  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
>  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
>  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
> -			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
> +			   umode_t mode, dev_t rdev, bool need_xattr,
> +			   struct xfs_inode **ipp);
>  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
>  			   struct xfs_inode **ipp);
>  int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
> @@ -408,7 +409,7 @@ xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
>  xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
>  
>  int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
> -			       xfs_nlink_t, dev_t, prid_t,
> +			       xfs_nlink_t, dev_t, prid_t, bool need_xattr,
>  			       struct xfs_inode **);
>  
>  static inline int
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1414ab79eacf..75b44b82ad1f 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -126,6 +126,7 @@ xfs_cleanup_inode(
>  	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
>  }
>  
> +
>  STATIC int
>  xfs_generic_create(
>  	struct inode	*dir,
> @@ -161,7 +162,14 @@ xfs_generic_create(
>  		goto out_free_acl;
>  
>  	if (!tmpfile) {
> -		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
> +		bool need_xattr = false;
> +
> +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
> +		    default_acl || acl)
> +			need_xattr = true;
> +
> +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
> +					need_xattr, &ip);
>  	} else {
>  		error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
>  	}
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index b2a9abee8b2b..45faddfb4234 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -787,7 +787,7 @@ xfs_qm_qino_alloc(
>  		return error;
>  
>  	if (need_alloc) {
> -		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip);
> +		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, false, ip);
>  		if (error) {
>  			xfs_trans_cancel(tp);
>  			return error;
> diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
> index 8e88a7ca387e..20919aaea981 100644
> --- a/fs/xfs/xfs_symlink.c
> +++ b/fs/xfs/xfs_symlink.c
> @@ -224,7 +224,7 @@ xfs_symlink(
>  	 * Allocate an inode for the symlink.
>  	 */
>  	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
> -			       prid, &ip);
> +			       prid, false, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> -- 
> 2.28.0
>
Dave Chinner Dec. 4, 2020, 9:22 p.m. UTC | #5
On Fri, Dec 04, 2020 at 07:31:37AM -0500, Brian Foster wrote:
> On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > When we allocate a new inode, we often need to add an attribute to
> > the inode as part of the create. This can happen as a result of
> > needing to add default ACLs or security labels before the inode is
> > made visible to userspace.
> > 
> > This is highly inefficient right now. We do the create transaction
> > to allocate the inode, then we do an "add attr fork" transaction to
> > modify the just created empty inode to set the inode fork offset to
> > allow attributes to be stored, then we go and do the attribute
> > creation.
> > 
> > This means 3 transactions instead of 1 to allocate an inode, and
> > this greatly increases the load on the CIL commit code, resulting in
> > excessive contention on the CIL spin locks and performance
> > degradation:
> > 
> >  18.99%  [kernel]                [k] __pv_queued_spin_lock_slowpath
> >   3.57%  [kernel]                [k] do_raw_spin_lock
> >   2.51%  [kernel]                [k] __raw_callee_save___pv_queued_spin_unlock
> >   2.48%  [kernel]                [k] memcpy
> >   2.34%  [kernel]                [k] xfs_log_commit_cil
> > 
> > The typical profile resulting from running fsmark on a selinux enabled
> > filesytem is adds this overhead to the create path:
> > 
> ...
> > 
> > And fsmark creation rate performance drops by ~25%. The key point to
> > note here is that half the additional overhead comes from adding the
> > attribute fork to the newly created inode. That's crazy, considering
> > we can do this same thing at inode create time with a couple of
> > lines of code and no extra overhead.
> > 
> > So, if we know we are going to add an attribute immediately after
> > creating the inode, let's just initialise the attribute fork inside
> > the create transaction and chop that whole chunk of code out of
> > the create fast path. This completely removes the performance
> > drop caused by enabling SELinux, and the profile looks like:
> > 
> ...
> > 
> > Which indicates the XFS overhead of creating the selinux xattr has
> > been halved. This doesn't fix the CIL lock contention problem, just
> > means it's not a limiting factor for this workload. Lock contention
> > in the security subsystems is going to be an issue soon, though...
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/libxfs/xfs_inode_fork.c | 20 +++++++++++++++-----
> >  fs/xfs/libxfs/xfs_inode_fork.h |  1 +
> >  fs/xfs/xfs_inode.c             | 24 ++++++++++++++++++++----
> >  fs/xfs/xfs_inode.h             |  5 +++--
> >  fs/xfs/xfs_iops.c              | 10 +++++++++-
> >  fs/xfs/xfs_qm.c                |  2 +-
> >  fs/xfs/xfs_symlink.c           |  2 +-
> >  7 files changed, 50 insertions(+), 14 deletions(-)
> > 
> ...
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> ...
> > @@ -918,6 +919,18 @@ xfs_ialloc(
> >  		ASSERT(0);
> >  	}
> >  
> > +	/*
> > +	 * If we need to create attributes immediately after allocating the
> > +	 * inode, initialise an empty attribute fork right now. We use the
> > +	 * default fork offset for attributes here as we don't know exactly what
> > +	 * size or how many attributes we might be adding. We can do this safely
> > +	 * here because we know the data fork is completely empty right now.
> > +	 */
> > +	if (init_attrs) {
> > +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> > +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > +	}
> > +
> 
> Seems reasonable in principle, but why not refactor
> xfs_bmap_add_attrfork() such that the internals (i.e. everything within
> the transaction/ilock code) can be properly reused in both contexts
> rather than open-coding (and thus duplicating) a somewhat stripped down
> version?

We don't know the size of the attribute that is being created, so
the attr size dependent parts of it can't be used.

> At a glance, it looks like there are some subtle differences in
> the initial setup of the attr fork for a device node inode, for example.

Yes, there's a difference, but it's largely irrelevant as adding
the first attribute to a device format inode will reset the
forkoffset to the min via xfs_attr_shortform_bytesfit().

And if the attribute is larger than will fit in the default fork
offset space, but can fit the attr in shrotform by shrinking the
empty data fork space, xfs_attr_shortform_bytesfit() will do that as
well. IOWs, we only need to set a non-zero fork offset here and init
the ip->i_afp pointer - immediately setting an attribute on the
empty inode literal area will do the rest for the fork offset setup
for us...

Cheers,

Dave.
Brian Foster Dec. 5, 2020, 11:34 a.m. UTC | #6
On Sat, Dec 05, 2020 at 08:22:22AM +1100, Dave Chinner wrote:
> On Fri, Dec 04, 2020 at 07:31:37AM -0500, Brian Foster wrote:
> > On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > When we allocate a new inode, we often need to add an attribute to
> > > the inode as part of the create. This can happen as a result of
> > > needing to add default ACLs or security labels before the inode is
> > > made visible to userspace.
> > > 
> > > This is highly inefficient right now. We do the create transaction
> > > to allocate the inode, then we do an "add attr fork" transaction to
> > > modify the just created empty inode to set the inode fork offset to
> > > allow attributes to be stored, then we go and do the attribute
> > > creation.
> > > 
> > > This means 3 transactions instead of 1 to allocate an inode, and
> > > this greatly increases the load on the CIL commit code, resulting in
> > > excessive contention on the CIL spin locks and performance
> > > degradation:
> > > 
> > >  18.99%  [kernel]                [k] __pv_queued_spin_lock_slowpath
> > >   3.57%  [kernel]                [k] do_raw_spin_lock
> > >   2.51%  [kernel]                [k] __raw_callee_save___pv_queued_spin_unlock
> > >   2.48%  [kernel]                [k] memcpy
> > >   2.34%  [kernel]                [k] xfs_log_commit_cil
> > > 
> > > The typical profile resulting from running fsmark on a selinux enabled
> > > filesytem is adds this overhead to the create path:
> > > 
> > ...
> > > 
> > > And fsmark creation rate performance drops by ~25%. The key point to
> > > note here is that half the additional overhead comes from adding the
> > > attribute fork to the newly created inode. That's crazy, considering
> > > we can do this same thing at inode create time with a couple of
> > > lines of code and no extra overhead.
> > > 
> > > So, if we know we are going to add an attribute immediately after
> > > creating the inode, let's just initialise the attribute fork inside
> > > the create transaction and chop that whole chunk of code out of
> > > the create fast path. This completely removes the performance
> > > drop caused by enabling SELinux, and the profile looks like:
> > > 
> > ...
> > > 
> > > Which indicates the XFS overhead of creating the selinux xattr has
> > > been halved. This doesn't fix the CIL lock contention problem, just
> > > means it's not a limiting factor for this workload. Lock contention
> > > in the security subsystems is going to be an issue soon, though...
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_inode_fork.c | 20 +++++++++++++++-----
> > >  fs/xfs/libxfs/xfs_inode_fork.h |  1 +
> > >  fs/xfs/xfs_inode.c             | 24 ++++++++++++++++++++----
> > >  fs/xfs/xfs_inode.h             |  5 +++--
> > >  fs/xfs/xfs_iops.c              | 10 +++++++++-
> > >  fs/xfs/xfs_qm.c                |  2 +-
> > >  fs/xfs/xfs_symlink.c           |  2 +-
> > >  7 files changed, 50 insertions(+), 14 deletions(-)
> > > 
> > ...
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > ...
> > > @@ -918,6 +919,18 @@ xfs_ialloc(
> > >  		ASSERT(0);
> > >  	}
> > >  
> > > +	/*
> > > +	 * If we need to create attributes immediately after allocating the
> > > +	 * inode, initialise an empty attribute fork right now. We use the
> > > +	 * default fork offset for attributes here as we don't know exactly what
> > > +	 * size or how many attributes we might be adding. We can do this safely
> > > +	 * here because we know the data fork is completely empty right now.
> > > +	 */
> > > +	if (init_attrs) {
> > > +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> > > +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > > +	}
> > > +
> > 
> > Seems reasonable in principle, but why not refactor
> > xfs_bmap_add_attrfork() such that the internals (i.e. everything within
> > the transaction/ilock code) can be properly reused in both contexts
> > rather than open-coding (and thus duplicating) a somewhat stripped down
> > version?
> 
> We don't know the size of the attribute that is being created, so
> the attr size dependent parts of it can't be used.
> 

Not sure I see the problem here. It looks to me that
xfs_bmap_add_attrfork() would do the right thing if we just passed a
size of zero. The only place the size value is actually used is down in
xfs_attr_shortform_bytesfit(), and I'd expect that to identify that the
requested size is <= than the current afork size (also zero for a newly
allocated inode..?) and bail out.

That said, I wouldn't be opposed to tweaking xfs_bmap_set_attrforkoff()
by a line or two to just skip the shortform call if size == 0. Then we
can be more explicit about the "size == 0 means preemptive fork alloc,
use the default offset" use case and perhaps actually document it with
some comments as well.

Brian

> > At a glance, it looks like there are some subtle differences in
> > the initial setup of the attr fork for a device node inode, for example.
> 
> Yes, there's a difference, but it's largely irrelevant as adding
> the first attribute to a device format inode will reset the
> forkoffset to the min via xfs_attr_shortform_bytesfit().
> 
> And if the attribute is larger than will fit in the default fork
> offset space, but can fit the attr in shrotform by shrinking the
> empty data fork space, xfs_attr_shortform_bytesfit() will do that as
> well. IOWs, we only need to set a non-zero fork offset here and init
> the ip->i_afp pointer - immediately setting an attribute on the
> empty inode literal area will do the rest for the fork offset setup
> for us...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner Dec. 6, 2020, 11:33 p.m. UTC | #7
On Sat, Dec 05, 2020 at 06:34:44AM -0500, Brian Foster wrote:
> On Sat, Dec 05, 2020 at 08:22:22AM +1100, Dave Chinner wrote:
> > On Fri, Dec 04, 2020 at 07:31:37AM -0500, Brian Foster wrote:
> > > On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > ...
> > > > @@ -918,6 +919,18 @@ xfs_ialloc(
> > > >  		ASSERT(0);
> > > >  	}
> > > >  
> > > > +	/*
> > > > +	 * If we need to create attributes immediately after allocating the
> > > > +	 * inode, initialise an empty attribute fork right now. We use the
> > > > +	 * default fork offset for attributes here as we don't know exactly what
> > > > +	 * size or how many attributes we might be adding. We can do this safely
> > > > +	 * here because we know the data fork is completely empty right now.
> > > > +	 */
> > > > +	if (init_attrs) {
> > > > +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> > > > +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > > > +	}
> > > > +
> > > 
> > > Seems reasonable in principle, but why not refactor
> > > xfs_bmap_add_attrfork() such that the internals (i.e. everything within
> > > the transaction/ilock code) can be properly reused in both contexts
> > > rather than open-coding (and thus duplicating) a somewhat stripped down
> > > version?
> > 
> > We don't know the size of the attribute that is being created, so
> > the attr size dependent parts of it can't be used.
> 
> Not sure I see the problem here. It looks to me that
> xfs_bmap_add_attrfork() would do the right thing if we just passed a
> size of zero.

Yes, but it also does an awful lot that we do not need.

> The only place the size value is actually used is down in
> xfs_attr_shortform_bytesfit(), and I'd expect that to identify that the
> requested size is <= than the current afork size (also zero for a newly
> allocated inode..?) and bail out.

RIght it ends up doing that because an uninitialised inode fork
(di_forkoff = 0) is the same size as the requested size of zero, and
then it does ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;

But that's decided another two function calls deep, after a lot of
branches and shifting and comparisons to determine that the attr
fork is empty. Yet we already know that the attr fork is empty here
so all that extra CPU work is completely unnecessary.

Keep in mind we do exactly the same thing in
xfs_bmap_forkoff_reset(). We don't care about all the setup stuff in
xfs_bmap_add_attrfork(), we just reset the attr fork offset to the
default if the attr fork had grown larger than the default offset.

> That said, I wouldn't be opposed to tweaking xfs_bmap_set_attrforkoff()
> by a line or two to just skip the shortform call if size == 0. Then we
> can be more explicit about the "size == 0 means preemptive fork alloc,
> use the default offset" use case and perhaps actually document it with
> some comments as well.

It just seems wrong to me to code a special case into some function
to optimise that special case when the code that needs the special
case has no need to call that function in the first place.....

Cheers,

Dave.
Brian Foster Dec. 7, 2020, 4:30 p.m. UTC | #8
On Mon, Dec 07, 2020 at 10:33:22AM +1100, Dave Chinner wrote:
> On Sat, Dec 05, 2020 at 06:34:44AM -0500, Brian Foster wrote:
> > On Sat, Dec 05, 2020 at 08:22:22AM +1100, Dave Chinner wrote:
> > > On Fri, Dec 04, 2020 at 07:31:37AM -0500, Brian Foster wrote:
> > > > On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > > index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> > > > > --- a/fs/xfs/xfs_inode.c
> > > > > +++ b/fs/xfs/xfs_inode.c
> > > > ...
> > > > > @@ -918,6 +919,18 @@ xfs_ialloc(
> > > > >  		ASSERT(0);
> > > > >  	}
> > > > >  
> > > > > +	/*
> > > > > +	 * If we need to create attributes immediately after allocating the
> > > > > +	 * inode, initialise an empty attribute fork right now. We use the
> > > > > +	 * default fork offset for attributes here as we don't know exactly what
> > > > > +	 * size or how many attributes we might be adding. We can do this safely
> > > > > +	 * here because we know the data fork is completely empty right now.
> > > > > +	 */
> > > > > +	if (init_attrs) {
> > > > > +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> > > > > +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > > > > +	}
> > > > > +
> > > > 
> > > > Seems reasonable in principle, but why not refactor
> > > > xfs_bmap_add_attrfork() such that the internals (i.e. everything within
> > > > the transaction/ilock code) can be properly reused in both contexts
> > > > rather than open-coding (and thus duplicating) a somewhat stripped down
> > > > version?
> > > 
> > > We don't know the size of the attribute that is being created, so
> > > the attr size dependent parts of it can't be used.
> > 
> > Not sure I see the problem here. It looks to me that
> > xfs_bmap_add_attrfork() would do the right thing if we just passed a
> > size of zero.
> 
> Yes, but it also does an awful lot that we do not need.
> 

Hence the suggestion to refactor it..

> > The only place the size value is actually used is down in
> > xfs_attr_shortform_bytesfit(), and I'd expect that to identify that the
> > requested size is <= than the current afork size (also zero for a newly
> > allocated inode..?) and bail out.
> 
> RIght it ends up doing that because an uninitialised inode fork
> (di_forkoff = 0) is the same size as the requested size of zero, and
> then it does ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> 
> But that's decided another two function calls deep, after a lot of
> branches and shifting and comparisons to determine that the attr
> fork is empty. Yet we already know that the attr fork is empty here
> so all that extra CPU work is completely unnecessary.
> 

xfs_bmap_add_attrfork() already asserts that the attr fork is
nonexistent at the very top of the function, for one. The 25-30 lines of
that function that we need can be trivially lifted out into a new helper
that can equally as trivially accommodate the size == 0 case and skip
all those shortform calculations.

> Keep in mind we do exactly the same thing in
> xfs_bmap_forkoff_reset(). We don't care about all the setup stuff in
> xfs_bmap_add_attrfork(), we just reset the attr fork offset to the
> default if the attr fork had grown larger than the default offset.
> 

I'm not arguing that the attr fork needs to be set up in a particular
way on initial creation. I'm arguing that we don't need yet a third
unique code path to set/initialize a default/empty attr fork. We can
slowly normalize them all to the _reset() technique you're effectively
reimplementing here if that works well enough and is preferred...

> > That said, I wouldn't be opposed to tweaking xfs_bmap_set_attrforkoff()
> > by a line or two to just skip the shortform call if size == 0. Then we
> > can be more explicit about the "size == 0 means preemptive fork alloc,
> > use the default offset" use case and perhaps actually document it with
> > some comments as well.
> 
> It just seems wrong to me to code a special case into some function
> to optimise that special case when the code that needs the special
> case has no need to call that function in the first place.....
> 

I'm not sure what's so odd or controversial about refactoring and
reusing an existing operational (i.e. add fork) function to facilitate
review and future maintenance of that particular operation being
performed from new and various contexts. And speaking in generalities
like this just obfuscates and overcomplicates the argument. Let's be
clear, we're literally arguing over a delta that would look something
like this:

xfs_bmap_set_attrforkoff()
{
...
+		if (size)
-               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+			ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
                if (!ip->i_d.di_forkoff)
                        ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
		...
}

Given all of that, I'm not convinced this is nearly the problem you seem
to insinuate, yet I also don't think I'll convince you otherwise so it's
probably not worth continuing to debate. You have my feedback, I'll let
others determine how this patch comes together from here...

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Darrick J. Wong Dec. 7, 2020, 5:12 p.m. UTC | #9
On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When we allocate a new inode, we often need to add an attribute to
> the inode as part of the create. This can happen as a result of
> needing to add default ACLs or security labels before the inode is
> made visible to userspace.
> 
> This is highly inefficient right now. We do the create transaction
> to allocate the inode, then we do an "add attr fork" transaction to
> modify the just created empty inode to set the inode fork offset to
> allow attributes to be stored, then we go and do the attribute
> creation.
> 
> This means 3 transactions instead of 1 to allocate an inode, and
> this greatly increases the load on the CIL commit code, resulting in
> excessive contention on the CIL spin locks and performance
> degradation:
> 
>  18.99%  [kernel]                [k] __pv_queued_spin_lock_slowpath
>   3.57%  [kernel]                [k] do_raw_spin_lock
>   2.51%  [kernel]                [k] __raw_callee_save___pv_queued_spin_unlock
>   2.48%  [kernel]                [k] memcpy
>   2.34%  [kernel]                [k] xfs_log_commit_cil
> 
> The typical profile resulting from running fsmark on a selinux enabled
> filesytem is adds this overhead to the create path:
> 
>   - 15.30% xfs_init_security
>      - 15.23% security_inode_init_security
> 	- 13.05% xfs_initxattrs
> 	   - 12.94% xfs_attr_set
> 	      - 6.75% xfs_bmap_add_attrfork
> 		 - 5.51% xfs_trans_commit
> 		    - 5.48% __xfs_trans_commit
> 		       - 5.35% xfs_log_commit_cil
> 			  - 3.86% _raw_spin_lock
> 			     - do_raw_spin_lock
> 				  __pv_queued_spin_lock_slowpath
> 		 - 0.70% xfs_trans_alloc
> 		      0.52% xfs_trans_reserve
> 	      - 5.41% xfs_attr_set_args
> 		 - 5.39% xfs_attr_set_shortform.constprop.0
> 		    - 4.46% xfs_trans_commit
> 		       - 4.46% __xfs_trans_commit
> 			  - 4.33% xfs_log_commit_cil
> 			     - 2.74% _raw_spin_lock
> 				- do_raw_spin_lock
> 				     __pv_queued_spin_lock_slowpath
> 			       0.60% xfs_inode_item_format
> 		      0.90% xfs_attr_try_sf_addname
> 	- 1.99% selinux_inode_init_security
> 	   - 1.02% security_sid_to_context_force
> 	      - 1.00% security_sid_to_context_core
> 		 - 0.92% sidtab_entry_to_string
> 		    - 0.90% sidtab_sid2str_get
> 			 0.59% sidtab_sid2str_put.part.0
> 	   - 0.82% selinux_determine_inode_label
> 	      - 0.77% security_transition_sid
> 		   0.70% security_compute_sid.part.0
> 
> And fsmark creation rate performance drops by ~25%. The key point to
> note here is that half the additional overhead comes from adding the
> attribute fork to the newly created inode. That's crazy, considering
> we can do this same thing at inode create time with a couple of
> lines of code and no extra overhead.
> 
> So, if we know we are going to add an attribute immediately after
> creating the inode, let's just initialise the attribute fork inside
> the create transaction and chop that whole chunk of code out of
> the create fast path. This completely removes the performance
> drop caused by enabling SELinux, and the profile looks like:
> 
>      - 8.99% xfs_init_security
>          - 9.00% security_inode_init_security
>             - 6.43% xfs_initxattrs
>                - 6.37% xfs_attr_set
>                   - 5.45% xfs_attr_set_args
>                      - 5.42% xfs_attr_set_shortform.constprop.0
>                         - 4.51% xfs_trans_commit
>                            - 4.54% __xfs_trans_commit
>                               - 4.59% xfs_log_commit_cil
>                                  - 2.67% _raw_spin_lock
>                                     - 3.28% do_raw_spin_lock
>                                          3.08% __pv_queued_spin_lock_slowpath
>                                    0.66% xfs_inode_item_format
>                         - 0.90% xfs_attr_try_sf_addname
>                   - 0.60% xfs_trans_alloc
>             - 2.35% selinux_inode_init_security
>                - 1.25% security_sid_to_context_force
>                   - 1.21% security_sid_to_context_core
>                      - 1.19% sidtab_entry_to_string
>                         - 1.20% sidtab_sid2str_get
>                            - 0.86% sidtab_sid2str_put.part.0
>                               - 0.62% _raw_spin_lock_irqsave
>                                  - 0.77% do_raw_spin_lock
>                                       __pv_queued_spin_lock_slowpath
>                - 0.84% selinux_determine_inode_label
>                   - 0.83% security_transition_sid
>                        0.86% security_compute_sid.part.0
> 
> Which indicates the XFS overhead of creating the selinux xattr has
> been halved. This doesn't fix the CIL lock contention problem, just
> means it's not a limiting factor for this workload. Lock contention
> in the security subsystems is going to be an issue soon, though...
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_inode_fork.c | 20 +++++++++++++++-----
>  fs/xfs/libxfs/xfs_inode_fork.h |  1 +
>  fs/xfs/xfs_inode.c             | 24 ++++++++++++++++++++----
>  fs/xfs/xfs_inode.h             |  5 +++--
>  fs/xfs/xfs_iops.c              | 10 +++++++++-
>  fs/xfs/xfs_qm.c                |  2 +-
>  fs/xfs/xfs_symlink.c           |  2 +-
>  7 files changed, 50 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> index 7575de5cecb1..5d5b466bd787 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.c
> +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> @@ -280,6 +280,19 @@ xfs_dfork_attr_shortform_size(
>  	return be16_to_cpu(atp->hdr.totsize);
>  }
>  
> +struct xfs_ifork *
> +xfs_ifork_alloc(
> +	int8_t			format,

enum xfs_dinode_fmt	format, ?

> +	xfs_extnum_t		nextents)
> +{
> +	struct xfs_ifork	*ifp;
> +
> +	ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
> +	ifp->if_format = format;
> +	ifp->if_nextents = nextents;
> +	return ifp;
> +}
> +
>  int
>  xfs_iformat_attr_fork(
>  	struct xfs_inode	*ip,
> @@ -291,11 +304,8 @@ xfs_iformat_attr_fork(
>  	 * Initialize the extent count early, as the per-format routines may
>  	 * depend on it.
>  	 */
> -	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
> -	ip->i_afp->if_format = dip->di_aformat;
> -	if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */
> -		ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;

Can we still mount an irix 6.1 filesystem?  I would guess not, but
removing old cruft is a separate patch.  Granted you did say RFC...

> -	ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents);
> +	ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
> +				be16_to_cpu(dip->di_anextents));
>  
>  	switch (ip->i_afp->if_format) {
>  	case XFS_DINODE_FMT_LOCAL:
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
> index a4953e95c4f3..3ad088c6a9d2 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.h
> +++ b/fs/xfs/libxfs/xfs_inode_fork.h
> @@ -80,6 +80,7 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
>  	return ifp->if_format;
>  }
>  
> +struct xfs_ifork *xfs_ifork_alloc(int8_t format, xfs_extnum_t nextents);
>  struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
>  
>  int		xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -800,6 +800,7 @@ xfs_ialloc(
>  	dev_t		rdev,
>  	prid_t		prid,
>  	xfs_buf_t	**ialloc_context,
> +	bool		init_attrs,

init_xattrs ?  Just to distinguish them from regular file attrs like
mode/size/owner/etc.

>  	xfs_inode_t	**ipp)
>  {
>  	struct xfs_mount *mp = tp->t_mountp;
> @@ -918,6 +919,18 @@ xfs_ialloc(
>  		ASSERT(0);
>  	}
>  
> +	/*
> +	 * If we need to create attributes immediately after allocating the
> +	 * inode, initialise an empty attribute fork right now. We use the
> +	 * default fork offset for attributes here as we don't know exactly what
> +	 * size or how many attributes we might be adding. We can do this safely
> +	 * here because we know the data fork is completely empty right now.
> +	 */
> +	if (init_attrs) {
> +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> +	}
> +
>  	/*
>  	 * Log the new values stuffed into the inode.
>  	 */
> @@ -951,6 +964,7 @@ xfs_dir_ialloc(
>  	xfs_nlink_t	nlink,
>  	dev_t		rdev,
>  	prid_t		prid,		/* project id */
> +	bool		init_attrs,
>  	xfs_inode_t	**ipp)		/* pointer to inode; it will be
>  					   locked. */
>  {
> @@ -980,7 +994,7 @@ xfs_dir_ialloc(
>  	 * the inode(s) that we've just allocated.
>  	 */
>  	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
> -			&ip);
> +			init_attrs, &ip);
>  
>  	/*
>  	 * Return an error if we were unable to allocate a new inode.
> @@ -1050,7 +1064,7 @@ xfs_dir_ialloc(
>  		 * this call should always succeed.
>  		 */
>  		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
> -				  &ialloc_context, &ip);
> +				  &ialloc_context, init_attrs, &ip);
>  
>  		/*
>  		 * If we get an error at this point, return to the caller
> @@ -1112,6 +1126,7 @@ xfs_create(
>  	struct xfs_name		*name,
>  	umode_t			mode,
>  	dev_t			rdev,
> +	bool			init_attrs,
>  	xfs_inode_t		**ipp)
>  {
>  	int			is_dir = S_ISDIR(mode);
> @@ -1182,7 +1197,8 @@ xfs_create(
>  	 * entry pointing to them, but a directory also the "." entry
>  	 * pointing to itself.
>  	 */
> -	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
> +	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid,
> +				init_attrs, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> @@ -1304,7 +1320,7 @@ xfs_create_tmpfile(
>  	if (error)
>  		goto out_trans_cancel;
>  
> -	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
> +	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, false, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 751a3d1d7d84..670dfab334de 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -370,7 +370,8 @@ void		xfs_inactive(struct xfs_inode *ip);
>  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
>  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
>  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
> -			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
> +			   umode_t mode, dev_t rdev, bool need_xattr,
> +			   struct xfs_inode **ipp);
>  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
>  			   struct xfs_inode **ipp);
>  int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
> @@ -408,7 +409,7 @@ xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
>  xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
>  
>  int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
> -			       xfs_nlink_t, dev_t, prid_t,
> +			       xfs_nlink_t, dev_t, prid_t, bool need_xattr,
>  			       struct xfs_inode **);
>  
>  static inline int
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1414ab79eacf..75b44b82ad1f 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -126,6 +126,7 @@ xfs_cleanup_inode(
>  	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
>  }
>  
> +
>  STATIC int
>  xfs_generic_create(
>  	struct inode	*dir,
> @@ -161,7 +162,14 @@ xfs_generic_create(
>  		goto out_free_acl;
>  
>  	if (!tmpfile) {
> -		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
> +		bool need_xattr = false;
> +
> +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
> +		    default_acl || acl)
> +			need_xattr = true;

This should be a separate predicate in case the logic gets more twisty
later.

> +
> +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
> +					need_xattr, &ip);
>  	} else {
>  		error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);

/me bets O_TMPFILE files can have xattr-based security labels applied to
them too, right?

>  	}
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index b2a9abee8b2b..45faddfb4234 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -787,7 +787,7 @@ xfs_qm_qino_alloc(
>  		return error;
>  
>  	if (need_alloc) {
> -		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip);
> +		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, false, ip);
>  		if (error) {
>  			xfs_trans_cancel(tp);
>  			return error;
> diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
> index 8e88a7ca387e..20919aaea981 100644
> --- a/fs/xfs/xfs_symlink.c
> +++ b/fs/xfs/xfs_symlink.c
> @@ -224,7 +224,7 @@ xfs_symlink(
>  	 * Allocate an inode for the symlink.
>  	 */
>  	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
> -			       prid, &ip);
> +			       prid, false, &ip);
>  	if (error)
>  		goto out_trans_cancel;
>  
> -- 
> 2.28.0
>
Darrick J. Wong Dec. 7, 2020, 5:18 p.m. UTC | #10
On Mon, Dec 07, 2020 at 11:30:18AM -0500, Brian Foster wrote:
> On Mon, Dec 07, 2020 at 10:33:22AM +1100, Dave Chinner wrote:
> > On Sat, Dec 05, 2020 at 06:34:44AM -0500, Brian Foster wrote:
> > > On Sat, Dec 05, 2020 at 08:22:22AM +1100, Dave Chinner wrote:
> > > > On Fri, Dec 04, 2020 at 07:31:37AM -0500, Brian Foster wrote:
> > > > > On Thu, Dec 03, 2020 at 10:27:24AM +1100, Dave Chinner wrote:
> > > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > > > index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
> > > > > > --- a/fs/xfs/xfs_inode.c
> > > > > > +++ b/fs/xfs/xfs_inode.c
> > > > > ...
> > > > > > @@ -918,6 +919,18 @@ xfs_ialloc(
> > > > > >  		ASSERT(0);
> > > > > >  	}
> > > > > >  
> > > > > > +	/*
> > > > > > +	 * If we need to create attributes immediately after allocating the
> > > > > > +	 * inode, initialise an empty attribute fork right now. We use the
> > > > > > +	 * default fork offset for attributes here as we don't know exactly what
> > > > > > +	 * size or how many attributes we might be adding. We can do this safely
> > > > > > +	 * here because we know the data fork is completely empty right now.
> > > > > > +	 */
> > > > > > +	if (init_attrs) {
> > > > > > +		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
> > > > > > +		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > > > > > +	}
> > > > > > +
> > > > > 
> > > > > Seems reasonable in principle, but why not refactor
> > > > > xfs_bmap_add_attrfork() such that the internals (i.e. everything within
> > > > > the transaction/ilock code) can be properly reused in both contexts
> > > > > rather than open-coding (and thus duplicating) a somewhat stripped down
> > > > > version?
> > > > 
> > > > We don't know the size of the attribute that is being created, so
> > > > the attr size dependent parts of it can't be used.
> > > 
> > > Not sure I see the problem here. It looks to me that
> > > xfs_bmap_add_attrfork() would do the right thing if we just passed a
> > > size of zero.
> > 
> > Yes, but it also does an awful lot that we do not need.
> > 
> 
> Hence the suggestion to refactor it..
> 
> > > The only place the size value is actually used is down in
> > > xfs_attr_shortform_bytesfit(), and I'd expect that to identify that the
> > > requested size is <= than the current afork size (also zero for a newly
> > > allocated inode..?) and bail out.
> > 
> > RIght it ends up doing that because an uninitialised inode fork
> > (di_forkoff = 0) is the same size as the requested size of zero, and
> > then it does ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> > 
> > But that's decided another two function calls deep, after a lot of
> > branches and shifting and comparisons to determine that the attr
> > fork is empty. Yet we already know that the attr fork is empty here
> > so all that extra CPU work is completely unnecessary.
> > 
> 
> xfs_bmap_add_attrfork() already asserts that the attr fork is
> nonexistent at the very top of the function, for one. The 25-30 lines of
> that function that we need can be trivially lifted out into a new helper
> that can equally as trivially accommodate the size == 0 case and skip
> all those shortform calculations.
> 
> > Keep in mind we do exactly the same thing in
> > xfs_bmap_forkoff_reset(). We don't care about all the setup stuff in
> > xfs_bmap_add_attrfork(), we just reset the attr fork offset to the
> > default if the attr fork had grown larger than the default offset.
> > 
> 
> I'm not arguing that the attr fork needs to be set up in a particular
> way on initial creation. I'm arguing that we don't need yet a third
> unique code path to set/initialize a default/empty attr fork. We can
> slowly normalize them all to the _reset() technique you're effectively
> reimplementing here if that works well enough and is preferred...
> 
> > > That said, I wouldn't be opposed to tweaking xfs_bmap_set_attrforkoff()
> > > by a line or two to just skip the shortform call if size == 0. Then we
> > > can be more explicit about the "size == 0 means preemptive fork alloc,
> > > use the default offset" use case and perhaps actually document it with
> > > some comments as well.
> > 
> > It just seems wrong to me to code a special case into some function
> > to optimise that special case when the code that needs the special
> > case has no need to call that function in the first place.....
> > 
> 
> I'm not sure what's so odd or controversial about refactoring and
> reusing an existing operational (i.e. add fork) function to facilitate
> review and future maintenance of that particular operation being
> performed from new and various contexts. And speaking in generalities
> like this just obfuscates and overcomplicates the argument. Let's be
> clear, we're literally arguing over a delta that would look something
> like this:
> 
> xfs_bmap_set_attrforkoff()
> {
> ...
> +		if (size)
> -               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
> +			ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
>                 if (!ip->i_d.di_forkoff)
>                         ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
> 		...
> }
> 
> Given all of that, I'm not convinced this is nearly the problem you seem
> to insinuate, yet I also don't think I'll convince you otherwise so it's
> probably not worth continuing to debate. You have my feedback, I'll let
> others determine how this patch comes together from here...

Alternate take: If you /are/ going to have a "init empty attr fork"
shortcut function, can it at least be placed next to the regular one in
xfs_bmap.c so that the di_forkoff setters are only split across three
source files instead of four?

--D

> Brian
> 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
> > 
>
Casey Schaufler Dec. 7, 2020, 5:22 p.m. UTC | #11
On 12/3/2020 11:54 PM, Christoph Hellwig wrote:
> On Fri, Dec 04, 2020 at 08:44:26AM +1100, Dave Chinner wrote:
>>>> +		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
>>>> +		    default_acl || acl)
>>>> +			need_xattr = true;
>>>> +
>>>> +		error = xfs_create(XFS_I(dir), &name, mode, rdev,
>>>> +					need_xattr, &ip);
>>> It might be wort to factor the condition into a little helper.  Also
>>> I think we also have security labels for O_TMPFILE inodes, so it might
>>> be worth plugging into that path as well.
>> Yeah, a helper is a good idea - I just wanted to get some feedback
>> first on whether it's a good idea to peek directly at
>> i_sb->s_security

Only security modules should ever look at what's in the security blob.
In fact, you can't assume that the presence of a security blob
(i.e. ...->s_security != NULL) implies "need_xattr", or any other
state for the superblock.

>>  or whether there is some other way of knowing ahead
>> of time that a security xattr is going to be created. I couldn't
>> find one, but that doesn't mean such an interface doesn't exist in
>> all the twisty passages of the LSM layers...
> I've added the relevant list, maybe someone there has an opinion.

How is what you're looking for different from security_ismaclabel() ?
Christoph Hellwig Dec. 7, 2020, 5:25 p.m. UTC | #12
On Mon, Dec 07, 2020 at 09:22:13AM -0800, Casey Schaufler wrote:
> Only security modules should ever look at what's in the security blob.
> In fact, you can't assume that the presence of a security blob
> (i.e. ...->s_security != NULL) implies "need_xattr", or any other
> state for the superblock.

Maybe "strongly suggests that an xattr will be added" is the better
wording.

> 
> >>  or whether there is some other way of knowing ahead
> >> of time that a security xattr is going to be created. I couldn't
> >> find one, but that doesn't mean such an interface doesn't exist in
> >> all the twisty passages of the LSM layers...
> > I've added the relevant list, maybe someone there has an opinion.
> 
> How is what you're looking for different from security_ismaclabel() ?

Not at all.  What this needs is a guestimate (which doesn't have
to be 100% reliable) that a new inode created by ->create, ->mknod,
or ->mkdir will have an xattr set on it during the creation syscall.
Christoph Hellwig Dec. 7, 2020, 5:31 p.m. UTC | #13
Btw, while looking at the code before replying to Casey I noticed
something else in this area of code which we should probably fix
if we touch all this.  We are really supposed to create the ACLs
and security labels atomically with the actual inode creation.  And
I think we have all the infrastructure to do this without too much
pain now for ACLs.  Security labels with the weird
security_inode_init_security interface might be a little harder but
not impossible.

And I suspect security_inode_init_security might be right thing
to reuse for the helper to figure out what attrs would be set.  If
security_inode_init_security with an idempotent callback is
idempotent itself we might be able to use it directly, but all the
weird hooking makes it rather hard to read.
Dave Chinner Dec. 7, 2020, 8:42 p.m. UTC | #14
On Mon, Dec 07, 2020 at 05:31:11PM +0000, Christoph Hellwig wrote:
> Btw, while looking at the code before replying to Casey I noticed
> something else in this area of code which we should probably fix
> if we touch all this.  We are really supposed to create the ACLs
> and security labels atomically with the actual inode creation.  And
> I think we have all the infrastructure to do this without too much
> pain now for ACLs.  Security labels with the weird
> security_inode_init_security interface might be a little harder but
> not impossible.

Yes, that's long been a known problem - it was a problem way back in
the day for DMF and the DMAPI attributes that needed to be added at
create time. That's where XFS_TRANS_RESERVE originally came from, so
that ENOSPC didn't prevent the dmapi xattr from being added to a
newly created inode.

This atomicity problem is one of the things that Allison's attribute
defer-op rework is intended to address.  i.e. being able to
atomically create xattrs at inode create time and have them fully
recoverable by intent replay if the initial inode allocation and
directory linkage transaction succeeds.

Essential the transaction context would start in
xfs_generic_create(), not xfs_create(), and roll through to the
xattr creations for ACLs and security contexts...

> And I suspect security_inode_init_security might be right thing
> to reuse for the helper to figure out what attrs would be set.

Problem is that it appears to need an inode to already be allocated
and instantiated to work....

> If
> security_inode_init_security with an idempotent callback is
> idempotent itself we might be able to use it directly, but all the
> weird hooking makes it rather hard to read.

Yeah, it's like a layer of inscrutible obscurity has been added to a
simple ops table... :/

Cheers,

Dave.
Dave Chinner Dec. 7, 2020, 8:49 p.m. UTC | #15
On Mon, Dec 07, 2020 at 05:25:45PM +0000, Christoph Hellwig wrote:
> On Mon, Dec 07, 2020 at 09:22:13AM -0800, Casey Schaufler wrote:
> > Only security modules should ever look at what's in the security blob.
> > In fact, you can't assume that the presence of a security blob
> > (i.e. ...->s_security != NULL) implies "need_xattr", or any other
> > state for the superblock.
> 
> Maybe "strongly suggests that an xattr will be added" is the better
> wording.

Right, I did this knowing that only selinux and smack actually use
sb->s_security so it's not 100% reliable. However, these are also
the only two security modules that hook inode_init_security and
create xattrs.

So it seems like peeking at ->s_security here gives us a fairly
reliable indicator that we're going to have to create xattrs on this
new inode before we complete the create process...

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7575de5cecb1..5d5b466bd787 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -280,6 +280,19 @@  xfs_dfork_attr_shortform_size(
 	return be16_to_cpu(atp->hdr.totsize);
 }
 
+struct xfs_ifork *
+xfs_ifork_alloc(
+	int8_t			format,
+	xfs_extnum_t		nextents)
+{
+	struct xfs_ifork	*ifp;
+
+	ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
+	ifp->if_format = format;
+	ifp->if_nextents = nextents;
+	return ifp;
+}
+
 int
 xfs_iformat_attr_fork(
 	struct xfs_inode	*ip,
@@ -291,11 +304,8 @@  xfs_iformat_attr_fork(
 	 * Initialize the extent count early, as the per-format routines may
 	 * depend on it.
 	 */
-	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
-	ip->i_afp->if_format = dip->di_aformat;
-	if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */
-		ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
-	ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents);
+	ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
+				be16_to_cpu(dip->di_anextents));
 
 	switch (ip->i_afp->if_format) {
 	case XFS_DINODE_FMT_LOCAL:
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a4953e95c4f3..3ad088c6a9d2 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -80,6 +80,7 @@  static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
 	return ifp->if_format;
 }
 
+struct xfs_ifork *xfs_ifork_alloc(int8_t format, xfs_extnum_t nextents);
 struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
 
 int		xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2bfbcf28b1bd..9ee2e0b4c6fd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -800,6 +800,7 @@  xfs_ialloc(
 	dev_t		rdev,
 	prid_t		prid,
 	xfs_buf_t	**ialloc_context,
+	bool		init_attrs,
 	xfs_inode_t	**ipp)
 {
 	struct xfs_mount *mp = tp->t_mountp;
@@ -918,6 +919,18 @@  xfs_ialloc(
 		ASSERT(0);
 	}
 
+	/*
+	 * If we need to create attributes immediately after allocating the
+	 * inode, initialise an empty attribute fork right now. We use the
+	 * default fork offset for attributes here as we don't know exactly what
+	 * size or how many attributes we might be adding. We can do this safely
+	 * here because we know the data fork is completely empty right now.
+	 */
+	if (init_attrs) {
+		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+		ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+	}
+
 	/*
 	 * Log the new values stuffed into the inode.
 	 */
@@ -951,6 +964,7 @@  xfs_dir_ialloc(
 	xfs_nlink_t	nlink,
 	dev_t		rdev,
 	prid_t		prid,		/* project id */
+	bool		init_attrs,
 	xfs_inode_t	**ipp)		/* pointer to inode; it will be
 					   locked. */
 {
@@ -980,7 +994,7 @@  xfs_dir_ialloc(
 	 * the inode(s) that we've just allocated.
 	 */
 	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
-			&ip);
+			init_attrs, &ip);
 
 	/*
 	 * Return an error if we were unable to allocate a new inode.
@@ -1050,7 +1064,7 @@  xfs_dir_ialloc(
 		 * this call should always succeed.
 		 */
 		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
-				  &ialloc_context, &ip);
+				  &ialloc_context, init_attrs, &ip);
 
 		/*
 		 * If we get an error at this point, return to the caller
@@ -1112,6 +1126,7 @@  xfs_create(
 	struct xfs_name		*name,
 	umode_t			mode,
 	dev_t			rdev,
+	bool			init_attrs,
 	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
@@ -1182,7 +1197,8 @@  xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid,
+				init_attrs, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1304,7 +1320,7 @@  xfs_create_tmpfile(
 	if (error)
 		goto out_trans_cancel;
 
-	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, false, &ip);
 	if (error)
 		goto out_trans_cancel;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 751a3d1d7d84..670dfab334de 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -370,7 +370,8 @@  void		xfs_inactive(struct xfs_inode *ip);
 int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode **ipp, struct xfs_name *ci_name);
 int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
-			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
+			   umode_t mode, dev_t rdev, bool need_xattr,
+			   struct xfs_inode **ipp);
 int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
 			   struct xfs_inode **ipp);
 int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
@@ -408,7 +409,7 @@  xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
 xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
 
 int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
-			       xfs_nlink_t, dev_t, prid_t,
+			       xfs_nlink_t, dev_t, prid_t, bool need_xattr,
 			       struct xfs_inode **);
 
 static inline int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1414ab79eacf..75b44b82ad1f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -126,6 +126,7 @@  xfs_cleanup_inode(
 	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
 }
 
+
 STATIC int
 xfs_generic_create(
 	struct inode	*dir,
@@ -161,7 +162,14 @@  xfs_generic_create(
 		goto out_free_acl;
 
 	if (!tmpfile) {
-		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+		bool need_xattr = false;
+
+		if ((IS_ENABLED(CONFIG_SECURITY) && dir->i_sb->s_security) ||
+		    default_acl || acl)
+			need_xattr = true;
+
+		error = xfs_create(XFS_I(dir), &name, mode, rdev,
+					need_xattr, &ip);
 	} else {
 		error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
 	}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b2a9abee8b2b..45faddfb4234 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -787,7 +787,7 @@  xfs_qm_qino_alloc(
 		return error;
 
 	if (need_alloc) {
-		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip);
+		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, false, ip);
 		if (error) {
 			xfs_trans_cancel(tp);
 			return error;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 8e88a7ca387e..20919aaea981 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -224,7 +224,7 @@  xfs_symlink(
 	 * Allocate an inode for the symlink.
 	 */
 	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
-			       prid, &ip);
+			       prid, false, &ip);
 	if (error)
 		goto out_trans_cancel;