[RFC,V2,08/12] fs/xfs: Add lock/unlock mode to xfs
diff mbox series

Message ID 20200110192942.25021-9-ira.weiny@intel.com
State New
Headers show
Series
  • Enable per-file/directory DAX operations V2
Related show

Commit Message

Ira Weiny Jan. 10, 2020, 7:29 p.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

XFS requires regular files to be locked while changing to/from DAX mode.

Define a new DAX lock type and implement the [un]lock_mode() inode
operation callbacks.

We define a new XFS_DAX_* lock type to carry the lock through the
transaction because we don't want to use IOLOCK as that would cause
performance issues with locking of the inode itself.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 fs/xfs/xfs_icache.c |  2 ++
 fs/xfs/xfs_inode.c  | 37 +++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_inode.h  | 12 ++++++++++--
 fs/xfs/xfs_iops.c   | 24 +++++++++++++++++++++++-
 4 files changed, 70 insertions(+), 5 deletions(-)

Comments

Darrick J. Wong Jan. 13, 2020, 10:19 p.m. UTC | #1
On Fri, Jan 10, 2020 at 11:29:38AM -0800, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> XFS requires regular files to be locked while changing to/from DAX mode.
> 
> Define a new DAX lock type and implement the [un]lock_mode() inode
> operation callbacks.
> 
> We define a new XFS_DAX_* lock type to carry the lock through the
> transaction because we don't want to use IOLOCK as that would cause
> performance issues with locking of the inode itself.
> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  fs/xfs/xfs_icache.c |  2 ++
>  fs/xfs/xfs_inode.c  | 37 +++++++++++++++++++++++++++++++++++--
>  fs/xfs/xfs_inode.h  | 12 ++++++++++--
>  fs/xfs/xfs_iops.c   | 24 +++++++++++++++++++++++-
>  4 files changed, 70 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 8dc2e5414276..0288672e8902 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -74,6 +74,8 @@ xfs_inode_alloc(
>  	INIT_LIST_HEAD(&ip->i_ioend_list);
>  	spin_lock_init(&ip->i_ioend_lock);
>  
> +	percpu_init_rwsem(&ip->i_dax_sem);
> +
>  	return ip;
>  }
>  
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 401da197f012..e8fd95b75e5b 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -142,12 +142,12 @@ xfs_ilock_attr_map_shared(
>   *
>   * Basic locking order:
>   *
> - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
> + * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock

Mmmmmm, more locks.  Can we skip the extra lock if CONFIG_FSDAX=n or if
the filesystem devices don't support DAX at all?

Also, I don't think we're actually following the i_rwsem -> i_daxsem
order in fallocate, and possibly elsewhere too?

Does the vfs have to take the i_dax_sem to do remapping things like
reflink?  (Pretend that reflink and dax are compatible for the moment)

>   * mmap_sem locking order:
>   *
>   * i_rwsem -> page lock -> mmap_sem
> - * mmap_sem -> i_mmap_lock -> page_lock
> + * mmap_sem -> i_dax_sem -> i_mmap_lock -> page_lock
>   *
>   * The difference in mmap_sem locking order mean that we cannot hold the
>   * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
> @@ -181,6 +181,13 @@ xfs_ilock(
>  	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
>  	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
>  	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
> +	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) !=
> +	       (XFS_DAX_SHARED | XFS_DAX_EXCL));
> +
> +	if (lock_flags & XFS_DAX_EXCL)
> +		percpu_down_write(&ip->i_dax_sem);
> +	else if (lock_flags & XFS_DAX_SHARED)
> +		percpu_down_read(&ip->i_dax_sem);
>  
>  	if (lock_flags & XFS_IOLOCK_EXCL) {
>  		down_write_nested(&VFS_I(ip)->i_rwsem,
> @@ -224,6 +231,8 @@ xfs_ilock_nowait(
>  	 * You can't set both SHARED and EXCL for the same lock,
>  	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
>  	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
> +	 *
> +	 * XFS_DAX_* is not allowed
>  	 */
>  	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
>  	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
> @@ -232,6 +241,7 @@ xfs_ilock_nowait(
>  	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
>  	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
>  	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
> +	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
>  
>  	if (lock_flags & XFS_IOLOCK_EXCL) {
>  		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
> @@ -302,6 +312,8 @@ xfs_iunlock(
>  	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
>  	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
>  	ASSERT(lock_flags != 0);
> +	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) !=
> +	       (XFS_DAX_SHARED | XFS_DAX_EXCL));
>  
>  	if (lock_flags & XFS_IOLOCK_EXCL)
>  		up_write(&VFS_I(ip)->i_rwsem);
> @@ -318,6 +330,11 @@ xfs_iunlock(
>  	else if (lock_flags & XFS_ILOCK_SHARED)
>  		mrunlock_shared(&ip->i_lock);
>  
> +	if (lock_flags & XFS_DAX_EXCL)
> +		percpu_up_write(&ip->i_dax_sem);
> +	else if (lock_flags & XFS_DAX_SHARED)
> +		percpu_up_read(&ip->i_dax_sem);
> +
>  	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
>  }
>  
> @@ -333,6 +350,8 @@ xfs_ilock_demote(
>  	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
>  	ASSERT((lock_flags &
>  		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
> +	/* XFS_DAX_* is not allowed */
> +	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
>  		mrdemote(&ip->i_lock);
> @@ -369,6 +388,13 @@ xfs_isilocked(
>  		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
>  	}
>  
> +	if (lock_flags & (XFS_DAX_EXCL|XFS_DAX_SHARED)) {
> +		if (!(lock_flags & XFS_DAX_SHARED))
> +			return !debug_locks ||
> +				percpu_rwsem_is_held(&ip->i_dax_sem, 0);
> +		return rwsem_is_locked(&ip->i_dax_sem);
> +	}
> +
>  	ASSERT(0);
>  	return 0;
>  }
> @@ -465,6 +491,9 @@ xfs_lock_inodes(
>  	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
>  		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
>  
> +	/* XFS_DAX_* is not allowed */
> +	ASSERT((lock_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
> +
>  	if (lock_mode & XFS_IOLOCK_EXCL) {
>  		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
>  	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
> @@ -566,6 +595,10 @@ xfs_lock_two_inodes(
>  	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
>  	       !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
>  
> +	/* XFS_DAX_* is not allowed */
> +	ASSERT((ip0_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
> +	ASSERT((ip1_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
> +
>  	ASSERT(ip0->i_ino != ip1->i_ino);
>  
>  	if (ip0->i_ino > ip1->i_ino) {
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 492e53992fa9..693ca66bd89b 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -67,6 +67,9 @@ typedef struct xfs_inode {
>  	spinlock_t		i_ioend_lock;
>  	struct work_struct	i_ioend_work;
>  	struct list_head	i_ioend_list;
> +
> +	/* protect changing the mode to/from DAX */
> +	struct percpu_rw_semaphore i_dax_sem;
>  } xfs_inode_t;
>  
>  /* Convert from vfs inode to xfs inode */
> @@ -278,10 +281,13 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
>  #define	XFS_ILOCK_SHARED	(1<<3)
>  #define	XFS_MMAPLOCK_EXCL	(1<<4)
>  #define	XFS_MMAPLOCK_SHARED	(1<<5)
> +#define	XFS_DAX_EXCL		(1<<6)
> +#define	XFS_DAX_SHARED		(1<<7)
>  
>  #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
>  				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
> -				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
> +				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED \
> +				| XFS_DAX_EXCL | XFS_DAX_SHARED)
>  
>  #define XFS_LOCK_FLAGS \
>  	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
> @@ -289,7 +295,9 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
>  	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
>  	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
>  	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
> -	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
> +	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }, \
> +	{ XFS_DAX_EXCL,   	"DAX_EXCL" }, \

Whitespace between the comma & string.

> +	{ XFS_DAX_SHARED,	"DAX_SHARED" }
>  
>  
>  /*
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index d6843cdb51d0..a2f2604c3187 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1158,6 +1158,16 @@ xfs_vn_tmpfile(
>  	return xfs_generic_create(dir, dentry, mode, 0, true);
>  }
>  
> +static void xfs_lock_mode(struct inode *inode)
> +{
> +	xfs_ilock(XFS_I(inode), XFS_DAX_SHARED);
> +}
> +
> +static void xfs_unlock_mode(struct inode *inode)
> +{
> +	xfs_iunlock(XFS_I(inode), XFS_DAX_SHARED);
> +}
> +
>  static const struct inode_operations xfs_inode_operations = {
>  	.get_acl		= xfs_get_acl,
>  	.set_acl		= xfs_set_acl,
> @@ -1168,6 +1178,18 @@ static const struct inode_operations xfs_inode_operations = {
>  	.update_time		= xfs_vn_update_time,
>  };
>  
> +static const struct inode_operations xfs_reg_inode_operations = {
> +	.get_acl		= xfs_get_acl,
> +	.set_acl		= xfs_set_acl,
> +	.getattr		= xfs_vn_getattr,
> +	.setattr		= xfs_vn_setattr,
> +	.listxattr		= xfs_vn_listxattr,
> +	.fiemap			= xfs_vn_fiemap,
> +	.update_time		= xfs_vn_update_time,
> +	.lock_mode              = xfs_lock_mode,
> +	.unlock_mode            = xfs_unlock_mode,
> +};
> +
>  static const struct inode_operations xfs_dir_inode_operations = {
>  	.create			= xfs_vn_create,
>  	.lookup			= xfs_vn_lookup,
> @@ -1372,7 +1394,7 @@ xfs_setup_iops(
>  
>  	switch (inode->i_mode & S_IFMT) {
>  	case S_IFREG:
> -		inode->i_op = &xfs_inode_operations;
> +		inode->i_op = &xfs_reg_inode_operations;

xfs_file_inode_operations?

--D

>  		inode->i_fop = &xfs_file_operations;
>  		if (IS_DAX(inode))
>  			inode->i_mapping->a_ops = &xfs_dax_aops;
> -- 
> 2.21.0
>
Ira Weiny Jan. 14, 2020, 12:35 a.m. UTC | #2
On Mon, Jan 13, 2020 at 02:19:57PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 10, 2020 at 11:29:38AM -0800, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>

[snip]

> >  
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 401da197f012..e8fd95b75e5b 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -142,12 +142,12 @@ xfs_ilock_attr_map_shared(
> >   *
> >   * Basic locking order:
> >   *
> > - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
> > + * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock
> 
> Mmmmmm, more locks.  Can we skip the extra lock if CONFIG_FSDAX=n or if
> the filesystem devices don't support DAX at all?

I'll look into it.

> 
> Also, I don't think we're actually following the i_rwsem -> i_daxsem
> order in fallocate, and possibly elsewhere too?

I'll have to verify.  It took a lot of iterations to get the order working so
I'm not going to claim perfection.

> 
> Does the vfs have to take the i_dax_sem to do remapping things like
> reflink?  (Pretend that reflink and dax are compatible for the moment)

Honestly I can't say for sure.  For this series I was careful to exclude
reflink from the locking requirement.

[snip]

> >  
> >  #define XFS_LOCK_FLAGS \
> >  	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
> > @@ -289,7 +295,9 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
> >  	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
> >  	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
> >  	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
> > -	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
> > +	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }, \
> > +	{ XFS_DAX_EXCL,   	"DAX_EXCL" }, \
> 
> Whitespace between the comma & string.

Fixed.

[snip]

> > +
> >  static const struct inode_operations xfs_dir_inode_operations = {
> >  	.create			= xfs_vn_create,
> >  	.lookup			= xfs_vn_lookup,
> > @@ -1372,7 +1394,7 @@ xfs_setup_iops(
> >  
> >  	switch (inode->i_mode & S_IFMT) {
> >  	case S_IFREG:
> > -		inode->i_op = &xfs_inode_operations;
> > +		inode->i_op = &xfs_reg_inode_operations;
> 
> xfs_file_inode_operations?

Sounds better.  Fixed.

Ira
Ira Weiny Jan. 15, 2020, 12:57 a.m. UTC | #3
On Mon, Jan 13, 2020 at 04:35:21PM -0800, 'Ira Weiny' wrote:
> On Mon, Jan 13, 2020 at 02:19:57PM -0800, Darrick J. Wong wrote:
> > On Fri, Jan 10, 2020 at 11:29:38AM -0800, ira.weiny@intel.com wrote:
> > > From: Ira Weiny <ira.weiny@intel.com>
> 
> [snip]
> 
> > >  
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 401da197f012..e8fd95b75e5b 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -142,12 +142,12 @@ xfs_ilock_attr_map_shared(
> > >   *
> > >   * Basic locking order:
> > >   *
> > > - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
> > > + * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock
> > 
> > Mmmmmm, more locks.  Can we skip the extra lock if CONFIG_FSDAX=n or if
> > the filesystem devices don't support DAX at all?
> 
> I'll look into it.
> 
> > 
> > Also, I don't think we're actually following the i_rwsem -> i_daxsem
> > order in fallocate, and possibly elsewhere too?
> 
> I'll have to verify.  It took a lot of iterations to get the order working so
> I'm not going to claim perfection.

Yes this was inconsistent.  The code was right WRT i_rwsem.

mmap_sem may have issues:

What about this?

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c5d11b70d067..8808782a085e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,12 +142,12 @@ xfs_ilock_attr_map_shared(
  *
  * Basic locking order:
  *
- * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_dax_sem -> i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
  *
  * mmap_sem locking order:
  *
  * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_dax_sem -> i_mmap_lock -> page_lock
+ * i_dax_sem -> mmap_sem -> i_mmap_lock -> page_lock
  *
  * The difference in mmap_sem locking order mean that we cannot hold the
  * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
diff --git a/mm/mmap.c b/mm/mmap.c
index e6b68924b7ca..b500aef30b27 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1547,18 +1547,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                        vm_flags |= VM_NORESERVE;
        }
 
-       if (file)
-               lock_inode_mode(file_inode(file));
-
        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
 
-       if (file)
-               unlock_inode_mode(file_inode(file));
-
        return addr;
 }
 
diff --git a/mm/util.c b/mm/util.c
index 988d11e6c17c..1cfead8cd1ce 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -501,11 +501,18 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
-               if (down_write_killable(&mm->mmap_sem))
+               if (file)
+                       lock_inode_mode(file_inode(file));
+               if (down_write_killable(&mm->mmap_sem)) {
+                       if (file)
+                               unlock_inode_mode(file_inode(file));
                        return -EINTR;
+               }
                ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
                                    &populate, &uf);
                up_write(&mm->mmap_sem);
+               if (file)
+                       unlock_inode_mode(file_inode(file));
                userfaultfd_unmap_complete(mm, &uf);
                if (populate)
                        mm_populate(ret, populate);
Ira Weiny Jan. 15, 2020, 11:52 p.m. UTC | #4
On Mon, Jan 13, 2020 at 02:19:57PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 10, 2020 at 11:29:38AM -0800, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> > 

[snip]

> >  
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 401da197f012..e8fd95b75e5b 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -142,12 +142,12 @@ xfs_ilock_attr_map_shared(
> >   *
> >   * Basic locking order:
> >   *
> > - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
> > + * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock
> 
> Mmmmmm, more locks.  Can we skip the extra lock if CONFIG_FSDAX=n or if
> the filesystem devices don't support DAX at all?

I've looked into this a bit more and I think skipping on CONFIG_FSDAX=n is ok
but doing so on individual devices is not going to be possible because we don't
have that information at the vfs layer.

I'll continue to think about how to mitigate this more while I add CONFIG_FSDAX
checks before rolling out a new patch set.

> 
> Also, I don't think we're actually following the i_rwsem -> i_daxsem
> order in fallocate, and possibly elsewhere too?
> 
> Does the vfs have to take the i_dax_sem to do remapping things like
> reflink?  (Pretend that reflink and dax are compatible for the moment)
> 

I spoke with Dan today about this and we believe the answer is going to be yes.
However, not until the code is added to support DAX in reflink.  Because
currently this is only required for places which use "IS_DAX()" to see the
state of the inode.

Currently the vfs layer does not have any IS_DAX() checks in the reflink path.
But when they are added they will be required to take this sem.

Ira
Jan Kara Jan. 16, 2020, 9:24 a.m. UTC | #5
On Fri 10-01-20 11:29:38, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> XFS requires regular files to be locked while changing to/from DAX mode.
> 
> Define a new DAX lock type and implement the [un]lock_mode() inode
> operation callbacks.
> 
> We define a new XFS_DAX_* lock type to carry the lock through the
> transaction because we don't want to use IOLOCK as that would cause
> performance issues with locking of the inode itself.
> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
...
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 492e53992fa9..693ca66bd89b 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -67,6 +67,9 @@ typedef struct xfs_inode {
>  	spinlock_t		i_ioend_lock;
>  	struct work_struct	i_ioend_work;
>  	struct list_head	i_ioend_list;
> +
> +	/* protect changing the mode to/from DAX */
> +	struct percpu_rw_semaphore i_dax_sem;
>  } xfs_inode_t;

This adds overhead of ~32k per inode for typical distro kernel. That's not
going to fly. That's why ext4 has similar kind of lock in the superblock
shared by all inodes. For read side it does not matter because that's
per-cpu and shared lock. For write side we don't care as changing inode
access mode should be rare.

								Honza
Ira Weiny Jan. 16, 2020, 7:12 p.m. UTC | #6
On Thu, Jan 16, 2020 at 10:24:46AM +0100, Jan Kara wrote:
> On Fri 10-01-20 11:29:38, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > XFS requires regular files to be locked while changing to/from DAX mode.
> > 
> > Define a new DAX lock type and implement the [un]lock_mode() inode
> > operation callbacks.
> > 
> > We define a new XFS_DAX_* lock type to carry the lock through the
> > transaction because we don't want to use IOLOCK as that would cause
> > performance issues with locking of the inode itself.
> > 
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ...
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index 492e53992fa9..693ca66bd89b 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -67,6 +67,9 @@ typedef struct xfs_inode {
> >  	spinlock_t		i_ioend_lock;
> >  	struct work_struct	i_ioend_work;
> >  	struct list_head	i_ioend_list;
> > +
> > +	/* protect changing the mode to/from DAX */
> > +	struct percpu_rw_semaphore i_dax_sem;
> >  } xfs_inode_t;
> 
> This adds overhead of ~32k per inode for typical distro kernel.

Wow!

> That's not
> going to fly.

Probably not...

> That's why ext4 has similar kind of lock in the superblock
> shared by all inodes. For read side it does not matter because that's
> per-cpu and shared lock. For write side we don't care as changing inode
> access mode should be rare.

Sounds reasonable to me.  I'll convert it.

Thanks for pointing this out, that would have been bad indeed.
Ira

Patch
diff mbox series

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 8dc2e5414276..0288672e8902 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -74,6 +74,8 @@  xfs_inode_alloc(
 	INIT_LIST_HEAD(&ip->i_ioend_list);
 	spin_lock_init(&ip->i_ioend_lock);
 
+	percpu_init_rwsem(&ip->i_dax_sem);
+
 	return ip;
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 401da197f012..e8fd95b75e5b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,12 +142,12 @@  xfs_ilock_attr_map_shared(
  *
  * Basic locking order:
  *
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> i_dax_sem -> i_mmap_lock -> page_lock -> i_ilock
  *
  * mmap_sem locking order:
  *
  * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * mmap_sem -> i_dax_sem -> i_mmap_lock -> page_lock
  *
  * The difference in mmap_sem locking order mean that we cannot hold the
  * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
@@ -181,6 +181,13 @@  xfs_ilock(
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) !=
+	       (XFS_DAX_SHARED | XFS_DAX_EXCL));
+
+	if (lock_flags & XFS_DAX_EXCL)
+		percpu_down_write(&ip->i_dax_sem);
+	else if (lock_flags & XFS_DAX_SHARED)
+		percpu_down_read(&ip->i_dax_sem);
 
 	if (lock_flags & XFS_IOLOCK_EXCL) {
 		down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -224,6 +231,8 @@  xfs_ilock_nowait(
 	 * You can't set both SHARED and EXCL for the same lock,
 	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 *
+	 * XFS_DAX_* is not allowed
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
@@ -232,6 +241,7 @@  xfs_ilock_nowait(
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL) {
 		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -302,6 +312,8 @@  xfs_iunlock(
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 	ASSERT(lock_flags != 0);
+	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) !=
+	       (XFS_DAX_SHARED | XFS_DAX_EXCL));
 
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		up_write(&VFS_I(ip)->i_rwsem);
@@ -318,6 +330,11 @@  xfs_iunlock(
 	else if (lock_flags & XFS_ILOCK_SHARED)
 		mrunlock_shared(&ip->i_lock);
 
+	if (lock_flags & XFS_DAX_EXCL)
+		percpu_up_write(&ip->i_dax_sem);
+	else if (lock_flags & XFS_DAX_SHARED)
+		percpu_up_read(&ip->i_dax_sem);
+
 	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
 
@@ -333,6 +350,8 @@  xfs_ilock_demote(
 	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
 	ASSERT((lock_flags &
 		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+	/* XFS_DAX_* is not allowed */
+	ASSERT((lock_flags & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrdemote(&ip->i_lock);
@@ -369,6 +388,13 @@  xfs_isilocked(
 		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
 	}
 
+	if (lock_flags & (XFS_DAX_EXCL|XFS_DAX_SHARED)) {
+		if (!(lock_flags & XFS_DAX_SHARED))
+			return !debug_locks ||
+				percpu_rwsem_is_held(&ip->i_dax_sem, 0);
+		return rwsem_is_locked(&ip->i_dax_sem);
+	}
+
 	ASSERT(0);
 	return 0;
 }
@@ -465,6 +491,9 @@  xfs_lock_inodes(
 	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
 		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
 
+	/* XFS_DAX_* is not allowed */
+	ASSERT((lock_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
+
 	if (lock_mode & XFS_IOLOCK_EXCL) {
 		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
 	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
@@ -566,6 +595,10 @@  xfs_lock_two_inodes(
 	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
 	       !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
 
+	/* XFS_DAX_* is not allowed */
+	ASSERT((ip0_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
+	ASSERT((ip1_mode & (XFS_DAX_SHARED | XFS_DAX_EXCL)) == 0);
+
 	ASSERT(ip0->i_ino != ip1->i_ino);
 
 	if (ip0->i_ino > ip1->i_ino) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..693ca66bd89b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,6 +67,9 @@  typedef struct xfs_inode {
 	spinlock_t		i_ioend_lock;
 	struct work_struct	i_ioend_work;
 	struct list_head	i_ioend_list;
+
+	/* protect changing the mode to/from DAX */
+	struct percpu_rw_semaphore i_dax_sem;
 } xfs_inode_t;
 
 /* Convert from vfs inode to xfs inode */
@@ -278,10 +281,13 @@  static inline void xfs_ifunlock(struct xfs_inode *ip)
 #define	XFS_ILOCK_SHARED	(1<<3)
 #define	XFS_MMAPLOCK_EXCL	(1<<4)
 #define	XFS_MMAPLOCK_SHARED	(1<<5)
+#define	XFS_DAX_EXCL		(1<<6)
+#define	XFS_DAX_SHARED		(1<<7)
 
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
 				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
-				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
+				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED \
+				| XFS_DAX_EXCL | XFS_DAX_SHARED)
 
 #define XFS_LOCK_FLAGS \
 	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
@@ -289,7 +295,9 @@  static inline void xfs_ifunlock(struct xfs_inode *ip)
 	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
 	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
 	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
-	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
+	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }, \
+	{ XFS_DAX_EXCL,   	"DAX_EXCL" }, \
+	{ XFS_DAX_SHARED,	"DAX_SHARED" }
 
 
 /*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d6843cdb51d0..a2f2604c3187 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1158,6 +1158,16 @@  xfs_vn_tmpfile(
 	return xfs_generic_create(dir, dentry, mode, 0, true);
 }
 
+static void xfs_lock_mode(struct inode *inode)
+{
+	xfs_ilock(XFS_I(inode), XFS_DAX_SHARED);
+}
+
+static void xfs_unlock_mode(struct inode *inode)
+{
+	xfs_iunlock(XFS_I(inode), XFS_DAX_SHARED);
+}
+
 static const struct inode_operations xfs_inode_operations = {
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
@@ -1168,6 +1178,18 @@  static const struct inode_operations xfs_inode_operations = {
 	.update_time		= xfs_vn_update_time,
 };
 
+static const struct inode_operations xfs_reg_inode_operations = {
+	.get_acl		= xfs_get_acl,
+	.set_acl		= xfs_set_acl,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.listxattr		= xfs_vn_listxattr,
+	.fiemap			= xfs_vn_fiemap,
+	.update_time		= xfs_vn_update_time,
+	.lock_mode              = xfs_lock_mode,
+	.unlock_mode            = xfs_unlock_mode,
+};
+
 static const struct inode_operations xfs_dir_inode_operations = {
 	.create			= xfs_vn_create,
 	.lookup			= xfs_vn_lookup,
@@ -1372,7 +1394,7 @@  xfs_setup_iops(
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
-		inode->i_op = &xfs_inode_operations;
+		inode->i_op = &xfs_reg_inode_operations;
 		inode->i_fop = &xfs_file_operations;
 		if (IS_DAX(inode))
 			inode->i_mapping->a_ops = &xfs_dax_aops;