diff mbox

[v2,2/2] ext4: handle layout changes to pinned DAX mappings

Message ID 20180627212252.31032-3-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ross Zwisler June 27, 2018, 9:22 p.m. UTC
Follow the lead of xfs_break_dax_layouts() and add synchronization between
operations in ext4 which remove blocks from an inode (hole punch, truncate
down, etc.) and pages which are pinned due to DAX DMA operations.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h     |  1 +
 fs/ext4/extents.c  | 12 ++++++++++++
 fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/truncate.h |  4 ++++
 4 files changed, 63 insertions(+)

Comments

Lukas Czerner June 29, 2018, 12:02 p.m. UTC | #1
On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> Follow the lead of xfs_break_dax_layouts() and add synchronization between
> operations in ext4 which remove blocks from an inode (hole punch, truncate
> down, etc.) and pages which are pinned due to DAX DMA operations.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/ext4.h     |  1 +
>  fs/ext4/extents.c  | 12 ++++++++++++
>  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/truncate.h |  4 ++++
>  4 files changed, 63 insertions(+)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0b127853c584..34bccd64d83d 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
>  extern int ext4_inode_attach_jinode(struct inode *inode);
>  extern int ext4_can_truncate(struct inode *inode);
>  extern int ext4_truncate(struct inode *);
> +extern int ext4_break_layouts(struct inode *);
>  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
>  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
>  extern void ext4_set_inode_flags(struct inode *);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 0057fe3f248d..a6aef06f455b 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  		 * released from page cache.
>  		 */
>  		down_write(&EXT4_I(inode)->i_mmap_sem);
> +
> +		ret = ext4_break_layouts(inode);
> +		if (ret) {
> +			up_write(&EXT4_I(inode)->i_mmap_sem);
> +			goto out_mutex;
> +		}
> +
>  		ret = ext4_update_disksize_before_punch(inode, offset, len);
>  		if (ret) {
>  			up_write(&EXT4_I(inode)->i_mmap_sem);
> @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>  	 * page cache.
>  	 */
>  	down_write(&EXT4_I(inode)->i_mmap_sem);
> +
> +	ret = ext4_break_layouts(inode);
> +	if (ret)
> +		goto out_mmap;

Hi,

don't we need to do the same for ext4_insert_range() since we're about
to truncate_pagecache() as well ?

/thinking out loud/
Xfs seems to do this before every fallocate operation, but in ext4
it does not seem to be needed at least for simply allocating falocate...

Thanks!
-Lukas

> +
>  	/*
>  	 * Need to round down offset to be aligned with page size boundary
>  	 * for page size > block size.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 2ea07efbe016..fadb8ecacb1e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4193,6 +4193,39 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>  	return 0;
>  }
>  
> +static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
> +{
> +	*did_unlock = true;
> +	up_write(&ei->i_mmap_sem);
> +	schedule();
> +	down_write(&ei->i_mmap_sem);
> +}
> +
> +int ext4_break_layouts(struct inode *inode)
> +{
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	struct page *page;
> +	bool retry;
> +	int error;
> +
> +	if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
> +		return -EINVAL;
> +
> +	do {
> +		retry = false;
> +		page = dax_layout_busy_page(inode->i_mapping);
> +		if (!page)
> +			return 0;
> +
> +		error = ___wait_var_event(&page->_refcount,
> +				atomic_read(&page->_refcount) == 1,
> +				TASK_INTERRUPTIBLE, 0, 0,
> +				ext4_wait_dax_page(ei, &retry));
> +	} while (error == 0 && retry);
> +
> +	return error;
> +}
> +
>  /*
>   * ext4_punch_hole: punches a hole in a file by releasing the blocks
>   * associated with the given offset and length
> @@ -4266,6 +4299,11 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
>  	 * page cache.
>  	 */
>  	down_write(&EXT4_I(inode)->i_mmap_sem);
> +
> +	ret = ext4_break_layouts(inode);
> +	if (ret)
> +		goto out_dio;
> +
>  	first_block_offset = round_up(offset, sb->s_blocksize);
>  	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
>  
> @@ -5554,6 +5592,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  				ext4_wait_for_tail_page_commit(inode);
>  		}
>  		down_write(&EXT4_I(inode)->i_mmap_sem);
> +
> +		rc = ext4_break_layouts(inode);
> +		if (rc) {
> +			up_write(&EXT4_I(inode)->i_mmap_sem);
> +			error = rc;
> +			goto err_out;
> +		}
> +
>  		/*
>  		 * Truncate pagecache after we've waited for commit
>  		 * in data=journal mode to make pages freeable.
> diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
> index 0cb13badf473..bcbe3668c1d4 100644
> --- a/fs/ext4/truncate.h
> +++ b/fs/ext4/truncate.h
> @@ -11,6 +11,10 @@
>   */
>  static inline void ext4_truncate_failed_write(struct inode *inode)
>  {
> +	/*
> +	 * We don't need to call ext4_break_layouts() because the blocks we
> +	 * are truncating were never visible to userspace.
> +	 */
>  	down_write(&EXT4_I(inode)->i_mmap_sem);
>  	truncate_inode_pages(inode->i_mapping, inode->i_size);
>  	ext4_truncate(inode);
> -- 
> 2.14.4
>
Ross Zwisler June 29, 2018, 3:13 p.m. UTC | #2
On Fri, Jun 29, 2018 at 02:02:23PM +0200, Lukas Czerner wrote:
> On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> > Follow the lead of xfs_break_dax_layouts() and add synchronization between
> > operations in ext4 which remove blocks from an inode (hole punch, truncate
> > down, etc.) and pages which are pinned due to DAX DMA operations.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/ext4/ext4.h     |  1 +
> >  fs/ext4/extents.c  | 12 ++++++++++++
> >  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ext4/truncate.h |  4 ++++
> >  4 files changed, 63 insertions(+)
> > 
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 0b127853c584..34bccd64d83d 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> >  extern int ext4_inode_attach_jinode(struct inode *inode);
> >  extern int ext4_can_truncate(struct inode *inode);
> >  extern int ext4_truncate(struct inode *);
> > +extern int ext4_break_layouts(struct inode *);
> >  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
> >  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
> >  extern void ext4_set_inode_flags(struct inode *);
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 0057fe3f248d..a6aef06f455b 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> >  		 * released from page cache.
> >  		 */
> >  		down_write(&EXT4_I(inode)->i_mmap_sem);
> > +
> > +		ret = ext4_break_layouts(inode);
> > +		if (ret) {
> > +			up_write(&EXT4_I(inode)->i_mmap_sem);
> > +			goto out_mutex;
> > +		}
> > +
> >  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> >  		if (ret) {
> >  			up_write(&EXT4_I(inode)->i_mmap_sem);
> > @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> >  	 * page cache.
> >  	 */
> >  	down_write(&EXT4_I(inode)->i_mmap_sem);
> > +
> > +	ret = ext4_break_layouts(inode);
> > +	if (ret)
> > +		goto out_mmap;
> 
> Hi,
> 
> don't we need to do the same for ext4_insert_range() since we're about
> to truncate_pagecache() as well ?
> 
> /thinking out loud/
> Xfs seems to do this before every fallocate operation, but in ext4
> it does not seem to be needed at least for simply allocating falocate...

I saw the case in ext4_insert_range(), and decided that we didn't need to
worry about synchronizing with DAX because no blocks were being removed from
the inode's extent map.  IIUC the truncate_pagecache() call is needed because
we are unmapping and removing any page cache mappings for the part of the file
after the insert because those blocks are now at a different offset in the
inode.  Because at the end of the operation we haven't removed any DAX pages
from the inode, we have nothing that we need to synchronize.

Hmm, unless this is a failure case we care about fixing?
 1) schedule I/O via O_DIRECT to page X
 2) fallocate(FALLOC_FL_INSERT_RANGE) to block < X, shifting X to a larger
    offset
 3) O_DIRECT I/O from 1) completes, but ends up writing into the *new* block
    that resides at X - the I/O from 1) completes

In this case the user is running I/O and issuing the fallocate at the same
time, and the sequencing could have worked out that #1 and #2 were reversed,
giving you the same behavior.  IMO this seems fine and that we shouldn't have
the DAX synchronization call in ext4_insert_range(), but I'm happy to add it
if I'm wrong.
Dave Chinner June 30, 2018, 1:12 a.m. UTC | #3
On Fri, Jun 29, 2018 at 02:02:23PM +0200, Lukas Czerner wrote:
> On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> > Follow the lead of xfs_break_dax_layouts() and add synchronization between
> > operations in ext4 which remove blocks from an inode (hole punch, truncate
> > down, etc.) and pages which are pinned due to DAX DMA operations.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/ext4/ext4.h     |  1 +
> >  fs/ext4/extents.c  | 12 ++++++++++++
> >  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ext4/truncate.h |  4 ++++
> >  4 files changed, 63 insertions(+)
> > 
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 0b127853c584..34bccd64d83d 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> >  extern int ext4_inode_attach_jinode(struct inode *inode);
> >  extern int ext4_can_truncate(struct inode *inode);
> >  extern int ext4_truncate(struct inode *);
> > +extern int ext4_break_layouts(struct inode *);
> >  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
> >  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
> >  extern void ext4_set_inode_flags(struct inode *);
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index 0057fe3f248d..a6aef06f455b 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> >  		 * released from page cache.
> >  		 */
> >  		down_write(&EXT4_I(inode)->i_mmap_sem);
> > +
> > +		ret = ext4_break_layouts(inode);
> > +		if (ret) {
> > +			up_write(&EXT4_I(inode)->i_mmap_sem);
> > +			goto out_mutex;
> > +		}
> > +
> >  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> >  		if (ret) {
> >  			up_write(&EXT4_I(inode)->i_mmap_sem);
> > @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> >  	 * page cache.
> >  	 */
> >  	down_write(&EXT4_I(inode)->i_mmap_sem);
> > +
> > +	ret = ext4_break_layouts(inode);
> > +	if (ret)
> > +		goto out_mmap;
> 
> Hi,
> 
> don't we need to do the same for ext4_insert_range() since we're about
> to truncate_pagecache() as well ?
> 
> /thinking out loud/
> Xfs seems to do this before every fallocate operation, but in ext4
> it does not seem to be needed at least for simply allocating falocate...

The PNFS client may have mapped a hole over the range we are about
to allocate. Hence it's mapping will be stale, and it needs to remap
the range.

i.e. any change to the extent list needs to break the lease of any
client that may have a cached layout on that inode. i.e. it's not
just extent removal that we are protecting against - we are trying
to ensure clients using leases to access blocks directly remain
coherent with the filesystem's extent map. Hence any potential
change to the extent map requires breaking the lease....

Cheers,

Dave.
Jan Kara July 2, 2018, 7:34 a.m. UTC | #4
On Fri 29-06-18 09:13:00, Ross Zwisler wrote:
> On Fri, Jun 29, 2018 at 02:02:23PM +0200, Lukas Czerner wrote:
> > On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> > > Follow the lead of xfs_break_dax_layouts() and add synchronization between
> > > operations in ext4 which remove blocks from an inode (hole punch, truncate
> > > down, etc.) and pages which are pinned due to DAX DMA operations.
> > > 
> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  fs/ext4/ext4.h     |  1 +
> > >  fs/ext4/extents.c  | 12 ++++++++++++
> > >  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/ext4/truncate.h |  4 ++++
> > >  4 files changed, 63 insertions(+)
> > > 
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index 0b127853c584..34bccd64d83d 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> > >  extern int ext4_inode_attach_jinode(struct inode *inode);
> > >  extern int ext4_can_truncate(struct inode *inode);
> > >  extern int ext4_truncate(struct inode *);
> > > +extern int ext4_break_layouts(struct inode *);
> > >  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
> > >  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
> > >  extern void ext4_set_inode_flags(struct inode *);
> > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > index 0057fe3f248d..a6aef06f455b 100644
> > > --- a/fs/ext4/extents.c
> > > +++ b/fs/ext4/extents.c
> > > @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> > >  		 * released from page cache.
> > >  		 */
> > >  		down_write(&EXT4_I(inode)->i_mmap_sem);
> > > +
> > > +		ret = ext4_break_layouts(inode);
> > > +		if (ret) {
> > > +			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > +			goto out_mutex;
> > > +		}
> > > +
> > >  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> > >  		if (ret) {
> > >  			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> > >  	 * page cache.
> > >  	 */
> > >  	down_write(&EXT4_I(inode)->i_mmap_sem);
> > > +
> > > +	ret = ext4_break_layouts(inode);
> > > +	if (ret)
> > > +		goto out_mmap;
> > 
> > Hi,
> > 
> > don't we need to do the same for ext4_insert_range() since we're about
> > to truncate_pagecache() as well ?
> > 
> > /thinking out loud/
> > Xfs seems to do this before every fallocate operation, but in ext4
> > it does not seem to be needed at least for simply allocating falocate...
> 
> I saw the case in ext4_insert_range(), and decided that we didn't need to
> worry about synchronizing with DAX because no blocks were being removed from
> the inode's extent map.  IIUC the truncate_pagecache() call is needed because
> we are unmapping and removing any page cache mappings for the part of the file
> after the insert because those blocks are now at a different offset in the
> inode.  Because at the end of the operation we haven't removed any DAX pages
> from the inode, we have nothing that we need to synchronize.
> 
> Hmm, unless this is a failure case we care about fixing?
>  1) schedule I/O via O_DIRECT to page X
>  2) fallocate(FALLOC_FL_INSERT_RANGE) to block < X, shifting X to a larger
>     offset
>  3) O_DIRECT I/O from 1) completes, but ends up writing into the *new* block
>     that resides at X - the I/O from 1) completes
> 
> In this case the user is running I/O and issuing the fallocate at the same
> time, and the sequencing could have worked out that #1 and #2 were reversed,
> giving you the same behavior.  IMO this seems fine and that we shouldn't have
> the DAX synchronization call in ext4_insert_range(), but I'm happy to add it
> if I'm wrong.

I also don't see a way how ext4_insert_range() not calling
ext4_break_layouts() could lead to corruption. However the whole operation
with splitting (and possible zeroing out) of extents, truncation of extents
beyond EOF, etc. is complex enough that I'm not sure I've thought through
all the corner cases :-) So it at least deserves a comment why
ext4_break_layouts() is not necessary here (also as a reminder in case we
change the implementation and it would be suddently needed).

								Honza
Lukas Czerner July 2, 2018, 7:59 a.m. UTC | #5
On Fri, Jun 29, 2018 at 09:13:00AM -0600, Ross Zwisler wrote:
> On Fri, Jun 29, 2018 at 02:02:23PM +0200, Lukas Czerner wrote:
> > On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> > > Follow the lead of xfs_break_dax_layouts() and add synchronization between
> > > operations in ext4 which remove blocks from an inode (hole punch, truncate
> > > down, etc.) and pages which are pinned due to DAX DMA operations.
> > > 
> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  fs/ext4/ext4.h     |  1 +
> > >  fs/ext4/extents.c  | 12 ++++++++++++
> > >  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/ext4/truncate.h |  4 ++++
> > >  4 files changed, 63 insertions(+)
> > > 
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index 0b127853c584..34bccd64d83d 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> > >  extern int ext4_inode_attach_jinode(struct inode *inode);
> > >  extern int ext4_can_truncate(struct inode *inode);
> > >  extern int ext4_truncate(struct inode *);
> > > +extern int ext4_break_layouts(struct inode *);
> > >  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
> > >  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
> > >  extern void ext4_set_inode_flags(struct inode *);
> > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > index 0057fe3f248d..a6aef06f455b 100644
> > > --- a/fs/ext4/extents.c
> > > +++ b/fs/ext4/extents.c
> > > @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> > >  		 * released from page cache.
> > >  		 */
> > >  		down_write(&EXT4_I(inode)->i_mmap_sem);
> > > +
> > > +		ret = ext4_break_layouts(inode);
> > > +		if (ret) {
> > > +			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > +			goto out_mutex;
> > > +		}
> > > +
> > >  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> > >  		if (ret) {
> > >  			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> > >  	 * page cache.
> > >  	 */
> > >  	down_write(&EXT4_I(inode)->i_mmap_sem);
> > > +
> > > +	ret = ext4_break_layouts(inode);
> > > +	if (ret)
> > > +		goto out_mmap;
> > 
> > Hi,
> > 
> > don't we need to do the same for ext4_insert_range() since we're about
> > to truncate_pagecache() as well ?
> > 
> > /thinking out loud/
> > Xfs seems to do this before every fallocate operation, but in ext4
> > it does not seem to be needed at least for simply allocating falocate...
> 
> I saw the case in ext4_insert_range(), and decided that we didn't need to
> worry about synchronizing with DAX because no blocks were being removed from
> the inode's extent map.  IIUC the truncate_pagecache() call is needed because
> we are unmapping and removing any page cache mappings for the part of the file
> after the insert because those blocks are now at a different offset in the
> inode.  Because at the end of the operation we haven't removed any DAX pages
> from the inode, we have nothing that we need to synchronize.
> 
> Hmm, unless this is a failure case we care about fixing?
>  1) schedule I/O via O_DIRECT to page X
>  2) fallocate(FALLOC_FL_INSERT_RANGE) to block < X, shifting X to a larger
>     offset
>  3) O_DIRECT I/O from 1) completes, but ends up writing into the *new* block
>     that resides at X - the I/O from 1) completes
> 
> In this case the user is running I/O and issuing the fallocate at the same
> time, and the sequencing could have worked out that #1 and #2 were reversed,
> giving you the same behavior.  IMO this seems fine and that we shouldn't have
> the DAX synchronization call in ext4_insert_range(), but I'm happy to add it
> if I'm wrong.

Hi,

I think you're right, this case might mot matter much. I am just worried
about unforeseen consequences of changing the layout with dax pages
mapped. I guess we can also add this later fi we discover anything.

You can add

Reviewed-by: Lukas Czerner <lczerner@redhat.com>

Thanks!
-Lukas
Ross Zwisler July 2, 2018, 4:27 p.m. UTC | #6
On Mon, Jul 02, 2018 at 09:59:48AM +0200, Lukas Czerner wrote:
> On Fri, Jun 29, 2018 at 09:13:00AM -0600, Ross Zwisler wrote:
> > On Fri, Jun 29, 2018 at 02:02:23PM +0200, Lukas Czerner wrote:
> > > On Wed, Jun 27, 2018 at 03:22:52PM -0600, Ross Zwisler wrote:
> > > > Follow the lead of xfs_break_dax_layouts() and add synchronization between
> > > > operations in ext4 which remove blocks from an inode (hole punch, truncate
> > > > down, etc.) and pages which are pinned due to DAX DMA operations.
> > > > 
> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > ---
> > > >  fs/ext4/ext4.h     |  1 +
> > > >  fs/ext4/extents.c  | 12 ++++++++++++
> > > >  fs/ext4/inode.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/ext4/truncate.h |  4 ++++
> > > >  4 files changed, 63 insertions(+)
> > > > 
> > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > > index 0b127853c584..34bccd64d83d 100644
> > > > --- a/fs/ext4/ext4.h
> > > > +++ b/fs/ext4/ext4.h
> > > > @@ -2460,6 +2460,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> > > >  extern int ext4_inode_attach_jinode(struct inode *inode);
> > > >  extern int ext4_can_truncate(struct inode *inode);
> > > >  extern int ext4_truncate(struct inode *);
> > > > +extern int ext4_break_layouts(struct inode *);
> > > >  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
> > > >  extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
> > > >  extern void ext4_set_inode_flags(struct inode *);
> > > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > > index 0057fe3f248d..a6aef06f455b 100644
> > > > --- a/fs/ext4/extents.c
> > > > +++ b/fs/ext4/extents.c
> > > > @@ -4820,6 +4820,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
> > > >  		 * released from page cache.
> > > >  		 */
> > > >  		down_write(&EXT4_I(inode)->i_mmap_sem);
> > > > +
> > > > +		ret = ext4_break_layouts(inode);
> > > > +		if (ret) {
> > > > +			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > > +			goto out_mutex;
> > > > +		}
> > > > +
> > > >  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> > > >  		if (ret) {
> > > >  			up_write(&EXT4_I(inode)->i_mmap_sem);
> > > > @@ -5493,6 +5500,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> > > >  	 * page cache.
> > > >  	 */
> > > >  	down_write(&EXT4_I(inode)->i_mmap_sem);
> > > > +
> > > > +	ret = ext4_break_layouts(inode);
> > > > +	if (ret)
> > > > +		goto out_mmap;
> > > 
> > > Hi,
> > > 
> > > don't we need to do the same for ext4_insert_range() since we're about
> > > to truncate_pagecache() as well ?
> > > 
> > > /thinking out loud/
> > > Xfs seems to do this before every fallocate operation, but in ext4
> > > it does not seem to be needed at least for simply allocating falocate...
> > 
> > I saw the case in ext4_insert_range(), and decided that we didn't need to
> > worry about synchronizing with DAX because no blocks were being removed from
> > the inode's extent map.  IIUC the truncate_pagecache() call is needed because
> > we are unmapping and removing any page cache mappings for the part of the file
> > after the insert because those blocks are now at a different offset in the
> > inode.  Because at the end of the operation we haven't removed any DAX pages
> > from the inode, we have nothing that we need to synchronize.
> > 
> > Hmm, unless this is a failure case we care about fixing?
> >  1) schedule I/O via O_DIRECT to page X
> >  2) fallocate(FALLOC_FL_INSERT_RANGE) to block < X, shifting X to a larger
> >     offset
> >  3) O_DIRECT I/O from 1) completes, but ends up writing into the *new* block
> >     that resides at X - the I/O from 1) completes
> > 
> > In this case the user is running I/O and issuing the fallocate at the same
> > time, and the sequencing could have worked out that #1 and #2 were reversed,
> > giving you the same behavior.  IMO this seems fine and that we shouldn't have
> > the DAX synchronization call in ext4_insert_range(), but I'm happy to add it
> > if I'm wrong.
> 
> Hi,
> 
> I think you're right, this case might mot matter much. I am just worried
> about unforeseen consequences of changing the layout with dax pages
> mapped. I guess we can also add this later fi we discover anything.
> 
> You can add
> 
> Reviewed-by: Lukas Czerner <lczerner@redhat.com>
> 
> Thanks!
> -Lukas

Thank you for the review.  I'll add a comment to help explain my reasoning, as
Jan suggested.
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0b127853c584..34bccd64d83d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2460,6 +2460,7 @@  extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
+extern int ext4_break_layouts(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0057fe3f248d..a6aef06f455b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4820,6 +4820,13 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 		 * released from page cache.
 		 */
 		down_write(&EXT4_I(inode)->i_mmap_sem);
+
+		ret = ext4_break_layouts(inode);
+		if (ret) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			goto out_mutex;
+		}
+
 		ret = ext4_update_disksize_before_punch(inode, offset, len);
 		if (ret) {
 			up_write(&EXT4_I(inode)->i_mmap_sem);
@@ -5493,6 +5500,11 @@  int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	 * page cache.
 	 */
 	down_write(&EXT4_I(inode)->i_mmap_sem);
+
+	ret = ext4_break_layouts(inode);
+	if (ret)
+		goto out_mmap;
+
 	/*
 	 * Need to round down offset to be aligned with page size boundary
 	 * for page size > block size.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2ea07efbe016..fadb8ecacb1e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4193,6 +4193,39 @@  int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
 	return 0;
 }
 
+static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
+{
+	*did_unlock = true;
+	up_write(&ei->i_mmap_sem);
+	schedule();
+	down_write(&ei->i_mmap_sem);
+}
+
+int ext4_break_layouts(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct page *page;
+	bool retry;
+	int error;
+
+	if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+		return -EINVAL;
+
+	do {
+		retry = false;
+		page = dax_layout_busy_page(inode->i_mapping);
+		if (!page)
+			return 0;
+
+		error = ___wait_var_event(&page->_refcount,
+				atomic_read(&page->_refcount) == 1,
+				TASK_INTERRUPTIBLE, 0, 0,
+				ext4_wait_dax_page(ei, &retry));
+	} while (error == 0 && retry);
+
+	return error;
+}
+
 /*
  * ext4_punch_hole: punches a hole in a file by releasing the blocks
  * associated with the given offset and length
@@ -4266,6 +4299,11 @@  int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	 * page cache.
 	 */
 	down_write(&EXT4_I(inode)->i_mmap_sem);
+
+	ret = ext4_break_layouts(inode);
+	if (ret)
+		goto out_dio;
+
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
@@ -5554,6 +5592,14 @@  int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				ext4_wait_for_tail_page_commit(inode);
 		}
 		down_write(&EXT4_I(inode)->i_mmap_sem);
+
+		rc = ext4_break_layouts(inode);
+		if (rc) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			error = rc;
+			goto err_out;
+		}
+
 		/*
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 0cb13badf473..bcbe3668c1d4 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,6 +11,10 @@ 
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+	/*
+	 * We don't need to call ext4_break_layouts() because the blocks we
+	 * are truncating were never visible to userspace.
+	 */
 	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
 	ext4_truncate(inode);