diff mbox series

[5/5] ext4: implement FALLOC_FL_ZEROINIT_RANGE

Message ID 163192867220.417973.4913917281472586603.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series vfs: enable userspace to reset damaged file storage | expand

Commit Message

Darrick J. Wong Sept. 18, 2021, 1:31 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Implement this new fallocate mode so that persistent memory users can,
upon receipt of a pmem poison notification, cause the pmem to be
reinitialized to a known value (zero) and clear any hardware poison
state that might be lurking.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/ext4/extents.c           |   93 +++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/ext4.h |    7 +++
 2 files changed, 99 insertions(+), 1 deletion(-)

Comments

Ritesh Harjani Sept. 18, 2021, 5:07 p.m. UTC | #1
+cc linux-ext4

[Thread]: https://lore.kernel.org/linux-xfs/163192864476.417973.143014658064006895.stgit@magnolia/T/#t

On 21/09/17 06:31PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Implement this new fallocate mode so that persistent memory users can,
> upon receipt of a pmem poison notification, cause the pmem to be
> reinitialized to a known value (zero) and clear any hardware poison
> state that might be lurking.
>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/ext4/extents.c           |   93 +++++++++++++++++++++++++++++++++++++++++++
>  include/trace/events/ext4.h |    7 +++
>  2 files changed, 99 insertions(+), 1 deletion(-)
>
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index c0de30f25185..c345002e2da6 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -29,6 +29,7 @@
>  #include <linux/fiemap.h>
>  #include <linux/backing-dev.h>
>  #include <linux/iomap.h>
> +#include <linux/dax.h>
>  #include "ext4_jbd2.h"
>  #include "ext4_extents.h"
>  #include "xattr.h"
> @@ -4475,6 +4476,90 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
>
>  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
>
> +static long ext4_zeroinit_range(struct file *file, loff_t offset, loff_t len)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct address_space *mapping = inode->i_mapping;
> +	handle_t *handle = NULL;
> +	loff_t end = offset + len;
> +	long ret;
> +
> +	trace_ext4_zeroinit_range(inode, offset, len,
> +			FALLOC_FL_ZEROINIT_RANGE | FALLOC_FL_KEEP_SIZE);
> +
> +	/* We don't support data=journal mode */
> +	if (ext4_should_journal_data(inode))
> +		return -EOPNOTSUPP;
> +
> +	inode_lock(inode);
> +
> +	/*
> +	 * Indirect files do not support unwritten extents
> +	 */
> +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> +		ret = -EOPNOTSUPP;
> +		goto out_mutex;
> +	}
> +
> +	/* Wait all existing dio workers, newcomers will block on i_mutex */
> +	inode_dio_wait(inode);
> +
> +	/*
> +	 * Prevent page faults from reinstantiating pages we have released from
> +	 * page cache.
> +	 */
> +	filemap_invalidate_lock(mapping);
> +
> +	ret = ext4_break_layouts(inode);
> +	if (ret)
> +		goto out_mmap;
> +
> +	/* Now release the pages and zero block aligned part of pages */
> +	truncate_pagecache_range(inode, offset, end - 1);
> +	inode->i_mtime = inode->i_ctime = current_time(inode);
> +
> +	if (IS_DAX(inode))
> +		ret = dax_zeroinit_range(inode, offset, len,
> +				&ext4_iomap_report_ops);
> +	else
> +		ret = iomap_zeroout_range(inode, offset, len,
> +				&ext4_iomap_report_ops);
> +	if (ret == -ECANCELED)
> +		ret = -EOPNOTSUPP;
> +	if (ret)
> +		goto out_mmap;
> +
> +	/*
> +	 * In worst case we have to writeout two nonadjacent unwritten
> +	 * blocks and update the inode
> +	 */

Is this comment true? We are actually not touching IOMAP_UNWRITTEN blocks no?
So is there any need for journal transaction for this?
We are essentially only writing to blocks which are already allocated on disk
and zeroing it out in both dax_zeroinit_range() and iomap_zeroinit_range().


> +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);

I guess credits is 1 here since only inode is getting modified.


> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		ext4_std_error(inode->i_sb, ret);
> +		goto out_mmap;
> +	}
> +
> +	inode->i_mtime = inode->i_ctime = current_time(inode);
> +	ret = ext4_mark_inode_dirty(handle, inode);
> +	if (unlikely(ret))
> +		goto out_handle;
> +	ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
> +			(offset + len - 1) >> inode->i_sb->s_blocksize_bits);

I am not sure whether we need ext4_fc_track_range() here?
We are not doing any metadata operation except maybe updating inode timestamp
right?

-ritesh

> +	ext4_update_inode_fsync_trans(handle, inode, 1);
> +
> +	if (file->f_flags & O_SYNC)
> +		ext4_handle_sync(handle);
> +
> +out_handle:
> +	ext4_journal_stop(handle);
> +out_mmap:
> +	filemap_invalidate_unlock(mapping);
> +out_mutex:
> +	inode_unlock(inode);
> +	return ret;
> +}
> +
>  static long ext4_zero_range(struct file *file, loff_t offset,
>  			    loff_t len, int mode)
>  {
> @@ -4659,7 +4744,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  	/* Return error if mode is not supported */
>  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>  		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> -		     FALLOC_FL_INSERT_RANGE))
> +		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZEROINIT_RANGE))
>  		return -EOPNOTSUPP;
>
>  	ext4_fc_start_update(inode);
> @@ -4687,6 +4772,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  		ret = ext4_zero_range(file, offset, len, mode);
>  		goto exit;
>  	}
> +
> +	if (mode & FALLOC_FL_ZEROINIT_RANGE) {
> +		ret = ext4_zeroinit_range(file, offset, len);
> +		goto exit;
> +	}
> +
>  	trace_ext4_fallocate_enter(inode, offset, len, mode);
>  	lblk = offset >> blkbits;
>
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index 0ea36b2b0662..282f1208067f 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -1407,6 +1407,13 @@ DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
>  	TP_ARGS(inode, offset, len, mode)
>  );
>
> +DEFINE_EVENT(ext4__fallocate_mode, ext4_zeroinit_range,
> +
> +	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
> +
> +	TP_ARGS(inode, offset, len, mode)
> +);
> +
>  TRACE_EVENT(ext4_fallocate_exit,
>  	TP_PROTO(struct inode *inode, loff_t offset,
>  		 unsigned int max_blocks, int ret),
>
Darrick J. Wong Sept. 20, 2021, 6:11 p.m. UTC | #2
On Sat, Sep 18, 2021 at 10:37:57PM +0530, riteshh wrote:
> +cc linux-ext4
> 
> [Thread]: https://lore.kernel.org/linux-xfs/163192864476.417973.143014658064006895.stgit@magnolia/T/#t
> 
> On 21/09/17 06:31PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Implement this new fallocate mode so that persistent memory users can,
> > upon receipt of a pmem poison notification, cause the pmem to be
> > reinitialized to a known value (zero) and clear any hardware poison
> > state that might be lurking.
> >
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/ext4/extents.c           |   93 +++++++++++++++++++++++++++++++++++++++++++
> >  include/trace/events/ext4.h |    7 +++
> >  2 files changed, 99 insertions(+), 1 deletion(-)
> >
> >
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index c0de30f25185..c345002e2da6 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -29,6 +29,7 @@
> >  #include <linux/fiemap.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/iomap.h>
> > +#include <linux/dax.h>
> >  #include "ext4_jbd2.h"
> >  #include "ext4_extents.h"
> >  #include "xattr.h"
> > @@ -4475,6 +4476,90 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
> >
> >  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
> >
> > +static long ext4_zeroinit_range(struct file *file, loff_t offset, loff_t len)
> > +{
> > +	struct inode *inode = file_inode(file);
> > +	struct address_space *mapping = inode->i_mapping;
> > +	handle_t *handle = NULL;
> > +	loff_t end = offset + len;
> > +	long ret;
> > +
> > +	trace_ext4_zeroinit_range(inode, offset, len,
> > +			FALLOC_FL_ZEROINIT_RANGE | FALLOC_FL_KEEP_SIZE);
> > +
> > +	/* We don't support data=journal mode */
> > +	if (ext4_should_journal_data(inode))
> > +		return -EOPNOTSUPP;
> > +
> > +	inode_lock(inode);
> > +
> > +	/*
> > +	 * Indirect files do not support unwritten extents
> > +	 */
> > +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out_mutex;
> > +	}
> > +
> > +	/* Wait all existing dio workers, newcomers will block on i_mutex */
> > +	inode_dio_wait(inode);
> > +
> > +	/*
> > +	 * Prevent page faults from reinstantiating pages we have released from
> > +	 * page cache.
> > +	 */
> > +	filemap_invalidate_lock(mapping);
> > +
> > +	ret = ext4_break_layouts(inode);
> > +	if (ret)
> > +		goto out_mmap;
> > +
> > +	/* Now release the pages and zero block aligned part of pages */
> > +	truncate_pagecache_range(inode, offset, end - 1);
> > +	inode->i_mtime = inode->i_ctime = current_time(inode);
> > +
> > +	if (IS_DAX(inode))
> > +		ret = dax_zeroinit_range(inode, offset, len,
> > +				&ext4_iomap_report_ops);
> > +	else
> > +		ret = iomap_zeroout_range(inode, offset, len,
> > +				&ext4_iomap_report_ops);
> > +	if (ret == -ECANCELED)
> > +		ret = -EOPNOTSUPP;
> > +	if (ret)
> > +		goto out_mmap;
> > +
> > +	/*
> > +	 * In worst case we have to writeout two nonadjacent unwritten
> > +	 * blocks and update the inode
> > +	 */
> 
> Is this comment true? We are actually not touching IOMAP_UNWRITTEN blocks no?
> So is there any need for journal transaction for this?
> We are essentially only writing to blocks which are already allocated on disk
> and zeroing it out in both dax_zeroinit_range() and iomap_zeroinit_range().

Oops.  Yeah, the comment is wrong.  Deleted.

> > +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
> 
> I guess credits is 1 here since only inode is getting modified.

Yep.

> 
> > +	if (IS_ERR(handle)) {
> > +		ret = PTR_ERR(handle);
> > +		ext4_std_error(inode->i_sb, ret);
> > +		goto out_mmap;
> > +	}
> > +
> > +	inode->i_mtime = inode->i_ctime = current_time(inode);
> > +	ret = ext4_mark_inode_dirty(handle, inode);
> > +	if (unlikely(ret))
> > +		goto out_handle;
> > +	ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
> > +			(offset + len - 1) >> inode->i_sb->s_blocksize_bits);
> 
> I am not sure whether we need ext4_fc_track_range() here?
> We are not doing any metadata operation except maybe updating inode timestamp
> right?

I wasn't sure what fastcommit needs to track about the range.  Is it
/only/ tracking changes to the file mapping?

/me is sadly falling further and further behind on where ext4 is these
days... :/

--D

> 
> -ritesh
> 
> > +	ext4_update_inode_fsync_trans(handle, inode, 1);
> > +
> > +	if (file->f_flags & O_SYNC)
> > +		ext4_handle_sync(handle);
> > +
> > +out_handle:
> > +	ext4_journal_stop(handle);
> > +out_mmap:
> > +	filemap_invalidate_unlock(mapping);
> > +out_mutex:
> > +	inode_unlock(inode);
> > +	return ret;
> > +}
> > +
> >  static long ext4_zero_range(struct file *file, loff_t offset,
> >  			    loff_t len, int mode)
> >  {
> > @@ -4659,7 +4744,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> >  	/* Return error if mode is not supported */
> >  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> >  		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> > -		     FALLOC_FL_INSERT_RANGE))
> > +		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZEROINIT_RANGE))
> >  		return -EOPNOTSUPP;
> >
> >  	ext4_fc_start_update(inode);
> > @@ -4687,6 +4772,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> >  		ret = ext4_zero_range(file, offset, len, mode);
> >  		goto exit;
> >  	}
> > +
> > +	if (mode & FALLOC_FL_ZEROINIT_RANGE) {
> > +		ret = ext4_zeroinit_range(file, offset, len);
> > +		goto exit;
> > +	}
> > +
> >  	trace_ext4_fallocate_enter(inode, offset, len, mode);
> >  	lblk = offset >> blkbits;
> >
> > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> > index 0ea36b2b0662..282f1208067f 100644
> > --- a/include/trace/events/ext4.h
> > +++ b/include/trace/events/ext4.h
> > @@ -1407,6 +1407,13 @@ DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
> >  	TP_ARGS(inode, offset, len, mode)
> >  );
> >
> > +DEFINE_EVENT(ext4__fallocate_mode, ext4_zeroinit_range,
> > +
> > +	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
> > +
> > +	TP_ARGS(inode, offset, len, mode)
> > +);
> > +
> >  TRACE_EVENT(ext4_fallocate_exit,
> >  	TP_PROTO(struct inode *inode, loff_t offset,
> >  		 unsigned int max_blocks, int ret),
> >
Ritesh Harjani Sept. 21, 2021, 6:10 a.m. UTC | #3
On 21/09/20 11:11AM, Darrick J. Wong wrote:
> On Sat, Sep 18, 2021 at 10:37:57PM +0530, riteshh wrote:
> > +cc linux-ext4
> >
> > [Thread]: https://lore.kernel.org/linux-xfs/163192864476.417973.143014658064006895.stgit@magnolia/T/#t
> >
> > On 21/09/17 06:31PM, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > >
> > > Implement this new fallocate mode so that persistent memory users can,
> > > upon receipt of a pmem poison notification, cause the pmem to be
> > > reinitialized to a known value (zero) and clear any hardware poison
> > > state that might be lurking.
> > >
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > >  fs/ext4/extents.c           |   93 +++++++++++++++++++++++++++++++++++++++++++
> > >  include/trace/events/ext4.h |    7 +++
> > >  2 files changed, 99 insertions(+), 1 deletion(-)
> > >
> > >
> > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > index c0de30f25185..c345002e2da6 100644
> > > --- a/fs/ext4/extents.c
> > > +++ b/fs/ext4/extents.c
> > > @@ -29,6 +29,7 @@
> > >  #include <linux/fiemap.h>
> > >  #include <linux/backing-dev.h>
> > >  #include <linux/iomap.h>
> > > +#include <linux/dax.h>
> > >  #include "ext4_jbd2.h"
> > >  #include "ext4_extents.h"
> > >  #include "xattr.h"
> > > @@ -4475,6 +4476,90 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
> > >
> > >  static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
> > >
> > > +static long ext4_zeroinit_range(struct file *file, loff_t offset, loff_t len)
> > > +{
> > > +	struct inode *inode = file_inode(file);
> > > +	struct address_space *mapping = inode->i_mapping;
> > > +	handle_t *handle = NULL;
> > > +	loff_t end = offset + len;
> > > +	long ret;
> > > +
> > > +	trace_ext4_zeroinit_range(inode, offset, len,
> > > +			FALLOC_FL_ZEROINIT_RANGE | FALLOC_FL_KEEP_SIZE);
> > > +
> > > +	/* We don't support data=journal mode */
> > > +	if (ext4_should_journal_data(inode))
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	inode_lock(inode);
> > > +
> > > +	/*
> > > +	 * Indirect files do not support unwritten extents
> > > +	 */
> > > +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out_mutex;
> > > +	}
> > > +
> > > +	/* Wait all existing dio workers, newcomers will block on i_mutex */
> > > +	inode_dio_wait(inode);
> > > +
> > > +	/*
> > > +	 * Prevent page faults from reinstantiating pages we have released from
> > > +	 * page cache.
> > > +	 */
> > > +	filemap_invalidate_lock(mapping);
> > > +
> > > +	ret = ext4_break_layouts(inode);
> > > +	if (ret)
> > > +		goto out_mmap;
> > > +
> > > +	/* Now release the pages and zero block aligned part of pages */
> > > +	truncate_pagecache_range(inode, offset, end - 1);
> > > +	inode->i_mtime = inode->i_ctime = current_time(inode);
> > > +
> > > +	if (IS_DAX(inode))
> > > +		ret = dax_zeroinit_range(inode, offset, len,
> > > +				&ext4_iomap_report_ops);
> > > +	else
> > > +		ret = iomap_zeroout_range(inode, offset, len,
> > > +				&ext4_iomap_report_ops);
> > > +	if (ret == -ECANCELED)
> > > +		ret = -EOPNOTSUPP;
> > > +	if (ret)
> > > +		goto out_mmap;
> > > +
> > > +	/*
> > > +	 * In worst case we have to writeout two nonadjacent unwritten
> > > +	 * blocks and update the inode
> > > +	 */
> >
> > Is this comment true? We are actually not touching IOMAP_UNWRITTEN blocks no?
> > So is there any need for journal transaction for this?
> > We are essentially only writing to blocks which are already allocated on disk
> > and zeroing it out in both dax_zeroinit_range() and iomap_zeroinit_range().
>
> Oops.  Yeah, the comment is wrong.  Deleted.
>
> > > +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
> >
> > I guess credits is 1 here since only inode is getting modified.
>
> Yep.
>
> >
> > > +	if (IS_ERR(handle)) {
> > > +		ret = PTR_ERR(handle);
> > > +		ext4_std_error(inode->i_sb, ret);
> > > +		goto out_mmap;
> > > +	}
> > > +
> > > +	inode->i_mtime = inode->i_ctime = current_time(inode);
> > > +	ret = ext4_mark_inode_dirty(handle, inode);
> > > +	if (unlikely(ret))
> > > +		goto out_handle;
> > > +	ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
> > > +			(offset + len - 1) >> inode->i_sb->s_blocksize_bits);
> >
> > I am not sure whether we need ext4_fc_track_range() here?
> > We are not doing any metadata operation except maybe updating inode timestamp
> > right?
>
> I wasn't sure what fastcommit needs to track about the range.  Is it
> /only/ tracking changes to the file mapping?

cc'ing Harshad as well (to correct if any of below is incorrect/incomplete).

I guess so yes, from reading this [1].
ext4_fc_track_range() - Track _changed_ logical block offsets inodes

For only inode update, I guess fastcommit uses ext4_fc_track_inode(), which is
implicit when we call ext4_mark_inode_dirty().

[1]: https://lore.kernel.org/all/20201015203802.3597742-6-harshadshirwadkar@gmail.com/

>
> /me is sadly falling further and further behind on where ext4 is these
> days... :/
>
> >
> > > +	ext4_update_inode_fsync_trans(handle, inode, 1);

Then do we even need above?

-ritesh


> > > +
> > > +	if (file->f_flags & O_SYNC)
> > > +		ext4_handle_sync(handle);
> > > +
> > > +out_handle:
> > > +	ext4_journal_stop(handle);
> > > +out_mmap:
> > > +	filemap_invalidate_unlock(mapping);
> > > +out_mutex:
> > > +	inode_unlock(inode);
> > > +	return ret;
> > > +}
> > > +
> > >  static long ext4_zero_range(struct file *file, loff_t offset,
> > >  			    loff_t len, int mode)
> > >  {
> > > @@ -4659,7 +4744,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> > >  	/* Return error if mode is not supported */
> > >  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> > >  		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
> > > -		     FALLOC_FL_INSERT_RANGE))
> > > +		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZEROINIT_RANGE))
> > >  		return -EOPNOTSUPP;
> > >
> > >  	ext4_fc_start_update(inode);
> > > @@ -4687,6 +4772,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> > >  		ret = ext4_zero_range(file, offset, len, mode);
> > >  		goto exit;
> > >  	}
> > > +
> > > +	if (mode & FALLOC_FL_ZEROINIT_RANGE) {
> > > +		ret = ext4_zeroinit_range(file, offset, len);
> > > +		goto exit;
> > > +	}
> > > +
> > >  	trace_ext4_fallocate_enter(inode, offset, len, mode);
> > >  	lblk = offset >> blkbits;
> > >
> > > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> > > index 0ea36b2b0662..282f1208067f 100644
> > > --- a/include/trace/events/ext4.h
> > > +++ b/include/trace/events/ext4.h
> > > @@ -1407,6 +1407,13 @@ DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
> > >  	TP_ARGS(inode, offset, len, mode)
> > >  );
> > >
> > > +DEFINE_EVENT(ext4__fallocate_mode, ext4_zeroinit_range,
> > > +
> > > +	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
> > > +
> > > +	TP_ARGS(inode, offset, len, mode)
> > > +);
> > > +
> > >  TRACE_EVENT(ext4_fallocate_exit,
> > >  	TP_PROTO(struct inode *inode, loff_t offset,
> > >  		 unsigned int max_blocks, int ret),
> > >
diff mbox series

Patch

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c0de30f25185..c345002e2da6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,6 +29,7 @@ 
 #include <linux/fiemap.h>
 #include <linux/backing-dev.h>
 #include <linux/iomap.h>
+#include <linux/dax.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
@@ -4475,6 +4476,90 @@  static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
 
 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
 
+static long ext4_zeroinit_range(struct file *file, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	handle_t *handle = NULL;
+	loff_t end = offset + len;
+	long ret;
+
+	trace_ext4_zeroinit_range(inode, offset, len,
+			FALLOC_FL_ZEROINIT_RANGE | FALLOC_FL_KEEP_SIZE);
+
+	/* We don't support data=journal mode */
+	if (ext4_should_journal_data(inode))
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+
+	/*
+	 * Indirect files do not support unwritten extents
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	inode_dio_wait(inode);
+
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	filemap_invalidate_lock(mapping);
+
+	ret = ext4_break_layouts(inode);
+	if (ret)
+		goto out_mmap;
+
+	/* Now release the pages and zero block aligned part of pages */
+	truncate_pagecache_range(inode, offset, end - 1);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	if (IS_DAX(inode))
+		ret = dax_zeroinit_range(inode, offset, len,
+				&ext4_iomap_report_ops);
+	else
+		ret = iomap_zeroout_range(inode, offset, len,
+				&ext4_iomap_report_ops);
+	if (ret == -ECANCELED)
+		ret = -EOPNOTSUPP;
+	if (ret)
+		goto out_mmap;
+
+	/*
+	 * In worst case we have to writeout two nonadjacent unwritten
+	 * blocks and update the inode
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, ret);
+		goto out_mmap;
+	}
+
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(ret))
+		goto out_handle;
+	ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
+			(offset + len - 1) >> inode->i_sb->s_blocksize_bits);
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+out_handle:
+	ext4_journal_stop(handle);
+out_mmap:
+	filemap_invalidate_unlock(mapping);
+out_mutex:
+	inode_unlock(inode);
+	return ret;
+}
+
 static long ext4_zero_range(struct file *file, loff_t offset,
 			    loff_t len, int mode)
 {
@@ -4659,7 +4744,7 @@  long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
-		     FALLOC_FL_INSERT_RANGE))
+		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZEROINIT_RANGE))
 		return -EOPNOTSUPP;
 
 	ext4_fc_start_update(inode);
@@ -4687,6 +4772,12 @@  long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		ret = ext4_zero_range(file, offset, len, mode);
 		goto exit;
 	}
+
+	if (mode & FALLOC_FL_ZEROINIT_RANGE) {
+		ret = ext4_zeroinit_range(file, offset, len);
+		goto exit;
+	}
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	lblk = offset >> blkbits;
 
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 0ea36b2b0662..282f1208067f 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1407,6 +1407,13 @@  DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
 	TP_ARGS(inode, offset, len, mode)
 );
 
+DEFINE_EVENT(ext4__fallocate_mode, ext4_zeroinit_range,
+
+	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+	TP_ARGS(inode, offset, len, mode)
+);
+
 TRACE_EVENT(ext4_fallocate_exit,
 	TP_PROTO(struct inode *inode, loff_t offset,
 		 unsigned int max_blocks, int ret),