
[v3] Btrfs: add support for fallocate's zero range operation

Message ID 20171103172037.7107-1-fdmanana@kernel.org (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Filipe Manana Nov. 3, 2017, 5:20 p.m. UTC
From: Filipe Manana <fdmanana@suse.com>

This implements support for fallocate's zero range operation
(FALLOC_FL_ZERO_RANGE). For now it is kept as simple as possible,
reusing most of the existing fallocate and hole punching infrastructure.
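
From user space the new operation is reached through fallocate(2) with the
FALLOC_FL_ZERO_RANGE flag. A minimal illustration, not part of this patch
(the file name, offset and length are arbitrary):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <linux/falloc.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("testfile", O_RDWR);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* Zero 1MiB at offset 4096 without changing the file size. */
          if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                        4096, 1024 * 1024) == -1)
                  perror("fallocate");
          close(fd);
          return 0;
  }

Without FALLOC_FL_KEEP_SIZE, i_size is extended when the range goes past EOF,
which is what the new btrfs_fallocate_update_isize() helper takes care of.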

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Removed a double inode unlock on the error path taken when locking the
    range fails.
V3: Factored the common code that updates i_size and the inode item into a
    helper function, plus some minor cleanup.

 fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 285 insertions(+), 66 deletions(-)

Comments

Edmund Nadolski Nov. 3, 2017, 8:59 p.m. UTC | #1
On 11/03/2017 11:20 AM, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for fallocate's zero range operation
> (FALLOC_FL_ZERO_RANGE). For now it is kept as simple as possible,
> reusing most of the existing fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed a double inode unlock on the error path taken when locking the
>     range fails.
> V3: Factored the common code that updates i_size and the inode item into a
>     helper function, plus some minor cleanup.
> 
>  fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 285 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..2cc1aed1c564 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +				       const u64 lockstart,
> +				       const u64 lockend,
> +				       struct extent_state **cached_state)
> +{
> +	while (1) {
> +		struct btrfs_ordered_extent *ordered;
> +		int ret;
> +
> +		truncate_pagecache_range(inode, lockstart, lockend);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +				 cached_state);
> +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +		/*
> +		 * We need to make sure we have no ordered extents in this range
> +		 * and nobody raced in and read a page in this range, if we did
> +		 * we need to try again.
> +		 */
> +		if ((!ordered ||
> +		    (ordered->file_offset + ordered->len <= lockstart ||
> +		     ordered->file_offset > lockend)) &&
> +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +			if (ordered)
> +				btrfs_put_ordered_extent(ordered);
> +			break;
> +		}
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, cached_state, GFP_NOFS);
> +		ret = btrfs_wait_ordered_range(inode, lockstart,
> +					       lockend - lockstart + 1);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +			    bool lock_inode)

The lock_inode parameter may no longer be needed, since it looks to always
be true in this version of the patch.

Ed


Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..2cc1aed1c564 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2448,7 +2448,48 @@  static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
+			    bool lock_inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2477,7 +2518,8 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	inode_lock(inode);
+	if (lock_inode)
+		inode_lock(inode);
 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
 	ret = find_first_non_hole(inode, &offset, &len);
 	if (ret < 0)
@@ -2516,7 +2558,8 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncated_block = true;
 		ret = btrfs_truncate_block(inode, offset, 0, 0);
 		if (ret) {
-			inode_unlock(inode);
+			if (lock_inode)
+				inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2564,38 +2607,12 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		if (lock_inode)
 			inode_unlock(inode);
-			return ret;
-		}
+		return ret;
 	}
 
 	path = btrfs_alloc_path();
@@ -2758,7 +2775,8 @@  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			ret = btrfs_end_transaction(trans);
 		}
 	}
-	inode_unlock(inode);
+	if (lock_inode)
+		inode_unlock(inode);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -2804,6 +2822,217 @@  static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 	return 0;
 }
 
+static int btrfs_fallocate_update_isize(struct inode *inode,
+					const u64 end,
+					const int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	int ret2;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	inode->i_ctime = current_time(inode);
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	ret = btrfs_update_inode(trans, root, inode);
+	ret2 = btrfs_end_transaction(trans);
+
+	return ret ? ret : ret2;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em = NULL;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			free_extent_map(em);
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			if (!ret)
+				ret = btrfs_fallocate_update_isize(inode,
+								   offset + len,
+								   mode);
+			return ret;
+		}
+		free_extent_map(em);
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end) {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		bytes_to_reserve = alloc_end - alloc_start;
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_KERNEL);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2829,21 +3058,24 @@  static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return btrfs_punch_hole(inode, offset, len);
+		return btrfs_punch_hole(inode, offset, len, true);
 
 	/*
 	 * Only trigger disk allocation, don't trigger qgroup reserve
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2885,6 +3117,12 @@  static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -2980,37 +3218,18 @@  static long btrfs_fallocate(struct file *file, int mode,
 	if (ret < 0)
 		goto out_unlock;
 
-	if (actual_end > inode->i_size &&
-	    !(mode & FALLOC_FL_KEEP_SIZE)) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-
-		/*
-		 * We didn't need to allocate any more space, but we
-		 * still extended the size of the file so we need to
-		 * update i_size and the inode item.
-		 */
-		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-		} else {
-			inode->i_ctime = current_time(inode);
-			i_size_write(inode, actual_end);
-			btrfs_ordered_update_i_size(inode, actual_end, NULL);
-			ret = btrfs_update_inode(trans, root, inode);
-			if (ret)
-				btrfs_end_transaction(trans);
-			else
-				ret = btrfs_end_transaction(trans);
-		}
-	}
+	/*
+	 * We didn't need to allocate any more space, but we still extended the
+	 * size of the file so we need to update i_size and the inode item.
+	 */
+	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_KERNEL);
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);