[v4] Btrfs: add support for fallocate's zero range operation

Message ID	20171104040747.8737-1-fdmanana@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-btrfs-owner@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org EEFC221870 From: fdmanana@kernel.org To: linux-btrfs@vger.kernel.org Subject: [PATCH v4] Btrfs: add support for fallocate's zero range operation Date: Sat, 4 Nov 2017 04:07:47 +0000 Message-Id: <20171104040747.8737-1-fdmanana@kernel.org> In-Reply-To: <20171103172037.7107-1-fdmanana@kernel.org> References: <20171103172037.7107-1-fdmanana@kernel.org> Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f840..ea2e863eb540 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2448,6 +2448,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) return ret; } +static int btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) +{ + while (1) { + struct btrfs_ordered_extent *ordered; + int ret; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, cached_state, GFP_NOFS); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) + return ret; + } + return 0; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2564,38 +2604,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - while (1) { - struct btrfs_ordered_extent *ordered; - - truncate_pagecache_range(inode, lockstart, lockend); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); - - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. - */ - if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || - ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state, GFP_NOFS); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) { - inode_unlock(inode); - return ret; - } + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) { + inode_unlock(inode); + goto out_only_mutex; } path = btrfs_alloc_path(); @@ -2804,6 +2817,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len) return 0; } +static int btrfs_fallocate_update_isize(struct inode *inode, + const u64 end, + const int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + int ret2; + + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) + return 0; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->i_ctime = current_time(inode); + i_size_write(inode, end); + btrfs_ordered_update_i_size(inode, end, NULL); + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + + return ret ? ret : ret2; +} + +static int btrfs_zero_range_check_range_boundary(struct inode *inode, + u64 offset) +{ + const u64 sectorsize = btrfs_inode_sectorsize(inode); + struct extent_map *em; + int ret = 0; + + offset = round_down(offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start == EXTENT_MAP_HOLE) + ret = 1; + + free_extent_map(em); + return ret; +} + +static int btrfs_zero_range(struct inode *inode, + loff_t offset, + loff_t len, + const int mode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_map *em; + struct extent_changeset *data_reserved = NULL; + int ret; + u64 alloc_hint = 0; + const u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 alloc_start = round_down(offset, sectorsize); + u64 alloc_end = round_up(offset + len, sectorsize); + u64 bytes_to_reserve = 0; + bool space_reserved = false; + + inode_dio_wait(inode); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, alloc_end - alloc_start, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + /* + * Avoid hole punching and extent allocation for some cases. More cases + * could be considered, but these are unlikely common and we keep things + * as simple as possible for now. Also, intentionally, if the target + * range contains one or more prealloc extents together with regular + * extents and holes, we drop all the existing extents and allocate a + * new prealloc extent, so that we get a larger contiguous disk extent. + */ + if (em->start <= alloc_start && + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + const u64 em_end = em->start + em->len; + + if (em_end >= offset + len) { + /* + * The whole range is already a prealloc extent, + * do nothing except updating the inode's i_size if + * needed. + */ + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + /* + * Part of the range is already a prealloc extent, so operate + * only on the remaining part of the range. + */ + alloc_start = em_end; + ASSERT(IS_ALIGNED(alloc_start, sectorsize)); + len = offset + len - alloc_start; + offset = alloc_start; + alloc_hint = em->block_start + em->len; + } + free_extent_map(em); + + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, sectorsize, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + free_extent_map(em); + ret = btrfs_truncate_block(inode, offset, len, 0); + if (!ret) + ret = btrfs_fallocate_update_isize(inode, + offset + len, + mode); + return ret; + } + free_extent_map(em); + alloc_start = round_down(offset, sectorsize); + alloc_end = alloc_start + sectorsize; + goto reserve_space; + } + + alloc_start = round_up(offset, sectorsize); + alloc_end = round_down(offset + len, sectorsize); + + /* + * For unaligned ranges, check the pages at the boundaries, they might + * map to an extent, in which case we need to partially zero them, or + * they might map to a hole, in which case we need our allocation range + * to cover them. + */ + if (!IS_ALIGNED(offset, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, offset); + if (ret < 0) + goto out; + if (ret) { + alloc_start = round_down(offset, sectorsize); + ret = 0; + } else { + ret = btrfs_truncate_block(inode, offset, 0, 0); + if (ret) + goto out; + } + } + + if (!IS_ALIGNED(offset + len, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, + offset + len); + if (ret < 0) + goto out; + if (ret) { + alloc_end = round_up(offset + len, sectorsize); + ret = 0; + } else { + ret = btrfs_truncate_block(inode, offset + len, 0, 1); + if (ret) + goto out; + } + } + +reserve_space: + if (alloc_start < alloc_end) { + struct extent_state *cached_state = NULL; + const u64 lockstart = alloc_start; + const u64 lockend = alloc_end - 1; + + bytes_to_reserve = alloc_end - alloc_start; + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + bytes_to_reserve); + if (ret < 0) + goto out; + space_reserved = true; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) + goto out; + ret = btrfs_prealloc_file_range(inode, mode, alloc_start, + alloc_end - alloc_start, + i_blocksize(inode), + offset + len, &alloc_hint); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_KERNEL); + /* btrfs_prealloc_file_range releases reserved space on error */ + if (ret) + space_reserved = false; + } + out: + if (ret && space_reserved) + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, bytes_to_reserve); + extent_changeset_free(data_reserved); + + return ret; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2829,7 +3053,8 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -2840,10 +3065,12 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); + if (ret < 0) + return ret; + } inode_lock(inode); @@ -2885,6 +3112,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = btrfs_zero_range(inode, offset, len, mode); + inode_unlock(inode); + return ret; + } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2980,37 +3213,18 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) goto out_unlock; - if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = current_time(inode); - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_end_transaction(trans); - } - } + /* + * We didn't need to allocate any more space, but we still extended the + * size of the file so we need to update i_size and the inode item. + */ + ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: inode_unlock(inode); /* Let go of our reservation. */ - if (ret != 0) + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved);

[v4] Btrfs: add support for fallocate's zero range operation

Commit Message

Comments

Patch