diff mbox series

[4/6] btrfs: use the file extent tree infrastructure

Message ID 20200117140224.42495-5-josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series btrfs: fix hole corruption issue with !NO_HOLES | expand

Commit Message

Josef Bacik Jan. 17, 2020, 2:02 p.m. UTC
We want to use this everywhere we modify the file extent items
permanently.  These include

1) Inserting new file extents for writes and prealloc extents.
2) Truncating inode items.
3) btrfs_cont_expand().
4) Insert inline extents.
5) Insert new extents from log replay.
6) Insert a new extent for clone, as it could be past isize.

We do not however call the clear helper for hole punching because it
simply swaps out an existing file extent for a hole, so there's
effectively no change as far as the i_size is concerned.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/delayed-inode.c |  4 +++
 fs/btrfs/file.c          |  6 ++++
 fs/btrfs/inode.c         | 59 +++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/tree-log.c      |  5 ++++
 4 files changed, 73 insertions(+), 1 deletion(-)

Comments

Filipe Manana March 6, 2020, 11:51 a.m. UTC | #1
On Fri, Jan 17, 2020 at 2:03 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> We want to use this everywhere we modify the file extent items
> permanently.  These include
>
> 1) Inserting new file extents for writes and prealloc extents.
> 2) Truncating inode items.
> 3) btrfs_cont_expand().
> 4) Insert inline extents.
> 5) Insert new extents from log replay.
> 6) Insert a new extent for clone, as it could be past isize.
>
> We do not however call the clear helper for hole punching because it
> simply swaps out an existing file extent for a hole, so there's
> effectively no change as far as the i_size is concerned.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> Reviewed-by: Filipe Manana <fdmanana@suse.com>
> ---
>  fs/btrfs/delayed-inode.c |  4 +++
>  fs/btrfs/file.c          |  6 ++++
>  fs/btrfs/inode.c         | 59 +++++++++++++++++++++++++++++++++++++++-
>  fs/btrfs/tree-log.c      |  5 ++++
>  4 files changed, 73 insertions(+), 1 deletion(-)
>
> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> index d3e15e1d4a91..8b4dcf4f6b3e 100644
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -1762,6 +1762,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>  {
>         struct btrfs_delayed_node *delayed_node;
>         struct btrfs_inode_item *inode_item;
> +       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>
>         delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
>         if (!delayed_node)
> @@ -1779,6 +1780,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>         i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
>         i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
>         btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
> +       btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
> +                                         round_up(i_size_read(inode),
> +                                                  fs_info->sectorsize));
>         inode->i_mode = btrfs_stack_inode_mode(inode_item);
>         set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
>         inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index c6f9029e3d49..f72fb38e9aaa 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2482,6 +2482,12 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
>         btrfs_mark_buffer_dirty(leaf);
>         btrfs_release_path(path);
>
> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> +                                               clone_info->file_offset,
> +                                               clone_len);
> +       if (ret)
> +               return ret;
> +
>         /* If it's a hole, nothing more needs to be done. */
>         if (clone_info->disk_offset == 0)
>                 return 0;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index fd8f821a3919..21fb80292256 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -241,6 +241,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>         btrfs_mark_buffer_dirty(leaf);
>         btrfs_release_path(path);
>
> +       /*
> +        * We align size to sectorsize for inline extents just for simplicity
> +        * sake.
> +        */
> +       size = ALIGN(size, root->fs_info->sectorsize);
> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
> +       if (ret)
> +               goto fail;
> +
>         /*
>          * we're an inline extent, so nobody can
>          * extend the file past i_size without locking
> @@ -2375,6 +2384,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
>         ins.offset = disk_num_bytes;
>         ins.type = BTRFS_EXTENT_ITEM_KEY;
>
> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
> +                                               ram_bytes);
> +       if (ret)
> +               goto out;
> +
>         /*
>          * Release the reserved range from inode dirty range map, as it is
>          * already moved into delayed_ref_head
> @@ -4084,6 +4098,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>         }
>
>         while (1) {
> +               u64 clear_start = 0, clear_len = 0;
> +
>                 fi = NULL;
>                 leaf = path->nodes[0];
>                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> @@ -4134,6 +4150,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>
>                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
>                         u64 num_dec;
> +
> +                       clear_start = found_key.offset;
>                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
>                         if (!del_item) {
>                                 u64 orig_num_bytes =
> @@ -4141,6 +4159,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>                                 extent_num_bytes = ALIGN(new_size -
>                                                 found_key.offset,
>                                                 fs_info->sectorsize);
> +                               clear_start = ALIGN(new_size,
> +                                                   fs_info->sectorsize);
>                                 btrfs_set_file_extent_num_bytes(leaf, fi,
>                                                          extent_num_bytes);
>                                 num_dec = (orig_num_bytes -
> @@ -4166,6 +4186,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>                                                 inode_sub_bytes(inode, num_dec);
>                                 }
>                         }
> +                       clear_len = num_dec;
>                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
>                         /*
>                          * we can't truncate inline items that have had
> @@ -4187,12 +4208,34 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>                                  */
>                                 ret = NEED_TRUNCATE_BLOCK;
>                                 break;
> +                       } else {
> +                               /*
> +                                * Inline extents are special, we just treat
> +                                * them as a full sector worth in the file
> +                                * extent tree just for simplicity sake.
> +                                */
> +                               clear_len = fs_info->sectorsize;
>                         }
>
>                         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
>                                 inode_sub_bytes(inode, item_end + 1 - new_size);
>                 }
>  delete:
> +               /*
> +                * We use btrfs_truncate_inode_items() to clean up log trees for
> +                * multiple fsyncs, and in this case we don't want to clear the
> +                * file extent range because it's just the log.
> +                */
> +               if (root == BTRFS_I(inode)->root) {
> +                       ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
> +                                                                 clear_start,
> +                                                                 clear_len);
> +                       if (ret) {
> +                               btrfs_abort_transaction(trans, ret);
> +                               break;
> +                       }
> +               }
> +
>                 if (del_item)
>                         last_size = found_key.offset;
>                 else
> @@ -4513,14 +4556,22 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>                 }
>                 last_byte = min(extent_map_end(em), block_end);
>                 last_byte = ALIGN(last_byte, fs_info->sectorsize);
> +               hole_size = last_byte - cur_offset;
> +
>                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>                         struct extent_map *hole_em;
> -                       hole_size = last_byte - cur_offset;
>
>                         err = maybe_insert_hole(root, inode, cur_offset,
>                                                 hole_size);
>                         if (err)
>                                 break;
> +
> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> +                                                               cur_offset,
> +                                                               hole_size);
> +                       if (err)
> +                               break;
> +
>                         btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
>                                                 cur_offset + hole_size - 1, 0);
>                         hole_em = alloc_extent_map();
> @@ -4552,6 +4603,12 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>                                                         hole_size - 1, 0);
>                         }
>                         free_extent_map(hole_em);
> +               } else {
> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> +                                                               cur_offset,
> +                                                               hole_size);
> +                       if (err)
> +                               break;
>                 }
>  next:
>                 free_extent_map(em);
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 80978ebfdcbb..56278a5b69e4 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -830,6 +830,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
>                         goto out;
>         }
>
> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
> +                                               extent_end - start);
> +       if (ret)
> +               goto out;

So while working on making ranged fsyncs for the slow patch (inode has
the 'need full sync' flag set), I uncovered more cases where we end up
with missing file extent items.

When doing a fast fsync, we log only the extents in the given range,
but we log an inode item with the current i_size of the inode.
So after a log replay we can get missing file extent items.

For example this test case I made earlier today (also at
https://pastebin.com/ezFFGEf8 for better readability):

#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2020 SUSE Linux Products GmbH. All Rights Reserved.
#
# FSQA Test No. XXX
#
# TODO description
#
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15

_cleanup()
{
_cleanup_flakey
cd /
rm -f $tmp.*
}

# get standard environment, filters and checks
. ./common/rc
. ./common/attr
. ./common/filter
. ./common/dmflakey

# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_target flakey
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
_require_xfs_io_command "sync_range"

rm -f $seqres.full

_scratch_mkfs -O ^no-holes >>$seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_init_flakey
_mount_flakey

touch $SCRATCH_MNT/foo
# Clear the full sync bit, so that the msync below triggers the fast fsync path.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

# get rid of the log tree
sync

# Dirty some pages and flush only some of them at the beginning without updating
# the log tree.
$XFS_IO_PROG -c "pwrite -S 0xab 0 1M" \
     -c "sync_range -abw 0 256K" \
     $SCRATCH_MNT/foo >>$seqres.full

$XFS_IO_PROG \
        -c "mmap -w 512K 512K"        \
        -c "mwrite -S 0xcd 512K 512K" \
        -c "msync -s 512K 512K"       \
        -c "munmap"                   \
        $SCRATCH_MNT/foo >>$seqres.full

_flakey_drop_and_remount
_unmount_flakey

# On exit fsck is run and reports missing file extent for range from 0 to 512Kb.

status=0
exit

We end up an unmarked hole at 0-512K.
After thinking a bit, I don't see an easy/simple way to fix this.
The only reliable way I can think of so sar is:  after replaying the
extents of an inode, inserting file extent items to represent holes
(only when not using NO_HOLES of course). That would imply scanning
all extents items in the fs/subvolume tree to check for gaps.

Any better idea?

Thanks.




> +
>         inode_add_bytes(inode, nbytes);
>  update_inode:
>         ret = btrfs_update_inode(trans, root, inode);
> --
> 2.24.1
>
Josef Bacik March 6, 2020, 2:52 p.m. UTC | #2
On 3/6/20 6:51 AM, Filipe Manana wrote:
> On Fri, Jan 17, 2020 at 2:03 PM Josef Bacik <josef@toxicpanda.com> wrote:
>>
>> We want to use this everywhere we modify the file extent items
>> permanently.  These include
>>
>> 1) Inserting new file extents for writes and prealloc extents.
>> 2) Truncating inode items.
>> 3) btrfs_cont_expand().
>> 4) Insert inline extents.
>> 5) Insert new extents from log replay.
>> 6) Insert a new extent for clone, as it could be past isize.
>>
>> We do not however call the clear helper for hole punching because it
>> simply swaps out an existing file extent for a hole, so there's
>> effectively no change as far as the i_size is concerned.
>>
>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
>> Reviewed-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>   fs/btrfs/delayed-inode.c |  4 +++
>>   fs/btrfs/file.c          |  6 ++++
>>   fs/btrfs/inode.c         | 59 +++++++++++++++++++++++++++++++++++++++-
>>   fs/btrfs/tree-log.c      |  5 ++++
>>   4 files changed, 73 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
>> index d3e15e1d4a91..8b4dcf4f6b3e 100644
>> --- a/fs/btrfs/delayed-inode.c
>> +++ b/fs/btrfs/delayed-inode.c
>> @@ -1762,6 +1762,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>>   {
>>          struct btrfs_delayed_node *delayed_node;
>>          struct btrfs_inode_item *inode_item;
>> +       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>>
>>          delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
>>          if (!delayed_node)
>> @@ -1779,6 +1780,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>>          i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
>>          i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
>>          btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
>> +       btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
>> +                                         round_up(i_size_read(inode),
>> +                                                  fs_info->sectorsize));
>>          inode->i_mode = btrfs_stack_inode_mode(inode_item);
>>          set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
>>          inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>> index c6f9029e3d49..f72fb38e9aaa 100644
>> --- a/fs/btrfs/file.c
>> +++ b/fs/btrfs/file.c
>> @@ -2482,6 +2482,12 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
>>          btrfs_mark_buffer_dirty(leaf);
>>          btrfs_release_path(path);
>>
>> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
>> +                                               clone_info->file_offset,
>> +                                               clone_len);
>> +       if (ret)
>> +               return ret;
>> +
>>          /* If it's a hole, nothing more needs to be done. */
>>          if (clone_info->disk_offset == 0)
>>                  return 0;
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index fd8f821a3919..21fb80292256 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -241,6 +241,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>          btrfs_mark_buffer_dirty(leaf);
>>          btrfs_release_path(path);
>>
>> +       /*
>> +        * We align size to sectorsize for inline extents just for simplicity
>> +        * sake.
>> +        */
>> +       size = ALIGN(size, root->fs_info->sectorsize);
>> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
>> +       if (ret)
>> +               goto fail;
>> +
>>          /*
>>           * we're an inline extent, so nobody can
>>           * extend the file past i_size without locking
>> @@ -2375,6 +2384,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
>>          ins.offset = disk_num_bytes;
>>          ins.type = BTRFS_EXTENT_ITEM_KEY;
>>
>> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
>> +                                               ram_bytes);
>> +       if (ret)
>> +               goto out;
>> +
>>          /*
>>           * Release the reserved range from inode dirty range map, as it is
>>           * already moved into delayed_ref_head
>> @@ -4084,6 +4098,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>>          }
>>
>>          while (1) {
>> +               u64 clear_start = 0, clear_len = 0;
>> +
>>                  fi = NULL;
>>                  leaf = path->nodes[0];
>>                  btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
>> @@ -4134,6 +4150,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>>
>>                  if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
>>                          u64 num_dec;
>> +
>> +                       clear_start = found_key.offset;
>>                          extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
>>                          if (!del_item) {
>>                                  u64 orig_num_bytes =
>> @@ -4141,6 +4159,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>>                                  extent_num_bytes = ALIGN(new_size -
>>                                                  found_key.offset,
>>                                                  fs_info->sectorsize);
>> +                               clear_start = ALIGN(new_size,
>> +                                                   fs_info->sectorsize);
>>                                  btrfs_set_file_extent_num_bytes(leaf, fi,
>>                                                           extent_num_bytes);
>>                                  num_dec = (orig_num_bytes -
>> @@ -4166,6 +4186,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>>                                                  inode_sub_bytes(inode, num_dec);
>>                                  }
>>                          }
>> +                       clear_len = num_dec;
>>                  } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
>>                          /*
>>                           * we can't truncate inline items that have had
>> @@ -4187,12 +4208,34 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>>                                   */
>>                                  ret = NEED_TRUNCATE_BLOCK;
>>                                  break;
>> +                       } else {
>> +                               /*
>> +                                * Inline extents are special, we just treat
>> +                                * them as a full sector worth in the file
>> +                                * extent tree just for simplicity sake.
>> +                                */
>> +                               clear_len = fs_info->sectorsize;
>>                          }
>>
>>                          if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
>>                                  inode_sub_bytes(inode, item_end + 1 - new_size);
>>                  }
>>   delete:
>> +               /*
>> +                * We use btrfs_truncate_inode_items() to clean up log trees for
>> +                * multiple fsyncs, and in this case we don't want to clear the
>> +                * file extent range because it's just the log.
>> +                */
>> +               if (root == BTRFS_I(inode)->root) {
>> +                       ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
>> +                                                                 clear_start,
>> +                                                                 clear_len);
>> +                       if (ret) {
>> +                               btrfs_abort_transaction(trans, ret);
>> +                               break;
>> +                       }
>> +               }
>> +
>>                  if (del_item)
>>                          last_size = found_key.offset;
>>                  else
>> @@ -4513,14 +4556,22 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>>                  }
>>                  last_byte = min(extent_map_end(em), block_end);
>>                  last_byte = ALIGN(last_byte, fs_info->sectorsize);
>> +               hole_size = last_byte - cur_offset;
>> +
>>                  if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>                          struct extent_map *hole_em;
>> -                       hole_size = last_byte - cur_offset;
>>
>>                          err = maybe_insert_hole(root, inode, cur_offset,
>>                                                  hole_size);
>>                          if (err)
>>                                  break;
>> +
>> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
>> +                                                               cur_offset,
>> +                                                               hole_size);
>> +                       if (err)
>> +                               break;
>> +
>>                          btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
>>                                                  cur_offset + hole_size - 1, 0);
>>                          hole_em = alloc_extent_map();
>> @@ -4552,6 +4603,12 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>>                                                          hole_size - 1, 0);
>>                          }
>>                          free_extent_map(hole_em);
>> +               } else {
>> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
>> +                                                               cur_offset,
>> +                                                               hole_size);
>> +                       if (err)
>> +                               break;
>>                  }
>>   next:
>>                  free_extent_map(em);
>> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
>> index 80978ebfdcbb..56278a5b69e4 100644
>> --- a/fs/btrfs/tree-log.c
>> +++ b/fs/btrfs/tree-log.c
>> @@ -830,6 +830,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
>>                          goto out;
>>          }
>>
>> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
>> +                                               extent_end - start);
>> +       if (ret)
>> +               goto out;
> 
> So while working on making ranged fsyncs for the slow patch (inode has
> the 'need full sync' flag set), I uncovered more cases where we end up
> with missing file extent items.
> 
> When doing a fast fsync, we log only the extents in the given range,
> but we log an inode item with the current i_size of the inode.
> So after a log replay we can get missing file extent items.
> 

For this case I think we need to not just log that current range, we need to log 
anything that was changed leading up to that offset.  Range fsync is just an 
optimization, in the !NO_HOLES case we need to make sure we're still leaving the 
fs in a consistent state, so if we have any modified extents that lead up to our 
range those need to be logged as well.  Thanks,

Josef
Filipe Manana March 6, 2020, 3:01 p.m. UTC | #3
On Fri, Mar 6, 2020 at 2:52 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> On 3/6/20 6:51 AM, Filipe Manana wrote:
> > On Fri, Jan 17, 2020 at 2:03 PM Josef Bacik <josef@toxicpanda.com> wrote:
> >>
> >> We want to use this everywhere we modify the file extent items
> >> permanently.  These include
> >>
> >> 1) Inserting new file extents for writes and prealloc extents.
> >> 2) Truncating inode items.
> >> 3) btrfs_cont_expand().
> >> 4) Insert inline extents.
> >> 5) Insert new extents from log replay.
> >> 6) Insert a new extent for clone, as it could be past isize.
> >>
> >> We do not however call the clear helper for hole punching because it
> >> simply swaps out an existing file extent for a hole, so there's
> >> effectively no change as far as the i_size is concerned.
> >>
> >> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> >> Reviewed-by: Filipe Manana <fdmanana@suse.com>
> >> ---
> >>   fs/btrfs/delayed-inode.c |  4 +++
> >>   fs/btrfs/file.c          |  6 ++++
> >>   fs/btrfs/inode.c         | 59 +++++++++++++++++++++++++++++++++++++++-
> >>   fs/btrfs/tree-log.c      |  5 ++++
> >>   4 files changed, 73 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> >> index d3e15e1d4a91..8b4dcf4f6b3e 100644
> >> --- a/fs/btrfs/delayed-inode.c
> >> +++ b/fs/btrfs/delayed-inode.c
> >> @@ -1762,6 +1762,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
> >>   {
> >>          struct btrfs_delayed_node *delayed_node;
> >>          struct btrfs_inode_item *inode_item;
> >> +       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> >>
> >>          delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
> >>          if (!delayed_node)
> >> @@ -1779,6 +1780,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
> >>          i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
> >>          i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
> >>          btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
> >> +       btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
> >> +                                         round_up(i_size_read(inode),
> >> +                                                  fs_info->sectorsize));
> >>          inode->i_mode = btrfs_stack_inode_mode(inode_item);
> >>          set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
> >>          inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
> >> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> >> index c6f9029e3d49..f72fb38e9aaa 100644
> >> --- a/fs/btrfs/file.c
> >> +++ b/fs/btrfs/file.c
> >> @@ -2482,6 +2482,12 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
> >>          btrfs_mark_buffer_dirty(leaf);
> >>          btrfs_release_path(path);
> >>
> >> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> >> +                                               clone_info->file_offset,
> >> +                                               clone_len);
> >> +       if (ret)
> >> +               return ret;
> >> +
> >>          /* If it's a hole, nothing more needs to be done. */
> >>          if (clone_info->disk_offset == 0)
> >>                  return 0;
> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >> index fd8f821a3919..21fb80292256 100644
> >> --- a/fs/btrfs/inode.c
> >> +++ b/fs/btrfs/inode.c
> >> @@ -241,6 +241,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> >>          btrfs_mark_buffer_dirty(leaf);
> >>          btrfs_release_path(path);
> >>
> >> +       /*
> >> +        * We align size to sectorsize for inline extents just for simplicity
> >> +        * sake.
> >> +        */
> >> +       size = ALIGN(size, root->fs_info->sectorsize);
> >> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
> >> +       if (ret)
> >> +               goto fail;
> >> +
> >>          /*
> >>           * we're an inline extent, so nobody can
> >>           * extend the file past i_size without locking
> >> @@ -2375,6 +2384,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
> >>          ins.offset = disk_num_bytes;
> >>          ins.type = BTRFS_EXTENT_ITEM_KEY;
> >>
> >> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
> >> +                                               ram_bytes);
> >> +       if (ret)
> >> +               goto out;
> >> +
> >>          /*
> >>           * Release the reserved range from inode dirty range map, as it is
> >>           * already moved into delayed_ref_head
> >> @@ -4084,6 +4098,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> >>          }
> >>
> >>          while (1) {
> >> +               u64 clear_start = 0, clear_len = 0;
> >> +
> >>                  fi = NULL;
> >>                  leaf = path->nodes[0];
> >>                  btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> >> @@ -4134,6 +4150,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> >>
> >>                  if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
> >>                          u64 num_dec;
> >> +
> >> +                       clear_start = found_key.offset;
> >>                          extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
> >>                          if (!del_item) {
> >>                                  u64 orig_num_bytes =
> >> @@ -4141,6 +4159,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> >>                                  extent_num_bytes = ALIGN(new_size -
> >>                                                  found_key.offset,
> >>                                                  fs_info->sectorsize);
> >> +                               clear_start = ALIGN(new_size,
> >> +                                                   fs_info->sectorsize);
> >>                                  btrfs_set_file_extent_num_bytes(leaf, fi,
> >>                                                           extent_num_bytes);
> >>                                  num_dec = (orig_num_bytes -
> >> @@ -4166,6 +4186,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> >>                                                  inode_sub_bytes(inode, num_dec);
> >>                                  }
> >>                          }
> >> +                       clear_len = num_dec;
> >>                  } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
> >>                          /*
> >>                           * we can't truncate inline items that have had
> >> @@ -4187,12 +4208,34 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> >>                                   */
> >>                                  ret = NEED_TRUNCATE_BLOCK;
> >>                                  break;
> >> +                       } else {
> >> +                               /*
> >> +                                * Inline extents are special, we just treat
> >> +                                * them as a full sector worth in the file
> >> +                                * extent tree just for simplicity sake.
> >> +                                */
> >> +                               clear_len = fs_info->sectorsize;
> >>                          }
> >>
> >>                          if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
> >>                                  inode_sub_bytes(inode, item_end + 1 - new_size);
> >>                  }
> >>   delete:
> >> +               /*
> >> +                * We use btrfs_truncate_inode_items() to clean up log trees for
> >> +                * multiple fsyncs, and in this case we don't want to clear the
> >> +                * file extent range because it's just the log.
> >> +                */
> >> +               if (root == BTRFS_I(inode)->root) {
> >> +                       ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
> >> +                                                                 clear_start,
> >> +                                                                 clear_len);
> >> +                       if (ret) {
> >> +                               btrfs_abort_transaction(trans, ret);
> >> +                               break;
> >> +                       }
> >> +               }
> >> +
> >>                  if (del_item)
> >>                          last_size = found_key.offset;
> >>                  else
> >> @@ -4513,14 +4556,22 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
> >>                  }
> >>                  last_byte = min(extent_map_end(em), block_end);
> >>                  last_byte = ALIGN(last_byte, fs_info->sectorsize);
> >> +               hole_size = last_byte - cur_offset;
> >> +
> >>                  if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> >>                          struct extent_map *hole_em;
> >> -                       hole_size = last_byte - cur_offset;
> >>
> >>                          err = maybe_insert_hole(root, inode, cur_offset,
> >>                                                  hole_size);
> >>                          if (err)
> >>                                  break;
> >> +
> >> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> >> +                                                               cur_offset,
> >> +                                                               hole_size);
> >> +                       if (err)
> >> +                               break;
> >> +
> >>                          btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
> >>                                                  cur_offset + hole_size - 1, 0);
> >>                          hole_em = alloc_extent_map();
> >> @@ -4552,6 +4603,12 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
> >>                                                          hole_size - 1, 0);
> >>                          }
> >>                          free_extent_map(hole_em);
> >> +               } else {
> >> +                       err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
> >> +                                                               cur_offset,
> >> +                                                               hole_size);
> >> +                       if (err)
> >> +                               break;
> >>                  }
> >>   next:
> >>                  free_extent_map(em);
> >> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> >> index 80978ebfdcbb..56278a5b69e4 100644
> >> --- a/fs/btrfs/tree-log.c
> >> +++ b/fs/btrfs/tree-log.c
> >> @@ -830,6 +830,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
> >>                          goto out;
> >>          }
> >>
> >> +       ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
> >> +                                               extent_end - start);
> >> +       if (ret)
> >> +               goto out;
> >
> > So while working on making ranged fsyncs for the slow patch (inode has
> > the 'need full sync' flag set), I uncovered more cases where we end up
> > with missing file extent items.
> >
> > When doing a fast fsync, we log only the extents in the given range,
> > but we log an inode item with the current i_size of the inode.
> > So after a log replay we can get missing file extent items.
> >
>
> For this case I think we need to not just log that current range, we need to log
> anything that was changed leading up to that offset.  Range fsync is just an
> optimization, in the !NO_HOLES case we need to make sure we're still leaving the
> fs in a consistent state, so if we have any modified extents that lead up to our
> range those need to be logged as well.  Thanks,

Sounds reasonable, I don't any efficient way in mind to do it either.
So for any type of fsync (full or fast) always reset the start offset
to 0 when not using NO_HOLES, and leave the end offset alone.
I'll introduce a patch in my patchset for that and send the test case.

Thanks.

>
> Josef
diff mbox series

Patch

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3e15e1d4a91..8b4dcf4f6b3e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1762,6 +1762,7 @@  int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 {
 	struct btrfs_delayed_node *delayed_node;
 	struct btrfs_inode_item *inode_item;
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 
 	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
 	if (!delayed_node)
@@ -1779,6 +1780,9 @@  int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
 	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
 	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+					  round_up(i_size_read(inode),
+						   fs_info->sectorsize));
 	inode->i_mode = btrfs_stack_inode_mode(inode_item);
 	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
 	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c6f9029e3d49..f72fb38e9aaa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2482,6 +2482,12 @@  static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+						clone_info->file_offset,
+						clone_len);
+	if (ret)
+		return ret;
+
 	/* If it's a hole, nothing more needs to be done. */
 	if (clone_info->disk_offset == 0)
 		return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd8f821a3919..21fb80292256 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -241,6 +241,15 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
+	/*
+	 * We align size to sectorsize for inline extents just for simplicity
+	 * sake.
+	 */
+	size = ALIGN(size, root->fs_info->sectorsize);
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+	if (ret)
+		goto fail;
+
 	/*
 	 * we're an inline extent, so nobody can
 	 * extend the file past i_size without locking
@@ -2375,6 +2384,11 @@  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
+						ram_bytes);
+	if (ret)
+		goto out;
+
 	/*
 	 * Release the reserved range from inode dirty range map, as it is
 	 * already moved into delayed_ref_head
@@ -4084,6 +4098,8 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	}
 
 	while (1) {
+		u64 clear_start = 0, clear_len = 0;
+
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4134,6 +4150,8 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
+
+			clear_start = found_key.offset;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
 				u64 orig_num_bytes =
@@ -4141,6 +4159,8 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				extent_num_bytes = ALIGN(new_size -
 						found_key.offset,
 						fs_info->sectorsize);
+				clear_start = ALIGN(new_size,
+						    fs_info->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
@@ -4166,6 +4186,7 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 						inode_sub_bytes(inode, num_dec);
 				}
 			}
+			clear_len = num_dec;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			/*
 			 * we can't truncate inline items that have had
@@ -4187,12 +4208,34 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				 */
 				ret = NEED_TRUNCATE_BLOCK;
 				break;
+			} else {
+				/*
+				 * Inline extents are special, we just treat
+				 * them as a full sector worth in the file
+				 * extent tree just for simplicity sake.
+				 */
+				clear_len = fs_info->sectorsize;
 			}
 
 			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 				inode_sub_bytes(inode, item_end + 1 - new_size);
 		}
 delete:
+		/*
+		 * We use btrfs_truncate_inode_items() to clean up log trees for
+		 * multiple fsyncs, and in this case we don't want to clear the
+		 * file extent range because it's just the log.
+		 */
+		if (root == BTRFS_I(inode)->root) {
+			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+								  clear_start,
+								  clear_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+		}
+
 		if (del_item)
 			last_size = found_key.offset;
 		else
@@ -4513,14 +4556,22 @@  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		}
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
+		hole_size = last_byte - cur_offset;
+
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
-			hole_size = last_byte - cur_offset;
 
 			err = maybe_insert_hole(root, inode, cur_offset,
 						hole_size);
 			if (err)
 				break;
+
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+								cur_offset,
+								hole_size);
+			if (err)
+				break;
+
 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
 						cur_offset + hole_size - 1, 0);
 			hole_em = alloc_extent_map();
@@ -4552,6 +4603,12 @@  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 							hole_size - 1, 0);
 			}
 			free_extent_map(hole_em);
+		} else {
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+								cur_offset,
+								hole_size);
+			if (err)
+				break;
 		}
 next:
 		free_extent_map(em);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 80978ebfdcbb..56278a5b69e4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -830,6 +830,11 @@  static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			goto out;
 	}
 
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
+						extent_end - start);
+	if (ret)
+		goto out;
+
 	inode_add_bytes(inode, nbytes);
 update_inode:
 	ret = btrfs_update_inode(trans, root, inode);