diff mbox

[v3] btrfs: fiemap: Cache and merge fiemap extent before submit it to user

Message ID 20170407024315.30685-1-quwenruo@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Qu Wenruo April 7, 2017, 2:43 a.m. UTC
[BUG]
Cycle mount btrfs can cause fiemap to return different result.
Like:
 # mount /dev/vdb5 /mnt/btrfs
 # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
 # xfs_io -c "fiemap -v" /mnt/btrfs/file
 /mnt/test/file:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..127]:        25088..25215       128   0x1
 # umount /mnt/btrfs
 # mount /dev/vdb5 /mnt/btrfs
 # xfs_io -c "fiemap -v" /mnt/btrfs/file
 /mnt/test/file:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..31]:         25088..25119        32   0x0
   1: [32..63]:        25120..25151        32   0x0
   2: [64..95]:        25152..25183        32   0x0
   3: [96..127]:       25184..25215        32   0x1
But after above fiemap, we get correct merged result if we call fiemap
again.
 # xfs_io -c "fiemap -v" /mnt/btrfs/file
 /mnt/test/file:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..127]:        25088..25215       128   0x1

[REASON]
Btrfs will try to merge extent map when inserting new extent map.

btrfs_fiemap(start=0 len=(u64)-1)
|- extent_fiemap(start=0 len=(u64)-1)
   |- get_extent_skip_holes(start=0 len=64k)
   |  |- btrfs_get_extent_fiemap(start=0 len=64k)
   |     |- btrfs_get_extent(start=0 len=64k)
   |        |  Found on-disk (ino, EXTENT_DATA, 0)
   |        |- add_extent_mapping()
   |        |- Return (em->start=0, len=16k)
   |
   |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
   |
   |- get_extent_skip_holes(start=0 len=64k)
   |  |- btrfs_get_extent_fiemap(start=0 len=64k)
   |     |- btrfs_get_extent(start=16k len=48k)
   |        |  Found on-disk (ino, EXTENT_DATA, 16k)
   |        |- add_extent_mapping()
   |        |  |- try_merge_map()
   |        |     Merge with previous em start=0 len=16k
   |        |     resulting em start=0 len=32k
   |        |- Return (em->start=0, len=32K)    << Merged result
   |- Stripe off the unrelated range (0~16K) of return em
   |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
      ^^^ Causing split fiemap extent.

And since in add_extent_mapping(), em is already merged, in next
fiemap() call, we will get merged result.

[FIX]
Here we introduce a new structure, fiemap_cache, which records previous
fiemap extent.

And will always try to merge current fiemap_cache result before calling
fiemap_fill_next_extent().
Only when we failed to merge current fiemap extent with cached one, we
will call fiemap_fill_next_extent() to submit cached one.

So by this method, we can merge all fiemap extents.

It can also be done in fs/ioctl.c, however the problem is if
fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
extent.
So I choose to merge it in btrfs.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
v2:
  Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
  that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
  cause kernel warning if we fiemap is called on large compressed file.
v3:
  Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
  submit_fiemap_extent() to submit fiemap cache, so it just acts as a
  sanity check.
  Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
  extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
  Don't do backward jump, suggested by David.
  Better sanity check and recoverable fix.

To David:
  What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
  BTRFS_CONFIG_DEBUG is specified for recoverable bug?

  And modify ASSERT() to always WARN_ON() and exit error code?
---
 fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 2 deletions(-)

Comments

David Sterba April 12, 2017, 3:05 p.m. UTC | #1
On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
> [BUG]
> Cycle mount btrfs can cause fiemap to return different result.
> Like:
>  # mount /dev/vdb5 /mnt/btrfs
>  # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..127]:        25088..25215       128   0x1
>  # umount /mnt/btrfs
>  # mount /dev/vdb5 /mnt/btrfs
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..31]:         25088..25119        32   0x0
>    1: [32..63]:        25120..25151        32   0x0
>    2: [64..95]:        25152..25183        32   0x0
>    3: [96..127]:       25184..25215        32   0x1
> But after above fiemap, we get correct merged result if we call fiemap
> again.
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..127]:        25088..25215       128   0x1
> 
> [REASON]
> Btrfs will try to merge extent map when inserting new extent map.
> 
> btrfs_fiemap(start=0 len=(u64)-1)
> |- extent_fiemap(start=0 len=(u64)-1)
>    |- get_extent_skip_holes(start=0 len=64k)
>    |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>    |     |- btrfs_get_extent(start=0 len=64k)
>    |        |  Found on-disk (ino, EXTENT_DATA, 0)
>    |        |- add_extent_mapping()
>    |        |- Return (em->start=0, len=16k)
>    |
>    |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>    |
>    |- get_extent_skip_holes(start=0 len=64k)
>    |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>    |     |- btrfs_get_extent(start=16k len=48k)
>    |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>    |        |- add_extent_mapping()
>    |        |  |- try_merge_map()
>    |        |     Merge with previous em start=0 len=16k
>    |        |     resulting em start=0 len=32k
>    |        |- Return (em->start=0, len=32K)    << Merged result
>    |- Stripe off the unrelated range (0~16K) of return em
>    |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>       ^^^ Causing split fiemap extent.
> 
> And since in add_extent_mapping(), em is already merged, in next
> fiemap() call, we will get merged result.
> 
> [FIX]
> Here we introduce a new structure, fiemap_cache, which records previous
> fiemap extent.
> 
> And will always try to merge current fiemap_cache result before calling
> fiemap_fill_next_extent().
> Only when we failed to merge current fiemap extent with cached one, we
> will call fiemap_fill_next_extent() to submit cached one.
> 
> So by this method, we can merge all fiemap extents.

The cache gets reset on each call to extent_fiemap, so if fi_extents_max
is 1, the cache will be always unset and we'll never merge anything. The
same can happen if the number of extents reaches the limit
(FIEMAP_MAX_EXTENTS or any other depending on the ioctl caller). And
this leads to the unmerged extents.

> It can also be done in fs/ioctl.c, however the problem is if
> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
> extent.

I don't see why, it's the same code path, no?

> So I choose to merge it in btrfs.

Lifting that to the vfs interface is probably not the right approach.
The ioctl has never done any postprocessing of the data returned by
filesystems, it's really up to the filesystem to prepare the data.

> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
> v2:
>   Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
>   that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
>   cause kernel warning if we fiemap is called on large compressed file.
> v3:
>   Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
>   submit_fiemap_extent() to submit fiemap cache, so it just acts as a
>   sanity check.
>   Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
>   extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
>   Don't do backward jump, suggested by David.
>   Better sanity check and recoverable fix.
> 
> To David:
>   What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
>   BTRFS_CONFIG_DEBUG is specified for recoverable bug?
> 
>   And modify ASSERT() to always WARN_ON() and exit error code?

That's for a separate discussion.

> ---
>  fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 122 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 28e8192..c4cb65d 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
>  	return NULL;
>  }
>  
> +/*
> + * To cache previous fiemap extent
> + *
> + * Will be used for merging fiemap extent
> + */
> +struct fiemap_cache {
> +	u64 offset;
> +	u64 phys;
> +	u64 len;
> +	u32 flags;
> +	bool cached;
> +};
> +
> +/*
> + * Helper to submit fiemap extent.
> + *
> + * Will try to merge current fiemap extent specified by @offset, @phys,
> + * @len and @flags with cached one.
> + * And only when we fails to merge, cached one will be submitted as
> + * fiemap extent.
> + *
> + * Return value is the same as fiemap_fill_next_extent().
> + */
> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,

I'd suggest to rename it to emmit_fiemap_extent

> +				struct fiemap_cache *cache,
> +				u64 offset, u64 phys, u64 len, u32 flags)
> +{
> +	int ret = 0;
> +
> +	if (!cache->cached)
> +		goto assign;
> +
> +	/*
> +	 * Sanity check, extent_fiemap() should have ensured that new
> +	 * fiemap extent won't overlap with cahced one.
> +	 * Not recoverable.
> +	 *
> +	 * NOTE: Physical address can overlap, due to compression
> +	 */
> +	if (cache->offset + cache->len > offset) {
> +		WARN_ON(1);
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Only merges fiemap extents if
> +	 * 1) Their logical addresses are continuous
> +	 *
> +	 * 2) Their physical addresses are continuous
> +	 *    So truly compressed (physical size smaller than logical size)
> +	 *    extents won't get merged with each other
> +	 *
> +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
> +	 *    So regular extent won't get merged with prealloc extent
> +	 */
> +	if (cache->offset + cache->len  == offset &&
> +	    cache->phys + cache->len == phys  &&
> +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
> +			(flags & ~FIEMAP_EXTENT_LAST)) {
> +		cache->len += len;
> +		cache->flags |= flags;
> +		goto try_submit_last;
> +	}
> +
> +	/* Not mergeable, need to submit cached one */
> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> +				      cache->len, cache->flags);
> +	cache->cached = false;
> +	if (ret)
> +		return ret;
> +assign:
> +	cache->cached = true;
> +	cache->offset = offset;
> +	cache->phys = phys;
> +	cache->len = len;
> +	cache->flags = flags;
> +try_submit_last:
> +	if (cache->flags & FIEMAP_EXTENT_LAST) {
> +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
> +				cache->phys, cache->len, cache->flags);
> +		cache->cached = false;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Sanity check for fiemap cache
> + *
> + * All fiemap cache should be submitted by submit_fiemap_extent()
> + * Iteration should be terminated either by last fiemap extent or
> + * fieinfo->fi_extents_max.
> + * So no cached fiemap should exist.
> + */
> +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
> +			       struct fiemap_extent_info *fieinfo,
> +			       struct fiemap_cache *cache)
> +{
> +	int ret;
> +
> +	if (!cache->cached)
> +		return 0;
> +
> +	/* Small and recoverbale problem, only to info developer */
> +#ifdef CONFIG_BTRFS_DEBUG
> +	WARN_ON(1);
> +#endif
> +	btrfs_warn(fs_info,
> +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
> +		   cache->offset, cache->phys, cache->len, cache->flags);
> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> +				      cache->len, cache->flags);
> +	cache->cached = false;
> +	if (ret > 0)
> +		ret = 0;
> +	return ret;
> +}
> +
>  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		__u64 start, __u64 len, get_extent_t *get_extent)
>  {
> @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  	struct extent_state *cached_state = NULL;
>  	struct btrfs_path *path;
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct fiemap_cache cache = { 0 };
>  	int end = 0;
>  	u64 em_start = 0;
>  	u64 em_len = 0;
> @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  			flags |= FIEMAP_EXTENT_LAST;
>  			end = 1;
>  		}
> -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> -					      em_len, flags);
> +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
> +					   em_len, flags);
>  		if (ret) {
>  			if (ret == 1)
>  				ret = 0;
> @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		}
>  	}
>  out_free:
> +	if (!ret)
> +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
>  	free_extent_map(em);
>  out:
>  	btrfs_free_path(path);
> -- 
> 2.9.3
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo April 13, 2017, 12:36 a.m. UTC | #2
At 04/12/2017 11:05 PM, David Sterba wrote:
> On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>> [BUG]
>> Cycle mount btrfs can cause fiemap to return different result.
>> Like:
>>   # mount /dev/vdb5 /mnt/btrfs
>>   # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..127]:        25088..25215       128   0x1
>>   # umount /mnt/btrfs
>>   # mount /dev/vdb5 /mnt/btrfs
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..31]:         25088..25119        32   0x0
>>     1: [32..63]:        25120..25151        32   0x0
>>     2: [64..95]:        25152..25183        32   0x0
>>     3: [96..127]:       25184..25215        32   0x1
>> But after above fiemap, we get correct merged result if we call fiemap
>> again.
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..127]:        25088..25215       128   0x1
>>
>> [REASON]
>> Btrfs will try to merge extent map when inserting new extent map.
>>
>> btrfs_fiemap(start=0 len=(u64)-1)
>> |- extent_fiemap(start=0 len=(u64)-1)
>>     |- get_extent_skip_holes(start=0 len=64k)
>>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>     |     |- btrfs_get_extent(start=0 len=64k)
>>     |        |  Found on-disk (ino, EXTENT_DATA, 0)
>>     |        |- add_extent_mapping()
>>     |        |- Return (em->start=0, len=16k)
>>     |
>>     |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>>     |
>>     |- get_extent_skip_holes(start=0 len=64k)
>>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>     |     |- btrfs_get_extent(start=16k len=48k)
>>     |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>>     |        |- add_extent_mapping()
>>     |        |  |- try_merge_map()
>>     |        |     Merge with previous em start=0 len=16k
>>     |        |     resulting em start=0 len=32k
>>     |        |- Return (em->start=0, len=32K)    << Merged result
>>     |- Stripe off the unrelated range (0~16K) of return em
>>     |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>>        ^^^ Causing split fiemap extent.
>>
>> And since in add_extent_mapping(), em is already merged, in next
>> fiemap() call, we will get merged result.
>>
>> [FIX]
>> Here we introduce a new structure, fiemap_cache, which records previous
>> fiemap extent.
>>
>> And will always try to merge current fiemap_cache result before calling
>> fiemap_fill_next_extent().
>> Only when we failed to merge current fiemap extent with cached one, we
>> will call fiemap_fill_next_extent() to submit cached one.
>>
>> So by this method, we can merge all fiemap extents.
> 
> The cache gets reset on each call to extent_fiemap, so if fi_extents_max
> is 1, the cache will be always unset and we'll never merge anything. The
> same can happen if the number of extents reaches the limit
> (FIEMAP_MAX_EXTENTS or any other depending on the ioctl caller). And
> this leads to the unmerged extents.

Nope, extents will still be merged, as long as they can be merged.

The fiemap extent is only submitted if we found an unmergeable extent.

Even fi_extents_max is 1, it still possible for us to merge extents.

File A:
Extent 1: offset=0 len=4k phys=X
Extent 2: offset=4k len=4k phys=X+4
Extent 3: offset=8k len=4k phys=Y

1) Found Extent 1
    Cache it, not submitted yet.
2) Found Extent 2
    Merge it with cached one, not submitted yet.
3) Found Extent 3
    Can't merge, submit cached first.
    Submitted one reach fi_extents_max, exit current extent_fiemap.

4) Next fiemap call starts from offset 8K,
    Extent 3 is the last extent, no need to cache just submit.

So we still got merged fiemap extent, without anything wrong.

The point is, fi_extents_max or other limit can only be merged when we 
submit fiemap_extent, in that case either we found unmergable extent, or 
we already hit the last extent.

> 
>> It can also be done in fs/ioctl.c, however the problem is if
>> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
>> extent.
> 
> I don't see why, it's the same code path, no?

My original design in VFS is to check if we can merge current fiemap 
extent with the last one in fiemap_info.

But for fi_extents_max == 0 case, fiemap_info doesn't store any extent 
so that's not possible.


So for fi_extents_max == 0 case, either do it in each fs like what we 
are doing, or introduce a new function like fiemap_cache_next_extent() 
with reference to cached structure.

> 
>> So I choose to merge it in btrfs.
> 
> Lifting that to the vfs interface is probably not the right approach.
> The ioctl has never done any postprocessing of the data returned by
> filesystems, it's really up to the filesystem to prepare the data.

OK, let's keep it in btrfs.

> 
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>> v2:
>>    Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
>>    that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
>>    cause kernel warning if we fiemap is called on large compressed file.
>> v3:
>>    Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
>>    submit_fiemap_extent() to submit fiemap cache, so it just acts as a
>>    sanity check.
>>    Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
>>    extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
>>    Don't do backward jump, suggested by David.
>>    Better sanity check and recoverable fix.
>>
>> To David:
>>    What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
>>    BTRFS_CONFIG_DEBUG is specified for recoverable bug?
>>
>>    And modify ASSERT() to always WARN_ON() and exit error code?
> 
> That's for a separate discussion.
> 
>> ---
>>   fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 122 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 28e8192..c4cb65d 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
>>   	return NULL;
>>   }
>>   
>> +/*
>> + * To cache previous fiemap extent
>> + *
>> + * Will be used for merging fiemap extent
>> + */
>> +struct fiemap_cache {
>> +	u64 offset;
>> +	u64 phys;
>> +	u64 len;
>> +	u32 flags;
>> +	bool cached;
>> +};
>> +
>> +/*
>> + * Helper to submit fiemap extent.
>> + *
>> + * Will try to merge current fiemap extent specified by @offset, @phys,
>> + * @len and @flags with cached one.
>> + * And only when we fails to merge, cached one will be submitted as
>> + * fiemap extent.
>> + *
>> + * Return value is the same as fiemap_fill_next_extent().
>> + */
>> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
> 
> I'd suggest to rename it to emmit_fiemap_extent

I'm OK to change it in next version.

Thanks,
Qu

> 
>> +				struct fiemap_cache *cache,
>> +				u64 offset, u64 phys, u64 len, u32 flags)
>> +{
>> +	int ret = 0;
>> +
>> +	if (!cache->cached)
>> +		goto assign;
>> +
>> +	/*
>> +	 * Sanity check, extent_fiemap() should have ensured that new
>> +	 * fiemap extent won't overlap with cahced one.
>> +	 * Not recoverable.
>> +	 *
>> +	 * NOTE: Physical address can overlap, due to compression
>> +	 */
>> +	if (cache->offset + cache->len > offset) {
>> +		WARN_ON(1);
>> +		return -EINVAL;
>> +	}
>> +
>> +	/*
>> +	 * Only merges fiemap extents if
>> +	 * 1) Their logical addresses are continuous
>> +	 *
>> +	 * 2) Their physical addresses are continuous
>> +	 *    So truly compressed (physical size smaller than logical size)
>> +	 *    extents won't get merged with each other
>> +	 *
>> +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
>> +	 *    So regular extent won't get merged with prealloc extent
>> +	 */
>> +	if (cache->offset + cache->len  == offset &&
>> +	    cache->phys + cache->len == phys  &&
>> +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
>> +			(flags & ~FIEMAP_EXTENT_LAST)) {
>> +		cache->len += len;
>> +		cache->flags |= flags;
>> +		goto try_submit_last;
>> +	}
>> +
>> +	/* Not mergeable, need to submit cached one */
>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>> +				      cache->len, cache->flags);
>> +	cache->cached = false;
>> +	if (ret)
>> +		return ret;
>> +assign:
>> +	cache->cached = true;
>> +	cache->offset = offset;
>> +	cache->phys = phys;
>> +	cache->len = len;
>> +	cache->flags = flags;
>> +try_submit_last:
>> +	if (cache->flags & FIEMAP_EXTENT_LAST) {
>> +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
>> +				cache->phys, cache->len, cache->flags);
>> +		cache->cached = false;
>> +	}
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Sanity check for fiemap cache
>> + *
>> + * All fiemap cache should be submitted by submit_fiemap_extent()
>> + * Iteration should be terminated either by last fiemap extent or
>> + * fieinfo->fi_extents_max.
>> + * So no cached fiemap should exist.
>> + */
>> +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
>> +			       struct fiemap_extent_info *fieinfo,
>> +			       struct fiemap_cache *cache)
>> +{
>> +	int ret;
>> +
>> +	if (!cache->cached)
>> +		return 0;
>> +
>> +	/* Small and recoverbale problem, only to info developer */
>> +#ifdef CONFIG_BTRFS_DEBUG
>> +	WARN_ON(1);
>> +#endif
>> +	btrfs_warn(fs_info,
>> +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
>> +		   cache->offset, cache->phys, cache->len, cache->flags);
>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>> +				      cache->len, cache->flags);
>> +	cache->cached = false;
>> +	if (ret > 0)
>> +		ret = 0;
>> +	return ret;
>> +}
>> +
>>   int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   		__u64 start, __u64 len, get_extent_t *get_extent)
>>   {
>> @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   	struct extent_state *cached_state = NULL;
>>   	struct btrfs_path *path;
>>   	struct btrfs_root *root = BTRFS_I(inode)->root;
>> +	struct fiemap_cache cache = { 0 };
>>   	int end = 0;
>>   	u64 em_start = 0;
>>   	u64 em_len = 0;
>> @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   			flags |= FIEMAP_EXTENT_LAST;
>>   			end = 1;
>>   		}
>> -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
>> -					      em_len, flags);
>> +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
>> +					   em_len, flags);
>>   		if (ret) {
>>   			if (ret == 1)
>>   				ret = 0;
>> @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   		}
>>   	}
>>   out_free:
>> +	if (!ret)
>> +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
>>   	free_extent_map(em);
>>   out:
>>   	btrfs_free_path(path);
>> -- 
>> 2.9.3
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 20, 2017, 1:25 a.m. UTC | #3
On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
> [BUG]
> Cycle mount btrfs can cause fiemap to return different result.
> Like:
>  # mount /dev/vdb5 /mnt/btrfs
>  # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..127]:        25088..25215       128   0x1
>  # umount /mnt/btrfs
>  # mount /dev/vdb5 /mnt/btrfs
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..31]:         25088..25119        32   0x0
>    1: [32..63]:        25120..25151        32   0x0
>    2: [64..95]:        25152..25183        32   0x0
>    3: [96..127]:       25184..25215        32   0x1
> But after above fiemap, we get correct merged result if we call fiemap
> again.
>  # xfs_io -c "fiemap -v" /mnt/btrfs/file
>  /mnt/test/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>    0: [0..127]:        25088..25215       128   0x1
> 
> [REASON]
> Btrfs will try to merge extent map when inserting new extent map.
> 
> btrfs_fiemap(start=0 len=(u64)-1)
> |- extent_fiemap(start=0 len=(u64)-1)
>    |- get_extent_skip_holes(start=0 len=64k)
>    |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>    |     |- btrfs_get_extent(start=0 len=64k)
>    |        |  Found on-disk (ino, EXTENT_DATA, 0)
>    |        |- add_extent_mapping()
>    |        |- Return (em->start=0, len=16k)
>    |
>    |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>    |
>    |- get_extent_skip_holes(start=0 len=64k)
>    |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>    |     |- btrfs_get_extent(start=16k len=48k)
>    |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>    |        |- add_extent_mapping()
>    |        |  |- try_merge_map()
>    |        |     Merge with previous em start=0 len=16k
>    |        |     resulting em start=0 len=32k
>    |        |- Return (em->start=0, len=32K)    << Merged result
>    |- Stripe off the unrelated range (0~16K) of return em
>    |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>       ^^^ Causing split fiemap extent.
> 
> And since in add_extent_mapping(), em is already merged, in next
> fiemap() call, we will get merged result.
> 
> [FIX]
> Here we introduce a new structure, fiemap_cache, which records previous
> fiemap extent.
> 
> And will always try to merge current fiemap_cache result before calling
> fiemap_fill_next_extent().
> Only when we failed to merge current fiemap extent with cached one, we
> will call fiemap_fill_next_extent() to submit cached one.
> 
> So by this method, we can merge all fiemap extents.
>

If I understand it correctly, what it's missing currently is 'merge', a
straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent.

Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
range passing to it or just one more extent map to check if btrfs_get_extent
could return a merged extent map before returning?

If no extent map could be merged, which is the worst case, the first
btrfs_get_extent_fiemap will read file metadata into memory from disk and the
following btrfs_get_extent_fiemap will read the rest of file metadata from
the in-memory extent map tree directly.

Thanks,

-liubo

> It can also be done in fs/ioctl.c, however the problem is if
> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
> extent.
> So I choose to merge it in btrfs.
> 
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
> v2:
>   Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
>   that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
>   cause kernel warning if we fiemap is called on large compressed file.
> v3:
>   Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
>   submit_fiemap_extent() to submit fiemap cache, so it just acts as a
>   sanity check.
>   Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
>   extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
>   Don't do backward jump, suggested by David.
>   Better sanity check and recoverable fix.
> 
> To David:
>   What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
>   BTRFS_CONFIG_DEBUG is specified for recoverable bug?
> 
>   And modify ASSERT() to always WARN_ON() and exit error code?
> ---
>  fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 122 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 28e8192..c4cb65d 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
>  	return NULL;
>  }
>  
> +/*
> + * To cache previous fiemap extent
> + *
> + * Will be used for merging fiemap extent
> + */
> +struct fiemap_cache {
> +	u64 offset;
> +	u64 phys;
> +	u64 len;
> +	u32 flags;
> +	bool cached;
> +};
> +
> +/*
> + * Helper to submit fiemap extent.
> + *
> + * Will try to merge current fiemap extent specified by @offset, @phys,
> + * @len and @flags with cached one.
> + * And only when we fails to merge, cached one will be submitted as
> + * fiemap extent.
> + *
> + * Return value is the same as fiemap_fill_next_extent().
> + */
> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
> +				struct fiemap_cache *cache,
> +				u64 offset, u64 phys, u64 len, u32 flags)
> +{
> +	int ret = 0;
> +
> +	if (!cache->cached)
> +		goto assign;
> +
> +	/*
> +	 * Sanity check, extent_fiemap() should have ensured that new
> +	 * fiemap extent won't overlap with cahced one.
> +	 * Not recoverable.
> +	 *
> +	 * NOTE: Physical address can overlap, due to compression
> +	 */
> +	if (cache->offset + cache->len > offset) {
> +		WARN_ON(1);
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Only merges fiemap extents if
> +	 * 1) Their logical addresses are continuous
> +	 *
> +	 * 2) Their physical addresses are continuous
> +	 *    So truly compressed (physical size smaller than logical size)
> +	 *    extents won't get merged with each other
> +	 *
> +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
> +	 *    So regular extent won't get merged with prealloc extent
> +	 */
> +	if (cache->offset + cache->len  == offset &&
> +	    cache->phys + cache->len == phys  &&
> +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
> +			(flags & ~FIEMAP_EXTENT_LAST)) {
> +		cache->len += len;
> +		cache->flags |= flags;
> +		goto try_submit_last;
> +	}
> +
> +	/* Not mergeable, need to submit cached one */
> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> +				      cache->len, cache->flags);
> +	cache->cached = false;
> +	if (ret)
> +		return ret;
> +assign:
> +	cache->cached = true;
> +	cache->offset = offset;
> +	cache->phys = phys;
> +	cache->len = len;
> +	cache->flags = flags;
> +try_submit_last:
> +	if (cache->flags & FIEMAP_EXTENT_LAST) {
> +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
> +				cache->phys, cache->len, cache->flags);
> +		cache->cached = false;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Sanity check for fiemap cache
> + *
> + * All fiemap cache should be submitted by submit_fiemap_extent()
> + * Iteration should be terminated either by last fiemap extent or
> + * fieinfo->fi_extents_max.
> + * So no cached fiemap should exist.
> + */
> +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
> +			       struct fiemap_extent_info *fieinfo,
> +			       struct fiemap_cache *cache)
> +{
> +	int ret;
> +
> +	if (!cache->cached)
> +		return 0;
> +
> +	/* Small and recoverbale problem, only to info developer */
> +#ifdef CONFIG_BTRFS_DEBUG
> +	WARN_ON(1);
> +#endif
> +	btrfs_warn(fs_info,
> +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
> +		   cache->offset, cache->phys, cache->len, cache->flags);
> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> +				      cache->len, cache->flags);
> +	cache->cached = false;
> +	if (ret > 0)
> +		ret = 0;
> +	return ret;
> +}
> +
>  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		__u64 start, __u64 len, get_extent_t *get_extent)
>  {
> @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  	struct extent_state *cached_state = NULL;
>  	struct btrfs_path *path;
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct fiemap_cache cache = { 0 };
>  	int end = 0;
>  	u64 em_start = 0;
>  	u64 em_len = 0;
> @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  			flags |= FIEMAP_EXTENT_LAST;
>  			end = 1;
>  		}
> -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> -					      em_len, flags);
> +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
> +					   em_len, flags);
>  		if (ret) {
>  			if (ret == 1)
>  				ret = 0;
> @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		}
>  	}
>  out_free:
> +	if (!ret)
> +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
>  	free_extent_map(em);
>  out:
>  	btrfs_free_path(path);
> -- 
> 2.9.3
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo April 20, 2017, 1:52 a.m. UTC | #4
At 04/20/2017 09:25 AM, Liu Bo wrote:
> On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>> [BUG]
>> Cycle mount btrfs can cause fiemap to return different result.
>> Like:
>>   # mount /dev/vdb5 /mnt/btrfs
>>   # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..127]:        25088..25215       128   0x1
>>   # umount /mnt/btrfs
>>   # mount /dev/vdb5 /mnt/btrfs
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..31]:         25088..25119        32   0x0
>>     1: [32..63]:        25120..25151        32   0x0
>>     2: [64..95]:        25152..25183        32   0x0
>>     3: [96..127]:       25184..25215        32   0x1
>> But after above fiemap, we get correct merged result if we call fiemap
>> again.
>>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>   /mnt/test/file:
>>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>     0: [0..127]:        25088..25215       128   0x1
>>
>> [REASON]
>> Btrfs will try to merge extent map when inserting new extent map.
>>
>> btrfs_fiemap(start=0 len=(u64)-1)
>> |- extent_fiemap(start=0 len=(u64)-1)
>>     |- get_extent_skip_holes(start=0 len=64k)
>>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>     |     |- btrfs_get_extent(start=0 len=64k)
>>     |        |  Found on-disk (ino, EXTENT_DATA, 0)
>>     |        |- add_extent_mapping()
>>     |        |- Return (em->start=0, len=16k)
>>     |
>>     |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>>     |
>>     |- get_extent_skip_holes(start=0 len=64k)
>>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>     |     |- btrfs_get_extent(start=16k len=48k)
>>     |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>>     |        |- add_extent_mapping()
>>     |        |  |- try_merge_map()
>>     |        |     Merge with previous em start=0 len=16k
>>     |        |     resulting em start=0 len=32k
>>     |        |- Return (em->start=0, len=32K)    << Merged result
>>     |- Stripe off the unrelated range (0~16K) of return em
>>     |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>>        ^^^ Causing split fiemap extent.
>>
>> And since in add_extent_mapping(), em is already merged, in next
>> fiemap() call, we will get merged result.
>>
>> [FIX]
>> Here we introduce a new structure, fiemap_cache, which records previous
>> fiemap extent.
>>
>> And will always try to merge current fiemap_cache result before calling
>> fiemap_fill_next_extent().
>> Only when we failed to merge current fiemap extent with cached one, we
>> will call fiemap_fill_next_extent() to submit cached one.
>>
>> So by this method, we can merge all fiemap extents.
>>
> 
> If I understand it correctly, what it's missing currently is 'merge', a
> straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
> Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
> sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
> range passing to it or just one more extent map to check if btrfs_get_extent
> could return a merged extent map before returning?

So your idea to to do the merging inside btrfs_get_extent(), instead of 
merging it in extent_fiemap()?

In theory, they have the same effect for fiemap.
And that's my original idea.

But the problem is, btrfs_get_extent() is called almost everywhere, more 
frequently than extent_fiemap(), the extra time used to merging extent 
map may cause performance problem.

For the worst case, if a file is made up by 262144 4K extent (takes up 
1G), btrfs_get_extent() call on the file will iterate all these extents,
which will iterate more than 500 16K tree blocks.

If only fiemap takes such longer time, it's still acceptable. But if 
btrfs_get_extent() takes such longer time, I think it will be a problem 
then.

Thanks,
Qu


> 
> If no extent map could be merged, which is the worst case, the first
> btrfs_get_extent_fiemap will read file metadata into memory from disk and the
> following btrfs_get_extent_fiemap will read the rest of file metadata from
> the in-memory extent map tree directly.
> 
> Thanks,
> 
> -liubo
> 
>> It can also be done in fs/ioctl.c, however the problem is if
>> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
>> extent.
>> So I choose to merge it in btrfs.
>>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>> v2:
>>    Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
>>    that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
>>    cause kernel warning if we fiemap is called on large compressed file.
>> v3:
>>    Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
>>    submit_fiemap_extent() to submit fiemap cache, so it just acts as a
>>    sanity check.
>>    Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
>>    extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
>>    Don't do backward jump, suggested by David.
>>    Better sanity check and recoverable fix.
>>
>> To David:
>>    What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
>>    BTRFS_CONFIG_DEBUG is specified for recoverable bug?
>>
>>    And modify ASSERT() to always WARN_ON() and exit error code?
>> ---
>>   fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 122 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 28e8192..c4cb65d 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
>>   	return NULL;
>>   }
>>   
>> +/*
>> + * To cache previous fiemap extent
>> + *
>> + * Will be used for merging fiemap extent
>> + */
>> +struct fiemap_cache {
>> +	u64 offset;
>> +	u64 phys;
>> +	u64 len;
>> +	u32 flags;
>> +	bool cached;
>> +};
>> +
>> +/*
>> + * Helper to submit fiemap extent.
>> + *
>> + * Will try to merge current fiemap extent specified by @offset, @phys,
>> + * @len and @flags with cached one.
>> + * And only when we fails to merge, cached one will be submitted as
>> + * fiemap extent.
>> + *
>> + * Return value is the same as fiemap_fill_next_extent().
>> + */
>> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
>> +				struct fiemap_cache *cache,
>> +				u64 offset, u64 phys, u64 len, u32 flags)
>> +{
>> +	int ret = 0;
>> +
>> +	if (!cache->cached)
>> +		goto assign;
>> +
>> +	/*
>> +	 * Sanity check, extent_fiemap() should have ensured that new
>> +	 * fiemap extent won't overlap with cahced one.
>> +	 * Not recoverable.
>> +	 *
>> +	 * NOTE: Physical address can overlap, due to compression
>> +	 */
>> +	if (cache->offset + cache->len > offset) {
>> +		WARN_ON(1);
>> +		return -EINVAL;
>> +	}
>> +
>> +	/*
>> +	 * Only merges fiemap extents if
>> +	 * 1) Their logical addresses are continuous
>> +	 *
>> +	 * 2) Their physical addresses are continuous
>> +	 *    So truly compressed (physical size smaller than logical size)
>> +	 *    extents won't get merged with each other
>> +	 *
>> +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
>> +	 *    So regular extent won't get merged with prealloc extent
>> +	 */
>> +	if (cache->offset + cache->len  == offset &&
>> +	    cache->phys + cache->len == phys  &&
>> +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
>> +			(flags & ~FIEMAP_EXTENT_LAST)) {
>> +		cache->len += len;
>> +		cache->flags |= flags;
>> +		goto try_submit_last;
>> +	}
>> +
>> +	/* Not mergeable, need to submit cached one */
>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>> +				      cache->len, cache->flags);
>> +	cache->cached = false;
>> +	if (ret)
>> +		return ret;
>> +assign:
>> +	cache->cached = true;
>> +	cache->offset = offset;
>> +	cache->phys = phys;
>> +	cache->len = len;
>> +	cache->flags = flags;
>> +try_submit_last:
>> +	if (cache->flags & FIEMAP_EXTENT_LAST) {
>> +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
>> +				cache->phys, cache->len, cache->flags);
>> +		cache->cached = false;
>> +	}
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Sanity check for fiemap cache
>> + *
>> + * All fiemap cache should be submitted by submit_fiemap_extent()
>> + * Iteration should be terminated either by last fiemap extent or
>> + * fieinfo->fi_extents_max.
>> + * So no cached fiemap should exist.
>> + */
>> +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
>> +			       struct fiemap_extent_info *fieinfo,
>> +			       struct fiemap_cache *cache)
>> +{
>> +	int ret;
>> +
>> +	if (!cache->cached)
>> +		return 0;
>> +
>> +	/* Small and recoverbale problem, only to info developer */
>> +#ifdef CONFIG_BTRFS_DEBUG
>> +	WARN_ON(1);
>> +#endif
>> +	btrfs_warn(fs_info,
>> +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
>> +		   cache->offset, cache->phys, cache->len, cache->flags);
>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>> +				      cache->len, cache->flags);
>> +	cache->cached = false;
>> +	if (ret > 0)
>> +		ret = 0;
>> +	return ret;
>> +}
>> +
>>   int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   		__u64 start, __u64 len, get_extent_t *get_extent)
>>   {
>> @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   	struct extent_state *cached_state = NULL;
>>   	struct btrfs_path *path;
>>   	struct btrfs_root *root = BTRFS_I(inode)->root;
>> +	struct fiemap_cache cache = { 0 };
>>   	int end = 0;
>>   	u64 em_start = 0;
>>   	u64 em_len = 0;
>> @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   			flags |= FIEMAP_EXTENT_LAST;
>>   			end = 1;
>>   		}
>> -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
>> -					      em_len, flags);
>> +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
>> +					   em_len, flags);
>>   		if (ret) {
>>   			if (ret == 1)
>>   				ret = 0;
>> @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>   		}
>>   	}
>>   out_free:
>> +	if (!ret)
>> +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
>>   	free_extent_map(em);
>>   out:
>>   	btrfs_free_path(path);
>> -- 
>> 2.9.3
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 20, 2017, 1:58 a.m. UTC | #5
On Thu, Apr 20, 2017 at 09:52:00AM +0800, Qu Wenruo wrote:
> 
> 
> At 04/20/2017 09:25 AM, Liu Bo wrote:
> > On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
> > > [BUG]
> > > Cycle mount btrfs can cause fiemap to return different result.
> > > Like:
> > >   # mount /dev/vdb5 /mnt/btrfs
> > >   # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..127]:        25088..25215       128   0x1
> > >   # umount /mnt/btrfs
> > >   # mount /dev/vdb5 /mnt/btrfs
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..31]:         25088..25119        32   0x0
> > >     1: [32..63]:        25120..25151        32   0x0
> > >     2: [64..95]:        25152..25183        32   0x0
> > >     3: [96..127]:       25184..25215        32   0x1
> > > But after above fiemap, we get correct merged result if we call fiemap
> > > again.
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..127]:        25088..25215       128   0x1
> > > 
> > > [REASON]
> > > Btrfs will try to merge extent map when inserting new extent map.
> > > 
> > > btrfs_fiemap(start=0 len=(u64)-1)
> > > |- extent_fiemap(start=0 len=(u64)-1)
> > >     |- get_extent_skip_holes(start=0 len=64k)
> > >     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> > >     |     |- btrfs_get_extent(start=0 len=64k)
> > >     |        |  Found on-disk (ino, EXTENT_DATA, 0)
> > >     |        |- add_extent_mapping()
> > >     |        |- Return (em->start=0, len=16k)
> > >     |
> > >     |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
> > >     |
> > >     |- get_extent_skip_holes(start=0 len=64k)
> > >     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> > >     |     |- btrfs_get_extent(start=16k len=48k)
> > >     |        |  Found on-disk (ino, EXTENT_DATA, 16k)
> > >     |        |- add_extent_mapping()
> > >     |        |  |- try_merge_map()
> > >     |        |     Merge with previous em start=0 len=16k
> > >     |        |     resulting em start=0 len=32k
> > >     |        |- Return (em->start=0, len=32K)    << Merged result
> > >     |- Stripe off the unrelated range (0~16K) of return em
> > >     |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
> > >        ^^^ Causing split fiemap extent.
> > > 
> > > And since in add_extent_mapping(), em is already merged, in next
> > > fiemap() call, we will get merged result.
> > > 
> > > [FIX]
> > > Here we introduce a new structure, fiemap_cache, which records previous
> > > fiemap extent.
> > > 
> > > And will always try to merge current fiemap_cache result before calling
> > > fiemap_fill_next_extent().
> > > Only when we failed to merge current fiemap extent with cached one, we
> > > will call fiemap_fill_next_extent() to submit cached one.
> > > 
> > > So by this method, we can merge all fiemap extents.
> > > 
> > 
> > If I understand it correctly, what it's missing currently is 'merge', a
> > straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
> > Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
> > sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
> > range passing to it or just one more extent map to check if btrfs_get_extent
> > could return a merged extent map before returning?
> 
> So your idea to to do the merging inside btrfs_get_extent(), instead of
> merging it in extent_fiemap()?
>

No, merge ems inside the wrapper, ie. btrfs_get_extent_fiemap().

Thanks,

-liubo
> In theory, they have the same effect for fiemap.
> And that's my original idea.
> 
> But the problem is, btrfs_get_extent() is called almost everywhere, more
> frequently than extent_fiemap(), the extra time used to merging extent map
> may cause performance problem.
> 
> For the worst case, if a file is made up by 262144 4K extent (takes up 1G),
> btrfs_get_extent() call on the file will iterate all these extents,
> which will iterate more than 500 16K tree blocks.
> 
> If only fiemap takes such longer time, it's still acceptable. But if
> btrfs_get_extent() takes such longer time, I think it will be a problem
> then.
> 
> Thanks,
> Qu
> 
> 
> > 
> > If no extent map could be merged, which is the worst case, the first
> > btrfs_get_extent_fiemap will read file metadata into memory from disk and the
> > following btrfs_get_extent_fiemap will read the rest of file metadata from
> > the in-memory extent map tree directly.
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > It can also be done in fs/ioctl.c, however the problem is if
> > > fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
> > > extent.
> > > So I choose to merge it in btrfs.
> > > 
> > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > ---
> > > v2:
> > >    Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
> > >    that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
> > >    cause kernel warning if we fiemap is called on large compressed file.
> > > v3:
> > >    Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
> > >    submit_fiemap_extent() to submit fiemap cache, so it just acts as a
> > >    sanity check.
> > >    Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
> > >    extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
> > >    Don't do backward jump, suggested by David.
> > >    Better sanity check and recoverable fix.
> > > 
> > > To David:
> > >    What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
> > >    BTRFS_CONFIG_DEBUG is specified for recoverable bug?
> > > 
> > >    And modify ASSERT() to always WARN_ON() and exit error code?
> > > ---
> > >   fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >   1 file changed, 122 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > > index 28e8192..c4cb65d 100644
> > > --- a/fs/btrfs/extent_io.c
> > > +++ b/fs/btrfs/extent_io.c
> > > @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
> > >   	return NULL;
> > >   }
> > > +/*
> > > + * To cache previous fiemap extent
> > > + *
> > > + * Will be used for merging fiemap extent
> > > + */
> > > +struct fiemap_cache {
> > > +	u64 offset;
> > > +	u64 phys;
> > > +	u64 len;
> > > +	u32 flags;
> > > +	bool cached;
> > > +};
> > > +
> > > +/*
> > > + * Helper to submit fiemap extent.
> > > + *
> > > + * Will try to merge current fiemap extent specified by @offset, @phys,
> > > + * @len and @flags with cached one.
> > > + * And only when we fails to merge, cached one will be submitted as
> > > + * fiemap extent.
> > > + *
> > > + * Return value is the same as fiemap_fill_next_extent().
> > > + */
> > > +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
> > > +				struct fiemap_cache *cache,
> > > +				u64 offset, u64 phys, u64 len, u32 flags)
> > > +{
> > > +	int ret = 0;
> > > +
> > > +	if (!cache->cached)
> > > +		goto assign;
> > > +
> > > +	/*
> > > +	 * Sanity check, extent_fiemap() should have ensured that new
> > > +	 * fiemap extent won't overlap with cahced one.
> > > +	 * Not recoverable.
> > > +	 *
> > > +	 * NOTE: Physical address can overlap, due to compression
> > > +	 */
> > > +	if (cache->offset + cache->len > offset) {
> > > +		WARN_ON(1);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Only merges fiemap extents if
> > > +	 * 1) Their logical addresses are continuous
> > > +	 *
> > > +	 * 2) Their physical addresses are continuous
> > > +	 *    So truly compressed (physical size smaller than logical size)
> > > +	 *    extents won't get merged with each other
> > > +	 *
> > > +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
> > > +	 *    So regular extent won't get merged with prealloc extent
> > > +	 */
> > > +	if (cache->offset + cache->len  == offset &&
> > > +	    cache->phys + cache->len == phys  &&
> > > +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
> > > +			(flags & ~FIEMAP_EXTENT_LAST)) {
> > > +		cache->len += len;
> > > +		cache->flags |= flags;
> > > +		goto try_submit_last;
> > > +	}
> > > +
> > > +	/* Not mergeable, need to submit cached one */
> > > +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> > > +				      cache->len, cache->flags);
> > > +	cache->cached = false;
> > > +	if (ret)
> > > +		return ret;
> > > +assign:
> > > +	cache->cached = true;
> > > +	cache->offset = offset;
> > > +	cache->phys = phys;
> > > +	cache->len = len;
> > > +	cache->flags = flags;
> > > +try_submit_last:
> > > +	if (cache->flags & FIEMAP_EXTENT_LAST) {
> > > +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
> > > +				cache->phys, cache->len, cache->flags);
> > > +		cache->cached = false;
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * Sanity check for fiemap cache
> > > + *
> > > + * All fiemap cache should be submitted by submit_fiemap_extent()
> > > + * Iteration should be terminated either by last fiemap extent or
> > > + * fieinfo->fi_extents_max.
> > > + * So no cached fiemap should exist.
> > > + */
> > > +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
> > > +			       struct fiemap_extent_info *fieinfo,
> > > +			       struct fiemap_cache *cache)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (!cache->cached)
> > > +		return 0;
> > > +
> > > +	/* Small and recoverbale problem, only to info developer */
> > > +#ifdef CONFIG_BTRFS_DEBUG
> > > +	WARN_ON(1);
> > > +#endif
> > > +	btrfs_warn(fs_info,
> > > +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
> > > +		   cache->offset, cache->phys, cache->len, cache->flags);
> > > +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
> > > +				      cache->len, cache->flags);
> > > +	cache->cached = false;
> > > +	if (ret > 0)
> > > +		ret = 0;
> > > +	return ret;
> > > +}
> > > +
> > >   int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> > >   		__u64 start, __u64 len, get_extent_t *get_extent)
> > >   {
> > > @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> > >   	struct extent_state *cached_state = NULL;
> > >   	struct btrfs_path *path;
> > >   	struct btrfs_root *root = BTRFS_I(inode)->root;
> > > +	struct fiemap_cache cache = { 0 };
> > >   	int end = 0;
> > >   	u64 em_start = 0;
> > >   	u64 em_len = 0;
> > > @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> > >   			flags |= FIEMAP_EXTENT_LAST;
> > >   			end = 1;
> > >   		}
> > > -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> > > -					      em_len, flags);
> > > +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
> > > +					   em_len, flags);
> > >   		if (ret) {
> > >   			if (ret == 1)
> > >   				ret = 0;
> > > @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> > >   		}
> > >   	}
> > >   out_free:
> > > +	if (!ret)
> > > +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
> > >   	free_extent_map(em);
> > >   out:
> > >   	btrfs_free_path(path);
> > > -- 
> > > 2.9.3
> > > 
> > > 
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 20, 2017, 2:08 a.m. UTC | #6
On Thu, Apr 20, 2017 at 09:52:00AM +0800, Qu Wenruo wrote:
> 
> 
> At 04/20/2017 09:25 AM, Liu Bo wrote:
> > On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
> > > [BUG]
> > > Cycle mount btrfs can cause fiemap to return different result.
> > > Like:
> > >   # mount /dev/vdb5 /mnt/btrfs
> > >   # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..127]:        25088..25215       128   0x1
> > >   # umount /mnt/btrfs
> > >   # mount /dev/vdb5 /mnt/btrfs
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..31]:         25088..25119        32   0x0
> > >     1: [32..63]:        25120..25151        32   0x0
> > >     2: [64..95]:        25152..25183        32   0x0
> > >     3: [96..127]:       25184..25215        32   0x1
> > > But after above fiemap, we get correct merged result if we call fiemap
> > > again.
> > >   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> > >   /mnt/test/file:
> > >   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> > >     0: [0..127]:        25088..25215       128   0x1
> > > 
> > > [REASON]
> > > Btrfs will try to merge extent map when inserting new extent map.
> > > 
> > > btrfs_fiemap(start=0 len=(u64)-1)
> > > |- extent_fiemap(start=0 len=(u64)-1)
> > >     |- get_extent_skip_holes(start=0 len=64k)
> > >     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> > >     |     |- btrfs_get_extent(start=0 len=64k)
> > >     |        |  Found on-disk (ino, EXTENT_DATA, 0)
> > >     |        |- add_extent_mapping()
> > >     |        |- Return (em->start=0, len=16k)
> > >     |
> > >     |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
> > >     |
> > >     |- get_extent_skip_holes(start=0 len=64k)
> > >     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> > >     |     |- btrfs_get_extent(start=16k len=48k)
> > >     |        |  Found on-disk (ino, EXTENT_DATA, 16k)
> > >     |        |- add_extent_mapping()
> > >     |        |  |- try_merge_map()
> > >     |        |     Merge with previous em start=0 len=16k
> > >     |        |     resulting em start=0 len=32k
> > >     |        |- Return (em->start=0, len=32K)    << Merged result
> > >     |- Stripe off the unrelated range (0~16K) of return em
> > >     |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
> > >        ^^^ Causing split fiemap extent.
> > > 
> > > And since in add_extent_mapping(), em is already merged, in next
> > > fiemap() call, we will get merged result.
> > > 
> > > [FIX]
> > > Here we introduce a new structure, fiemap_cache, which records previous
> > > fiemap extent.
> > > 
> > > And will always try to merge current fiemap_cache result before calling
> > > fiemap_fill_next_extent().
> > > Only when we failed to merge current fiemap extent with cached one, we
> > > will call fiemap_fill_next_extent() to submit cached one.
> > > 
> > > So by this method, we can merge all fiemap extents.
> > > 
> > 
> > If I understand it correctly, what it's missing currently is 'merge', a
> > straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
> > Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
> > sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
> > range passing to it or just one more extent map to check if btrfs_get_extent
> > could return a merged extent map before returning?
> 
> So your idea to to do the merging inside btrfs_get_extent(), instead of
> merging it in extent_fiemap()?
> 
> In theory, they have the same effect for fiemap.
> And that's my original idea.
> 
> But the problem is, btrfs_get_extent() is called almost everywhere, more
> frequently than extent_fiemap(), the extra time used to merging extent map
> may cause performance problem.
> 
> For the worst case, if a file is made up by 262144 4K extent (takes up 1G),
> btrfs_get_extent() call on the file will iterate all these extents,
> which will iterate more than 500 16K tree blocks.
> 
> If only fiemap takes such longer time, it's still acceptable. But if
> btrfs_get_extent() takes such longer time, I think it will be a problem
> then.
>

Not related to the patch, but the question is, does fiemap ioctl think
it is important to have unified output?

The filefrag manual only says it's attempting to get extent
information.  And this inconsistent output of filefrag doesn't seem to
confusing users as the reported extent count is correct.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo April 20, 2017, 2:09 a.m. UTC | #7
At 04/20/2017 09:58 AM, Liu Bo wrote:
> On Thu, Apr 20, 2017 at 09:52:00AM +0800, Qu Wenruo wrote:
>>
>>
>> At 04/20/2017 09:25 AM, Liu Bo wrote:
>>> On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>>>> [BUG]
>>>> Cycle mount btrfs can cause fiemap to return different result.
>>>> Like:
>>>>    # mount /dev/vdb5 /mnt/btrfs
>>>>    # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..127]:        25088..25215       128   0x1
>>>>    # umount /mnt/btrfs
>>>>    # mount /dev/vdb5 /mnt/btrfs
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..31]:         25088..25119        32   0x0
>>>>      1: [32..63]:        25120..25151        32   0x0
>>>>      2: [64..95]:        25152..25183        32   0x0
>>>>      3: [96..127]:       25184..25215        32   0x1
>>>> But after above fiemap, we get correct merged result if we call fiemap
>>>> again.
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..127]:        25088..25215       128   0x1
>>>>
>>>> [REASON]
>>>> Btrfs will try to merge extent map when inserting new extent map.
>>>>
>>>> btrfs_fiemap(start=0 len=(u64)-1)
>>>> |- extent_fiemap(start=0 len=(u64)-1)
>>>>      |- get_extent_skip_holes(start=0 len=64k)
>>>>      |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>>>      |     |- btrfs_get_extent(start=0 len=64k)
>>>>      |        |  Found on-disk (ino, EXTENT_DATA, 0)
>>>>      |        |- add_extent_mapping()
>>>>      |        |- Return (em->start=0, len=16k)
>>>>      |
>>>>      |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>>>>      |
>>>>      |- get_extent_skip_holes(start=0 len=64k)
>>>>      |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>>>      |     |- btrfs_get_extent(start=16k len=48k)
>>>>      |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>>>>      |        |- add_extent_mapping()
>>>>      |        |  |- try_merge_map()
>>>>      |        |     Merge with previous em start=0 len=16k
>>>>      |        |     resulting em start=0 len=32k
>>>>      |        |- Return (em->start=0, len=32K)    << Merged result
>>>>      |- Stripe off the unrelated range (0~16K) of return em
>>>>      |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>>>>         ^^^ Causing split fiemap extent.
>>>>
>>>> And since in add_extent_mapping(), em is already merged, in next
>>>> fiemap() call, we will get merged result.
>>>>
>>>> [FIX]
>>>> Here we introduce a new structure, fiemap_cache, which records previous
>>>> fiemap extent.
>>>>
>>>> And will always try to merge current fiemap_cache result before calling
>>>> fiemap_fill_next_extent().
>>>> Only when we failed to merge current fiemap extent with cached one, we
>>>> will call fiemap_fill_next_extent() to submit cached one.
>>>>
>>>> So by this method, we can merge all fiemap extents.
>>>>
>>>
>>> If I understand it correctly, what it's missing currently is 'merge', a
>>> straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
>>> Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
>>> sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
>>> range passing to it or just one more extent map to check if btrfs_get_extent
>>> could return a merged extent map before returning?
>>
>> So your idea to to do the merging inside btrfs_get_extent(), instead of
>> merging it in extent_fiemap()?
>>
> 
> No, merge ems inside the wrapper, ie. btrfs_get_extent_fiemap().

Then llseek() with SEEK_HOLE/SEEK_DATA also get affected.

So limiting the extra time to merging extent maps in fiemap is still valid.

Thanks,
Qu
> 
> Thanks,
> 
> -liubo
>> In theory, they have the same effect for fiemap.
>> And that's my original idea.
>>
>> But the problem is, btrfs_get_extent() is called almost everywhere, more
>> frequently than extent_fiemap(), the extra time used to merging extent map
>> may cause performance problem.
>>
>> For the worst case, if a file is made up by 262144 4K extent (takes up 1G),
>> btrfs_get_extent() call on the file will iterate all these extents,
>> which will iterate more than 500 16K tree blocks.
>>
>> If only fiemap takes such longer time, it's still acceptable. But if
>> btrfs_get_extent() takes such longer time, I think it will be a problem
>> then.
>>
>> Thanks,
>> Qu
>>
>>
>>>
>>> If no extent map could be merged, which is the worst case, the first
>>> btrfs_get_extent_fiemap will read file metadata into memory from disk and the
>>> following btrfs_get_extent_fiemap will read the rest of file metadata from
>>> the in-memory extent map tree directly.
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>> It can also be done in fs/ioctl.c, however the problem is if
>>>> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
>>>> extent.
>>>> So I choose to merge it in btrfs.
>>>>
>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> ---
>>>> v2:
>>>>     Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
>>>>     that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
>>>>     cause kernel warning if we fiemap is called on large compressed file.
>>>> v3:
>>>>     Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
>>>>     submit_fiemap_extent() to submit fiemap cache, so it just acts as a
>>>>     sanity check.
>>>>     Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
>>>>     extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
>>>>     Don't do backward jump, suggested by David.
>>>>     Better sanity check and recoverable fix.
>>>>
>>>> To David:
>>>>     What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
>>>>     BTRFS_CONFIG_DEBUG is specified for recoverable bug?
>>>>
>>>>     And modify ASSERT() to always WARN_ON() and exit error code?
>>>> ---
>>>>    fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 122 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>> index 28e8192..c4cb65d 100644
>>>> --- a/fs/btrfs/extent_io.c
>>>> +++ b/fs/btrfs/extent_io.c
>>>> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
>>>>    	return NULL;
>>>>    }
>>>> +/*
>>>> + * To cache previous fiemap extent
>>>> + *
>>>> + * Will be used for merging fiemap extent
>>>> + */
>>>> +struct fiemap_cache {
>>>> +	u64 offset;
>>>> +	u64 phys;
>>>> +	u64 len;
>>>> +	u32 flags;
>>>> +	bool cached;
>>>> +};
>>>> +
>>>> +/*
>>>> + * Helper to submit fiemap extent.
>>>> + *
>>>> + * Will try to merge current fiemap extent specified by @offset, @phys,
>>>> + * @len and @flags with cached one.
>>>> + * And only when we fails to merge, cached one will be submitted as
>>>> + * fiemap extent.
>>>> + *
>>>> + * Return value is the same as fiemap_fill_next_extent().
>>>> + */
>>>> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
>>>> +				struct fiemap_cache *cache,
>>>> +				u64 offset, u64 phys, u64 len, u32 flags)
>>>> +{
>>>> +	int ret = 0;
>>>> +
>>>> +	if (!cache->cached)
>>>> +		goto assign;
>>>> +
>>>> +	/*
>>>> +	 * Sanity check, extent_fiemap() should have ensured that new
>>>> +	 * fiemap extent won't overlap with cahced one.
>>>> +	 * Not recoverable.
>>>> +	 *
>>>> +	 * NOTE: Physical address can overlap, due to compression
>>>> +	 */
>>>> +	if (cache->offset + cache->len > offset) {
>>>> +		WARN_ON(1);
>>>> +		return -EINVAL;
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Only merges fiemap extents if
>>>> +	 * 1) Their logical addresses are continuous
>>>> +	 *
>>>> +	 * 2) Their physical addresses are continuous
>>>> +	 *    So truly compressed (physical size smaller than logical size)
>>>> +	 *    extents won't get merged with each other
>>>> +	 *
>>>> +	 * 3) Share same flags except FIEMAP_EXTENT_LAST
>>>> +	 *    So regular extent won't get merged with prealloc extent
>>>> +	 */
>>>> +	if (cache->offset + cache->len  == offset &&
>>>> +	    cache->phys + cache->len == phys  &&
>>>> +	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
>>>> +			(flags & ~FIEMAP_EXTENT_LAST)) {
>>>> +		cache->len += len;
>>>> +		cache->flags |= flags;
>>>> +		goto try_submit_last;
>>>> +	}
>>>> +
>>>> +	/* Not mergeable, need to submit cached one */
>>>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>>>> +				      cache->len, cache->flags);
>>>> +	cache->cached = false;
>>>> +	if (ret)
>>>> +		return ret;
>>>> +assign:
>>>> +	cache->cached = true;
>>>> +	cache->offset = offset;
>>>> +	cache->phys = phys;
>>>> +	cache->len = len;
>>>> +	cache->flags = flags;
>>>> +try_submit_last:
>>>> +	if (cache->flags & FIEMAP_EXTENT_LAST) {
>>>> +		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
>>>> +				cache->phys, cache->len, cache->flags);
>>>> +		cache->cached = false;
>>>> +	}
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Sanity check for fiemap cache
>>>> + *
>>>> + * All fiemap cache should be submitted by submit_fiemap_extent()
>>>> + * Iteration should be terminated either by last fiemap extent or
>>>> + * fieinfo->fi_extents_max.
>>>> + * So no cached fiemap should exist.
>>>> + */
>>>> +static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
>>>> +			       struct fiemap_extent_info *fieinfo,
>>>> +			       struct fiemap_cache *cache)
>>>> +{
>>>> +	int ret;
>>>> +
>>>> +	if (!cache->cached)
>>>> +		return 0;
>>>> +
>>>> +	/* Small and recoverbale problem, only to info developer */
>>>> +#ifdef CONFIG_BTRFS_DEBUG
>>>> +	WARN_ON(1);
>>>> +#endif
>>>> +	btrfs_warn(fs_info,
>>>> +		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
>>>> +		   cache->offset, cache->phys, cache->len, cache->flags);
>>>> +	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
>>>> +				      cache->len, cache->flags);
>>>> +	cache->cached = false;
>>>> +	if (ret > 0)
>>>> +		ret = 0;
>>>> +	return ret;
>>>> +}
>>>> +
>>>>    int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>>>    		__u64 start, __u64 len, get_extent_t *get_extent)
>>>>    {
>>>> @@ -4370,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>>>    	struct extent_state *cached_state = NULL;
>>>>    	struct btrfs_path *path;
>>>>    	struct btrfs_root *root = BTRFS_I(inode)->root;
>>>> +	struct fiemap_cache cache = { 0 };
>>>>    	int end = 0;
>>>>    	u64 em_start = 0;
>>>>    	u64 em_len = 0;
>>>> @@ -4549,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>>>    			flags |= FIEMAP_EXTENT_LAST;
>>>>    			end = 1;
>>>>    		}
>>>> -		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
>>>> -					      em_len, flags);
>>>> +		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
>>>> +					   em_len, flags);
>>>>    		if (ret) {
>>>>    			if (ret == 1)
>>>>    				ret = 0;
>>>> @@ -4558,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>>>>    		}
>>>>    	}
>>>>    out_free:
>>>> +	if (!ret)
>>>> +		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
>>>>    	free_extent_map(em);
>>>>    out:
>>>>    	btrfs_free_path(path);
>>>> -- 
>>>> 2.9.3
>>>>
>>>>
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>
>>
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo April 20, 2017, 2:16 a.m. UTC | #8
At 04/20/2017 10:08 AM, Liu Bo wrote:
> On Thu, Apr 20, 2017 at 09:52:00AM +0800, Qu Wenruo wrote:
>>
>>
>> At 04/20/2017 09:25 AM, Liu Bo wrote:
>>> On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>>>> [BUG]
>>>> Cycle mount btrfs can cause fiemap to return different result.
>>>> Like:
>>>>    # mount /dev/vdb5 /mnt/btrfs
>>>>    # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..127]:        25088..25215       128   0x1
>>>>    # umount /mnt/btrfs
>>>>    # mount /dev/vdb5 /mnt/btrfs
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..31]:         25088..25119        32   0x0
>>>>      1: [32..63]:        25120..25151        32   0x0
>>>>      2: [64..95]:        25152..25183        32   0x0
>>>>      3: [96..127]:       25184..25215        32   0x1
>>>> But after above fiemap, we get correct merged result if we call fiemap
>>>> again.
>>>>    # xfs_io -c "fiemap -v" /mnt/btrfs/file
>>>>    /mnt/test/file:
>>>>    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
>>>>      0: [0..127]:        25088..25215       128   0x1
>>>>
>>>> [REASON]
>>>> Btrfs will try to merge extent map when inserting new extent map.
>>>>
>>>> btrfs_fiemap(start=0 len=(u64)-1)
>>>> |- extent_fiemap(start=0 len=(u64)-1)
>>>>      |- get_extent_skip_holes(start=0 len=64k)
>>>>      |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>>>      |     |- btrfs_get_extent(start=0 len=64k)
>>>>      |        |  Found on-disk (ino, EXTENT_DATA, 0)
>>>>      |        |- add_extent_mapping()
>>>>      |        |- Return (em->start=0, len=16k)
>>>>      |
>>>>      |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
>>>>      |
>>>>      |- get_extent_skip_holes(start=0 len=64k)
>>>>      |  |- btrfs_get_extent_fiemap(start=0 len=64k)
>>>>      |     |- btrfs_get_extent(start=16k len=48k)
>>>>      |        |  Found on-disk (ino, EXTENT_DATA, 16k)
>>>>      |        |- add_extent_mapping()
>>>>      |        |  |- try_merge_map()
>>>>      |        |     Merge with previous em start=0 len=16k
>>>>      |        |     resulting em start=0 len=32k
>>>>      |        |- Return (em->start=0, len=32K)    << Merged result
>>>>      |- Stripe off the unrelated range (0~16K) of return em
>>>>      |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
>>>>         ^^^ Causing split fiemap extent.
>>>>
>>>> And since in add_extent_mapping(), em is already merged, in next
>>>> fiemap() call, we will get merged result.
>>>>
>>>> [FIX]
>>>> Here we introduce a new structure, fiemap_cache, which records previous
>>>> fiemap extent.
>>>>
>>>> And will always try to merge current fiemap_cache result before calling
>>>> fiemap_fill_next_extent().
>>>> Only when we failed to merge current fiemap extent with cached one, we
>>>> will call fiemap_fill_next_extent() to submit cached one.
>>>>
>>>> So by this method, we can merge all fiemap extents.
>>>>
>>>
>>> If I understand it correctly, what it's missing currently is 'merge', a
>>> straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
>>> Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
>>> sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
>>> range passing to it or just one more extent map to check if btrfs_get_extent
>>> could return a merged extent map before returning?
>>
>> So your idea to to do the merging inside btrfs_get_extent(), instead of
>> merging it in extent_fiemap()?
>>
>> In theory, they have the same effect for fiemap.
>> And that's my original idea.
>>
>> But the problem is, btrfs_get_extent() is called almost everywhere, more
>> frequently than extent_fiemap(), the extra time used to merging extent map
>> may cause performance problem.
>>
>> For the worst case, if a file is made up by 262144 4K extent (takes up 1G),
>> btrfs_get_extent() call on the file will iterate all these extents,
>> which will iterate more than 500 16K tree blocks.
>>
>> If only fiemap takes such longer time, it's still acceptable. But if
>> btrfs_get_extent() takes such longer time, I think it will be a problem
>> then.
>>
> 
> Not related to the patch, but the question is, does fiemap ioctl think
> it is important to have unified output?

Not defined by any one nor documentation.

But wouldn't it be strange that fiemap returns different result even we 
had done nothing?

As XFS can do it well, why not Btrfs?
(BTW, it seems that XFS does it even better, merge ondisk extents at 
insert time, other than do tricks in fiemap routine)

Thanks,
Qu

> 
> The filefrag manual only says it's attempting to get extent
> information.  And this inconsistent output of filefrag doesn't seem to
> confusing users as the reported extent count is correct.
> 
> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
> 
> Thanks,
> 
> -liubo
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba May 5, 2017, 5:36 p.m. UTC | #9
On Thu, Apr 20, 2017 at 10:09:39AM +0800, Qu Wenruo wrote:
> >>> If I understand it correctly, what it's missing currently is 'merge', a
> >>> straightfoward idea is to make use of the 'merge' ability of btrfs_get_extent. >
> >>> Since btrfs_get_extent_fiemap is a wrapper of btrfs_get_extent, does it make
> >>> sense if we make btrfs_get_extent_fiemap iterate all extent maps within the
> >>> range passing to it or just one more extent map to check if btrfs_get_extent
> >>> could return a merged extent map before returning?
> >>
> >> So your idea to to do the merging inside btrfs_get_extent(), instead of
> >> merging it in extent_fiemap()?
> >>
> > 
> > No, merge ems inside the wrapper, ie. btrfs_get_extent_fiemap().
> 
> Then llseek() with SEEK_HOLE/SEEK_DATA also get affected.

Agreed, unconditionally reading all extents inside seek would need to
be evaluated first. Limiting the scope to the fiemap ioctl is safer.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba May 5, 2017, 5:41 p.m. UTC | #10
On Thu, Apr 13, 2017 at 08:36:24AM +0800, Qu Wenruo wrote:
> 
> 
> At 04/12/2017 11:05 PM, David Sterba wrote:
> > On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
> >> [BUG]
> >> Cycle mount btrfs can cause fiemap to return different result.
> >> Like:
> >>   # mount /dev/vdb5 /mnt/btrfs
> >>   # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file
> >>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> >>   /mnt/test/file:
> >>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> >>     0: [0..127]:        25088..25215       128   0x1
> >>   # umount /mnt/btrfs
> >>   # mount /dev/vdb5 /mnt/btrfs
> >>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> >>   /mnt/test/file:
> >>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> >>     0: [0..31]:         25088..25119        32   0x0
> >>     1: [32..63]:        25120..25151        32   0x0
> >>     2: [64..95]:        25152..25183        32   0x0
> >>     3: [96..127]:       25184..25215        32   0x1
> >> But after above fiemap, we get correct merged result if we call fiemap
> >> again.
> >>   # xfs_io -c "fiemap -v" /mnt/btrfs/file
> >>   /mnt/test/file:
> >>   EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> >>     0: [0..127]:        25088..25215       128   0x1
> >>
> >> [REASON]
> >> Btrfs will try to merge extent map when inserting new extent map.
> >>
> >> btrfs_fiemap(start=0 len=(u64)-1)
> >> |- extent_fiemap(start=0 len=(u64)-1)
> >>     |- get_extent_skip_holes(start=0 len=64k)
> >>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> >>     |     |- btrfs_get_extent(start=0 len=64k)
> >>     |        |  Found on-disk (ino, EXTENT_DATA, 0)
> >>     |        |- add_extent_mapping()
> >>     |        |- Return (em->start=0, len=16k)
> >>     |
> >>     |- fiemap_fill_next_extent(logic=0 phys=X len=16k)
> >>     |
> >>     |- get_extent_skip_holes(start=0 len=64k)
> >>     |  |- btrfs_get_extent_fiemap(start=0 len=64k)
> >>     |     |- btrfs_get_extent(start=16k len=48k)
> >>     |        |  Found on-disk (ino, EXTENT_DATA, 16k)
> >>     |        |- add_extent_mapping()
> >>     |        |  |- try_merge_map()
> >>     |        |     Merge with previous em start=0 len=16k
> >>     |        |     resulting em start=0 len=32k
> >>     |        |- Return (em->start=0, len=32K)    << Merged result
> >>     |- Stripe off the unrelated range (0~16K) of return em
> >>     |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K)
> >>        ^^^ Causing split fiemap extent.
> >>
> >> And since in add_extent_mapping(), em is already merged, in next
> >> fiemap() call, we will get merged result.
> >>
> >> [FIX]
> >> Here we introduce a new structure, fiemap_cache, which records previous
> >> fiemap extent.
> >>
> >> And will always try to merge current fiemap_cache result before calling
> >> fiemap_fill_next_extent().
> >> Only when we failed to merge current fiemap extent with cached one, we
> >> will call fiemap_fill_next_extent() to submit cached one.
> >>
> >> So by this method, we can merge all fiemap extents.
> > 
> > The cache gets reset on each call to extent_fiemap, so if fi_extents_max
> > is 1, the cache will be always unset and we'll never merge anything. The
> > same can happen if the number of extents reaches the limit
> > (FIEMAP_MAX_EXTENTS or any other depending on the ioctl caller). And
> > this leads to the unmerged extents.
> 
> Nope, extents will still be merged, as long as they can be merged.
> 
> The fiemap extent is only submitted if we found an unmergeable extent.
> 
> Even fi_extents_max is 1, it still possible for us to merge extents.
> 
> File A:
> Extent 1: offset=0 len=4k phys=X
> Extent 2: offset=4k len=4k phys=X+4
> Extent 3: offset=8k len=4k phys=Y
> 
> 1) Found Extent 1
>     Cache it, not submitted yet.
> 2) Found Extent 2
>     Merge it with cached one, not submitted yet.
> 3) Found Extent 3
>     Can't merge, submit cached first.
>     Submitted one reach fi_extents_max, exit current extent_fiemap.
> 
> 4) Next fiemap call starts from offset 8K,
>     Extent 3 is the last extent, no need to cache just submit.
> 
> So we still got merged fiemap extent, without anything wrong.
> 
> The point is, fi_extents_max or other limit can only be merged when we 
> submit fiemap_extent, in that case either we found unmergable extent, or 
> we already hit the last extent.

Thanks for the explanation.

> >> It can also be done in fs/ioctl.c, however the problem is if
> >> fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap
> >> extent.
> > 
> > I don't see why, it's the same code path, no?
> 
> My original design in VFS is to check if we can merge current fiemap 
> extent with the last one in fiemap_info.
> 
> But for fi_extents_max == 0 case, fiemap_info doesn't store any extent 
> so that's not possible.
> 
> 
> So for fi_extents_max == 0 case, either do it in each fs like what we 
> are doing, or introduce a new function like fiemap_cache_next_extent() 
> with reference to cached structure.
> 
> > 
> >> So I choose to merge it in btrfs.
> > 
> > Lifting that to the vfs interface is probably not the right approach.
> > The ioctl has never done any postprocessing of the data returned by
> > filesystems, it's really up to the filesystem to prepare the data.
> 
> OK, let's keep it in btrfs.
> 
> > 
> >> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> >> ---
> >> v2:
> >>    Since fiemap_extent_info has a limit for number of fiemap_extent, it's possible
> >>    that fiemap_fill_next_extent() return 1 halfway. Remove the WARN_ON() which can
> >>    cause kernel warning if we fiemap is called on large compressed file.
> >> v3:
> >>    Rename finish_fiemap_extent() to check_fiemap_extent(), as in v3 we ensured
> >>    submit_fiemap_extent() to submit fiemap cache, so it just acts as a
> >>    sanity check.
> >>    Remove BTRFS_MAX_EXTENT_SIZE limit in submit_fiemap_extent(), as
> >>    extent map can be larger than BTRFS_MAX_EXTENT_SIZE.
> >>    Don't do backward jump, suggested by David.
> >>    Better sanity check and recoverable fix.
> >>
> >> To David:
> >>    What about adding a btrfs_debug_warn(), which will only call WARN_ON(1) if
> >>    BTRFS_CONFIG_DEBUG is specified for recoverable bug?
> >>
> >>    And modify ASSERT() to always WARN_ON() and exit error code?
> > 
> > That's for a separate discussion.
> > 
> >> ---
> >>   fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 122 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> >> index 28e8192..c4cb65d 100644
> >> --- a/fs/btrfs/extent_io.c
> >> +++ b/fs/btrfs/extent_io.c
> >> @@ -4353,6 +4353,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
> >>   	return NULL;
> >>   }
> >>   
> >> +/*
> >> + * To cache previous fiemap extent
> >> + *
> >> + * Will be used for merging fiemap extent
> >> + */
> >> +struct fiemap_cache {
> >> +	u64 offset;
> >> +	u64 phys;
> >> +	u64 len;
> >> +	u32 flags;
> >> +	bool cached;
> >> +};
> >> +
> >> +/*
> >> + * Helper to submit fiemap extent.
> >> + *
> >> + * Will try to merge current fiemap extent specified by @offset, @phys,
> >> + * @len and @flags with cached one.
> >> + * And only when we fails to merge, cached one will be submitted as
> >> + * fiemap extent.
> >> + *
> >> + * Return value is the same as fiemap_fill_next_extent().
> >> + */
> >> +static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
> > 
> > I'd suggest to rename it to emmit_fiemap_extent
> 
> I'm OK to change it in next version.

As we agree on the patch, I'll rename the varaible and you don't need to
send a new version. I made a typo, the intended name was emit_fiemap_extent.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 16, 2017, 12:33 p.m. UTC | #11
On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>  fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 122 insertions(+), 2 deletions(-)

I see a lot of warnings from btrfs/094

[13458.628334] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=581632 phys=13078528 len=4096 flags=0x800
[13458.644183] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=864256 phys=13299712 len=4096 flags=0x800
[13458.660097] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=1138688 phys=13504512 len=4096 flags=0x0
[13458.675846] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=1462272 phys=13733888 len=4096 flags=0x800
...

about 1000 in total from a fstest run.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 17, 2017, 7:43 a.m. UTC | #12
On 2017年06月16日 20:33, David Sterba wrote:
> On Fri, Apr 07, 2017 at 10:43:15AM +0800, Qu Wenruo wrote:
>>   fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 122 insertions(+), 2 deletions(-)
> 
> I see a lot of warnings from btrfs/094
> 
> [13458.628334] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=581632 phys=13078528 len=4096 flags=0x800
> [13458.644183] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=864256 phys=13299712 len=4096 flags=0x800
> [13458.660097] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=1138688 phys=13504512 len=4096 flags=0x0
> [13458.675846] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=1462272 phys=13733888 len=4096 flags=0x800
> ...
> 
> about 1000 in total from a fstest run.

Strangely, I can't reproduce it either on my original branch (v4.12 with 
this patch), or torvalds/master (this patch is already merged with your 
renaming)

Also I'm enabling CONFIG_BTRFS_DEBUG, so when the btrfs_warn() is 
triggered, it should cause the test case to fail.

However I tried all btrfs/09*,  nothing happened.

Is some other out-of-tree patch causing the problem?
If so, which branch are you testing? I think I could help to fix it.

Thanks,
Qu

> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adam Borowski June 17, 2017, 8:30 a.m. UTC | #13
On Sat, Jun 17, 2017 at 03:43:18PM +0800, Qu Wenruo wrote:
> On 2017年06月16日 20:33, David Sterba wrote:
> > I see a lot of warnings from btrfs/094
> > 
> > [13458.628334] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=581632 phys=13078528 len=4096 flags=0x800
> 
> Strangely, I can't reproduce it either on my original branch (v4.12 with
> this patch), or torvalds/master (this patch is already merged with your
> renaming)

I for one get this a lot even in regular use.  Somehow, it always has Comm:
dpkg, despite the vast majority of activity on the system obviously not
being dpkg.

> However I tried all btrfs/09*,  nothing happened.
> 
> Is some other out-of-tree patch causing the problem?
> If so, which branch are you testing? I think I could help to fix it.

For me, linus/master with a handful of not possibly relevant patches
(in fs/btrfs/ I have dedupe and defrag on files opened ro, raid5/6 warning,
raid5/6 incompat flag clearing).


Meow!
Qu Wenruo June 17, 2017, 1:28 p.m. UTC | #14
On 2017年06月17日 16:30, Adam Borowski wrote:
> On Sat, Jun 17, 2017 at 03:43:18PM +0800, Qu Wenruo wrote:
>> On 2017年06月16日 20:33, David Sterba wrote:
>>> I see a lot of warnings from btrfs/094
>>>
>>> [13458.628334] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=581632 phys=13078528 len=4096 flags=0x800
>>
>> Strangely, I can't reproduce it either on my original branch (v4.12 with
>> this patch), or torvalds/master (this patch is already merged with your
>> renaming)
> 
> I for one get this a lot even in regular use.  Somehow, it always has Comm:
> dpkg, despite the vast majority of activity on the system obviously not
> being dpkg.

Is that with 0x800 flag? Or some other flag?
I'm trying fallocated file with my original branch, but still no chance 
to reproduce it though.

> 
>> However I tried all btrfs/09*,  nothing happened.
>>
>> Is some other out-of-tree patch causing the problem?
>> If so, which branch are you testing? I think I could help to fix it.
> 
> For me, linus/master with a handful of not possibly relevant patches
> (in fs/btrfs/ I have dedupe and defrag on files opened ro, raid5/6 warning,
> raid5/6 incompat flag clearing).

linus/master without any extra patch is still the same?
And which commit?
I'm using 1439ccf73d9c07654fdd5b4969fd53c2feb8684d, at least it doesn't 
cause any warning the related test case, and I tried several combination 
with preallocated and written and hole, still no chance.

I also ran btrfs/* with my patch applied on v4.11-rc2 (sorry, that's the 
correct original patch base), and except some known bug, it doesn't 
cause anything special.

Thanks,
Qu

> 
> 
> Meow!
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adam Borowski June 17, 2017, 2:57 p.m. UTC | #15
On Sat, Jun 17, 2017 at 09:28:30PM +0800, Qu Wenruo wrote:
> On 2017年06月17日 16:30, Adam Borowski wrote:
> > On Sat, Jun 17, 2017 at 03:43:18PM +0800, Qu Wenruo wrote:
> > > On 2017年06月16日 20:33, David Sterba wrote:
> > > > I see a lot of warnings from btrfs/094
> > > > 
> > > > [13458.628334] BTRFS warning (device sdb6): unhandled fiemap cache detected: offset=581632 phys=13078528 len=4096 flags=0x800
> > > 
> > > Strangely, I can't reproduce it either on my original branch (v4.12 with
> > > this patch), or torvalds/master (this patch is already merged with your
> > > renaming)
> > 
> > I for one get this a lot even in regular use.  Somehow, it always has Comm:
> > dpkg, despite the vast majority of activity on the system obviously not
> > being dpkg.
> 
> Is that with 0x800 flag? Or some other flag?

Always 0x2008:

[18330.861160] ------------[ cut here ]------------
[18330.861178] WARNING: CPU: 0 PID: 6735 at fs/btrfs/extent_io.c:4484 extent_fiemap+0x651/0x710
[18330.861180] Modules linked in: cp210x pl2303 usbserial nouveau video ttm
[18330.861197] CPU: 0 PID: 6735 Comm: dpkg Not tainted 4.12.0-rc5-debug-00017-g6afe0ac16d2b #3
[18330.861200] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
[18330.861204] task: ffff8801229f2680 task.stack: ffffc9000351c000
[18330.861211] RIP: 0010:extent_fiemap+0x651/0x710
[18330.861214] RSP: 0018:ffffc9000351fd60 EFLAGS: 00010202
[18330.861218] RAX: ffff8802214ee000 RBX: 0000000000020000 RCX: 0000000000000000
[18330.861220] RDX: 0000000000000000 RSI: ffff880187ef64d0 RDI: ffff88021aa10000
[18330.861223] RBP: ffffc9000351fe60 R08: 0000000000020000 R09: 0000000000000000
[18330.861225] R10: ffffffffffffffff R11: ffff880187ef64d0 R12: 0000000000020000
[18330.861228] R13: 0000000000000000 R14: ffff880219a87900 R15: ffff880219a87900
[18330.861232] FS:  00007f2f3d45e400(0000) GS:ffff88022fc00000(0000) knlGS:0000000000000000
[18330.861234] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[18330.861237] CR2: 0000556ab3a77d88 CR3: 000000011deec000 CR4: 00000000000006f0
[18330.861239] Call Trace:
[18330.861248]  ? btrfs_get_extent+0xa60/0xa60
[18330.861254]  btrfs_fiemap+0x4d/0x60
[18330.861260]  do_vfs_ioctl+0x3bc/0x5e0
[18330.861266]  SyS_ioctl+0x86/0xa0
[18330.861272]  entry_SYSCALL_64_fastpath+0x17/0x98
[18330.861276] RIP: 0033:0x7f2f3cd77e07
[18330.861279] RSP: 002b:00007fff17a92f08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[18330.861283] RAX: ffffffffffffffda RBX: 0000556ab35ead70 RCX: 00007f2f3cd77e07
[18330.861285] RDX: 00007fff17a92f50 RSI: 00000000c020660b RDI: 000000000000000a
[18330.861288] RBP: 0000000000000548 R08: 000000000000002b R09: 0000000000000052
[18330.861290] R10: 000000000000000a R11: 0000000000000246 R12: 00007fff17a92f20
[18330.861292] R13: 0000556ab15c1147 R14: ffffffffffffffff R15: 00000000000000a9
[18330.861295] Code: 04 fe ff ff 45 85 ed 4d 89 f7 0f 85 a6 fd ff ff 45 31 ed 80 7d 8f 00 48 8b 85 40 ff ff ff 48 8b b8 f0 01 00 00 0f 84 8b fd ff ff <0f> ff 4c 8b 6d a8 44 8b 65 88 48 c7 c6 f0 cc db 81 4c 8b 75 80 
[18330.861362] ---[ end trace fff11f4a8e5b834c ]---
[18330.861370] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=0 phys=2435798867968 len=131072 flags=0x2008

> I'm trying fallocated file with my original branch, but still no chance to
> reproduce it though.

Seems to be perfectly reproducible for me.  Triggers a few times per dpkg run.

> > For me, linus/master with a handful of not possibly relevant patches
> > (in fs/btrfs/ I have dedupe and defrag on files opened ro, raid5/6 warning,
> > raid5/6 incompat flag clearing).
> 
> linus/master without any extra patch is still the same?
> And which commit?

git describe `git merge-base linux/master 6afe0ac16d2b`
v4.12-rc5

I've pushed my exact tree as https://github.com/kilobyte/linux kb-4.12-rc5
but there's not a single commit that could be possibly related.

I can retry with exactly linus/master but only after some stuff I forgot to
put on screen is done (should be an hour or two) -- looking for another setup
that reproduces this warning is probably a waste of time.

> I'm using 1439ccf73d9c07654fdd5b4969fd53c2feb8684d, at least it doesn't
> cause any warning the related test case, and I tried several combination
> with preallocated and written and hole, still no chance.
> 
> I also ran btrfs/* with my patch applied on v4.11-rc2 (sorry, that's the
> correct original patch base), and except some known bug, it doesn't cause
> anything special.

Can try that too if it'd be useful.


喵!
Adam Borowski June 17, 2017, 9:24 p.m. UTC | #16
On Sat, Jun 17, 2017 at 09:28:30PM +0800, Qu Wenruo wrote:
> > I for one get this a lot even in regular use.  Somehow, it always has Comm:
> > dpkg, despite the vast majority of activity on the system obviously not
> > being dpkg.

> linus/master without any extra patch is still the same?
> And which commit?
> I'm using 1439ccf73d9c07654fdd5b4969fd53c2feb8684d, at least it doesn't
> cause any warning the related test case, and I tried several combination
> with preallocated and written and hole, still no chance.

Current linus/master:

[   39.726099] ------------[ cut here ]------------
[   39.726109] WARNING: CPU: 5 PID: 3382 at fs/btrfs/extent_io.c:4484 extent_fiemap+0x651/0x710
[   39.726110] Modules linked in: cp210x pl2303 usbserial nouveau video ttm
[   39.726120] CPU: 5 PID: 3382 Comm: dpkg Not tainted 4.12.0-rc5-debug-00219-gadc311034c35 #1
[   39.726122] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
[   39.726124] task: ffff880220844140 task.stack: ffffc90001b88000
[   39.726127] RIP: 0010:extent_fiemap+0x651/0x710
[   39.726129] RSP: 0018:ffffc90001b8bd60 EFLAGS: 00010202
[   39.726131] RAX: ffff88021824c800 RBX: 0000000000020000 RCX: 0000000000000000
[   39.726132] RDX: 0000000000000000 RSI: ffff880217fae9a0 RDI: ffff8802205d6000
[   39.726134] RBP: ffffc90001b8be60 R08: 0000000000020000 R09: 0000000000000000
[   39.726135] R10: ffffffffffffffff R11: ffff880217fae9a0 R12: 0000000000020000
[   39.726137] R13: 0000000000000000 R14: ffff880223685480 R15: ffff880223685480
[   39.726139] FS:  00007f45c1726400(0000) GS:ffff88022fd40000(0000) knlGS:0000000000000000
[   39.726140] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   39.726142] CR2: 000055e152d64000 CR3: 0000000213a8c000 CR4: 00000000000006e0
[   39.726143] Call Trace:
[   39.726149]  ? btrfs_get_extent+0xa60/0xa60
[   39.726153]  btrfs_fiemap+0x4d/0x60
[   39.726156]  do_vfs_ioctl+0x3bc/0x5e0
[   39.726159]  SyS_ioctl+0x86/0xa0
[   39.726163]  entry_SYSCALL_64_fastpath+0x17/0x98
[   39.726165] RIP: 0033:0x7f45c103fe07
[   39.726166] RSP: 002b:00007ffe327ecd88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   39.726169] RAX: ffffffffffffffda RBX: 000055e1528a8d60 RCX: 00007f45c103fe07
[   39.726170] RDX: 00007ffe327ecdd0 RSI: 00000000c020660b RDI: 000000000000000a
[   39.726171] RBP: 0000000000000548 R08: 000000000000002b R09: 0000000000000052
[   39.726173] R10: 000000000000000a R11: 0000000000000246 R12: 00007ffe327ecda0
[   39.726174] R13: 000055e150c12147 R14: ffffffffffffffff R15: 00000000000000a9
[   39.726175] Code: 04 fe ff ff 45 85 ed 4d 89 f7 0f 85 a6 fd ff ff 45 31 ed 80 7d 8f 00 48 8b 85 40 ff ff ff 48 8b b8 f0 01 00 00 0f 84 8b fd ff ff <0f> ff 4c 8b 6d a8 44 8b 65 88 48 c7 c6 90 c9 db 81 4c 8b 75 80 
[   39.726210] ---[ end trace 25106b0204355180 ]---
[   39.726215] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=0 phys=2435798867968 len=131072 flags=0x2008

> I also ran btrfs/* with my patch applied on v4.11-rc2 (sorry, that's the
> correct original patch base), and except some known bug, it doesn't cause
> anything special.

4.11-rc2 with nothing but 4751832d applied:

[  151.838761] ------------[ cut here ]------------
[  151.839176] WARNING: CPU: 0 PID: 2074 at fs/btrfs/extent_io.c:4460 extent_fiemap+0x662/0x740
[  151.840465] Modules linked in: pl2303 cp210x usbserial nouveau video ttm
[  151.842509] CPU: 0 PID: 2074 Comm: dpkg Not tainted 4.11.0-rc2-debug-00001-g2dbdec6c76b4 #1
[  151.844457] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
[  151.846551] Call Trace:
[  151.848625]  dump_stack+0x4d/0x6d
[  151.850636]  __warn+0xd3/0xf0
[  151.852589]  warn_slowpath_null+0x18/0x20
[  151.854552]  extent_fiemap+0x662/0x740
[  151.856466]  ? btrfs_get_extent+0xa60/0xa60
[  151.858400]  btrfs_fiemap+0x4d/0x60
[  151.860323]  do_vfs_ioctl+0x3bc/0x5e0
[  151.862216]  SyS_ioctl+0x86/0xa0
[  151.864183]  entry_SYSCALL_64_fastpath+0x17/0x98
[  151.866071] RIP: 0033:0x7f542e763e07
[  151.868045] RSP: 002b:00007fff9d737188 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  151.869995] RAX: ffffffffffffffda RBX: 000055cbf323bd80 RCX: 00007f542e763e07
[  151.872086] RDX: 00007fff9d7371d0 RSI: 00000000c020660b RDI: 000000000000000a
[  151.874127] RBP: 0000000000000548 R08: 000000000000002b R09: 0000000000000052
[  151.876246] R10: 000000000000000a R11: 0000000000000246 R12: 00007fff9d7371a0
[  151.878292] R13: 000055cbf28aa147 R14: ffffffffffffffff R15: 00000000000000a9
[  151.880487] ---[ end trace 135594ab8f41eada ]---
[  151.882586] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=0 phys=2435798867968 len=131072 flags=0x2008


喵!
Qu Wenruo June 18, 2017, 9:38 a.m. UTC | #17
On 2017年06月18日 05:24, Adam Borowski wrote:
> On Sat, Jun 17, 2017 at 09:28:30PM +0800, Qu Wenruo wrote:
>>> I for one get this a lot even in regular use.  Somehow, it always has Comm:
>>> dpkg, despite the vast majority of activity on the system obviously not
>>> being dpkg.
> 
>> linus/master without any extra patch is still the same?
>> And which commit?
>> I'm using 1439ccf73d9c07654fdd5b4969fd53c2feb8684d, at least it doesn't
>> cause any warning the related test case, and I tried several combination
>> with preallocated and written and hole, still no chance.
> 
> Current linus/master:
> 
> [   39.726099] ------------[ cut here ]------------
> [   39.726109] WARNING: CPU: 5 PID: 3382 at fs/btrfs/extent_io.c:4484 extent_fiemap+0x651/0x710
> [   39.726110] Modules linked in: cp210x pl2303 usbserial nouveau video ttm
> [   39.726120] CPU: 5 PID: 3382 Comm: dpkg Not tainted 4.12.0-rc5-debug-00219-gadc311034c35 #1
> [   39.726122] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
> [   39.726124] task: ffff880220844140 task.stack: ffffc90001b88000
> [   39.726127] RIP: 0010:extent_fiemap+0x651/0x710
> [   39.726129] RSP: 0018:ffffc90001b8bd60 EFLAGS: 00010202
> [   39.726131] RAX: ffff88021824c800 RBX: 0000000000020000 RCX: 0000000000000000
> [   39.726132] RDX: 0000000000000000 RSI: ffff880217fae9a0 RDI: ffff8802205d6000
> [   39.726134] RBP: ffffc90001b8be60 R08: 0000000000020000 R09: 0000000000000000
> [   39.726135] R10: ffffffffffffffff R11: ffff880217fae9a0 R12: 0000000000020000
> [   39.726137] R13: 0000000000000000 R14: ffff880223685480 R15: ffff880223685480
> [   39.726139] FS:  00007f45c1726400(0000) GS:ffff88022fd40000(0000) knlGS:0000000000000000
> [   39.726140] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   39.726142] CR2: 000055e152d64000 CR3: 0000000213a8c000 CR4: 00000000000006e0
> [   39.726143] Call Trace:
> [   39.726149]  ? btrfs_get_extent+0xa60/0xa60
> [   39.726153]  btrfs_fiemap+0x4d/0x60
> [   39.726156]  do_vfs_ioctl+0x3bc/0x5e0
> [   39.726159]  SyS_ioctl+0x86/0xa0
> [   39.726163]  entry_SYSCALL_64_fastpath+0x17/0x98
> [   39.726165] RIP: 0033:0x7f45c103fe07
> [   39.726166] RSP: 002b:00007ffe327ecd88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [   39.726169] RAX: ffffffffffffffda RBX: 000055e1528a8d60 RCX: 00007f45c103fe07
> [   39.726170] RDX: 00007ffe327ecdd0 RSI: 00000000c020660b RDI: 000000000000000a
> [   39.726171] RBP: 0000000000000548 R08: 000000000000002b R09: 0000000000000052
> [   39.726173] R10: 000000000000000a R11: 0000000000000246 R12: 00007ffe327ecda0
> [   39.726174] R13: 000055e150c12147 R14: ffffffffffffffff R15: 00000000000000a9
> [   39.726175] Code: 04 fe ff ff 45 85 ed 4d 89 f7 0f 85 a6 fd ff ff 45 31 ed 80 7d 8f 00 48 8b 85 40 ff ff ff 48 8b b8 f0 01 00 00 0f 84 8b fd ff ff <0f> ff 4c 8b 6d a8 44 8b 65 88 48 c7 c6 90 c9 db 81 4c 8b 75 80
> [   39.726210] ---[ end trace 25106b0204355180 ]---
> [   39.726215] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=phys$35798867968 len1072 flags=0x2008
>

Shared and encoded?

The length seems to be quite strange, maybe a compressed inline extent.

I'll try that when I'm back from LinuxCon China.

Thanks,
Qu

>> I also ran btrfs/* with my patch applied on v4.11-rc2 (sorry, that's the
>> correct original patch base), and except some known bug, it doesn't cause
>> anything special.
> 
> 4.11-rc2 with nothing but 4751832d applied:
> 
> [  151.838761] ------------[ cut here ]------------
> [  151.839176] WARNING: CPU: 0 PID: 2074 at fs/btrfs/extent_io.c:4460 extent_fiemap+0x662/0x740
> [  151.840465] Modules linked in: pl2303 cp210x usbserial nouveau video ttm
> [  151.842509] CPU: 0 PID: 2074 Comm: dpkg Not tainted 4.11.0-rc2-debug-00001-g2dbdec6c76b4 #1
> [  151.844457] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
> [  151.846551] Call Trace:
> [  151.848625]  dump_stack+0x4d/0x6d
> [  151.850636]  __warn+0xd3/0xf0
> [  151.852589]  warn_slowpath_null+0x18/0x20
> [  151.854552]  extent_fiemap+0x662/0x740
> [  151.856466]  ? btrfs_get_extent+0xa60/0xa60
> [  151.858400]  btrfs_fiemap+0x4d/0x60
> [  151.860323]  do_vfs_ioctl+0x3bc/0x5e0
> [  151.862216]  SyS_ioctl+0x86/0xa0
> [  151.864183]  entry_SYSCALL_64_fastpath+0x17/0x98
> [  151.866071] RIP: 0033:0x7f542e763e07
> [  151.868045] RSP: 002b:00007fff9d737188 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [  151.869995] RAX: ffffffffffffffda RBX: 000055cbf323bd80 RCX: 00007f542e763e07
> [  151.872086] RDX: 00007fff9d7371d0 RSI: 00000000c020660b RDI: 000000000000000a
> [  151.874127] RBP: 0000000000000548 R08: 000000000000002b R09: 0000000000000052
> [  151.876246] R10: 000000000000000a R11: 0000000000000246 R12: 00007fff9d7371a0
> [  151.878292] R13: 000055cbf28aa147 R14: ffffffffffffffff R15: 00000000000000a9
> [  151.880487] ---[ end trace 135594ab8f41eada ]---
> [  151.882586] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=phys$35798867968 len1072 flags=0x2008
> 
> 
> 喵!
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 18, 2017, 11:23 a.m. UTC | #18
On 2017年06月18日 17:38, Qu Wenruo wrote:
> 
> 
> On 2017年06月18日 05:24, Adam Borowski wrote:
>> On Sat, Jun 17, 2017 at 09:28:30PM +0800, Qu Wenruo wrote:
>>>> I for one get this a lot even in regular use.  Somehow, it always 
>>>> has Comm:
>>>> dpkg, despite the vast majority of activity on the system obviously not
>>>> being dpkg.
>>
>>> linus/master without any extra patch is still the same?
>>> And which commit?
>>> I'm using 1439ccf73d9c07654fdd5b4969fd53c2feb8684d, at least it doesn't
>>> cause any warning the related test case, and I tried several combination
>>> with preallocated and written and hole, still no chance.
>>
>> Current linus/master:
>>
>> [   39.726099] ------------[ cut here ]------------
>> [   39.726109] WARNING: CPU: 5 PID: 3382 at fs/btrfs/extent_io.c:4484 
>> extent_fiemap+0x651/0x710
>> [   39.726110] Modules linked in: cp210x pl2303 usbserial nouveau 
>> video ttm
>> [   39.726120] CPU: 5 PID: 3382 Comm: dpkg Not tainted 
>> 4.12.0-rc5-debug-00219-gadc311034c35 #1
>> [   39.726122] Hardware name: System manufacturer System Product 
>> Name/M4A77T, BIOS 2401    05/18/2011
>> [   39.726124] task: ffff880220844140 task.stack: ffffc90001b88000
>> [   39.726127] RIP: 0010:extent_fiemap+0x651/0x710
>> [   39.726129] RSP: 0018:ffffc90001b8bd60 EFLAGS: 00010202
>> [   39.726131] RAX: ffff88021824c800 RBX: 0000000000020000 RCX: 
>> 0000000000000000
>> [   39.726132] RDX: 0000000000000000 RSI: ffff880217fae9a0 RDI: 
>> ffff8802205d6000
>> [   39.726134] RBP: ffffc90001b8be60 R08: 0000000000020000 R09: 
>> 0000000000000000
>> [   39.726135] R10: ffffffffffffffff R11: ffff880217fae9a0 R12: 
>> 0000000000020000
>> [   39.726137] R13: 0000000000000000 R14: ffff880223685480 R15: 
>> ffff880223685480
>> [   39.726139] FS:  00007f45c1726400(0000) GS:ffff88022fd40000(0000) 
>> knlGS:0000000000000000
>> [   39.726140] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   39.726142] CR2: 000055e152d64000 CR3: 0000000213a8c000 CR4: 
>> 00000000000006e0
>> [   39.726143] Call Trace:
>> [   39.726149]  ? btrfs_get_extent+0xa60/0xa60
>> [   39.726153]  btrfs_fiemap+0x4d/0x60
>> [   39.726156]  do_vfs_ioctl+0x3bc/0x5e0
>> [   39.726159]  SyS_ioctl+0x86/0xa0
>> [   39.726163]  entry_SYSCALL_64_fastpath+0x17/0x98
>> [   39.726165] RIP: 0033:0x7f45c103fe07
>> [   39.726166] RSP: 002b:00007ffe327ecd88 EFLAGS: 00000246 ORIG_RAX: 
>> 0000000000000010
>> [   39.726169] RAX: ffffffffffffffda RBX: 000055e1528a8d60 RCX: 
>> 00007f45c103fe07
>> [   39.726170] RDX: 00007ffe327ecdd0 RSI: 00000000c020660b RDI: 
>> 000000000000000a
>> [   39.726171] RBP: 0000000000000548 R08: 000000000000002b R09: 
>> 0000000000000052
>> [   39.726173] R10: 000000000000000a R11: 0000000000000246 R12: 
>> 00007ffe327ecda0
>> [   39.726174] R13: 000055e150c12147 R14: ffffffffffffffff R15: 
>> 00000000000000a9
>> [   39.726175] Code: 04 fe ff ff 45 85 ed 4d 89 f7 0f 85 a6 fd ff ff 
>> 45 31 ed 80 7d 8f 00 48 8b 85 40 ff ff ff 48 8b b8 f0 01 00 00 0f 84 
>> 8b fd ff ff <0f> ff 4c 8b 6d a8 44 8b 65 88 48 c7 c6 90 c9 db 81 4c 8b 
>> 75 80
>> [   39.726210] ---[ end trace 25106b0204355180 ]---
>> [   39.726215] BTRFS warning (device sda1): unhandled fiemap cache 
>> detected: offset=phys$35798867968 len1072 flags=0x2008


Well, still no good news.

I created file extents which returns 0x2008 and the last extent with 0x9.
But still failed to reproduce the error message.

BTW, I noticed that your output is a little strange.
Normally we should have "offset=%llu phys=%llu len=%llu and flags=0x%x".

But your output seems a little out of shape.
And further more, the len (if it's correct) is not aligned even to 512 
bytes.

Seems something went wrong totally.

Thanks,
Qu

>>
> 
> Shared and encoded?
> 
> The length seems to be quite strange, maybe a compressed inline extent.
> 
> I'll try that when I'm back from LinuxCon China.
> 
> Thanks,
> Qu
> 
>>> I also ran btrfs/* with my patch applied on v4.11-rc2 (sorry, that's the
>>> correct original patch base), and except some known bug, it doesn't 
>>> cause
>>> anything special.
>>
>> 4.11-rc2 with nothing but 4751832d applied:
>>
>> [  151.838761] ------------[ cut here ]------------
>> [  151.839176] WARNING: CPU: 0 PID: 2074 at fs/btrfs/extent_io.c:4460 
>> extent_fiemap+0x662/0x740
>> [  151.840465] Modules linked in: pl2303 cp210x usbserial nouveau 
>> video ttm
>> [  151.842509] CPU: 0 PID: 2074 Comm: dpkg Not tainted 
>> 4.11.0-rc2-debug-00001-g2dbdec6c76b4 #1
>> [  151.844457] Hardware name: System manufacturer System Product 
>> Name/M4A77T, BIOS 2401    05/18/2011
>> [  151.846551] Call Trace:
>> [  151.848625]  dump_stack+0x4d/0x6d
>> [  151.850636]  __warn+0xd3/0xf0
>> [  151.852589]  warn_slowpath_null+0x18/0x20
>> [  151.854552]  extent_fiemap+0x662/0x740
>> [  151.856466]  ? btrfs_get_extent+0xa60/0xa60
>> [  151.858400]  btrfs_fiemap+0x4d/0x60
>> [  151.860323]  do_vfs_ioctl+0x3bc/0x5e0
>> [  151.862216]  SyS_ioctl+0x86/0xa0
>> [  151.864183]  entry_SYSCALL_64_fastpath+0x17/0x98
>> [  151.866071] RIP: 0033:0x7f542e763e07
>> [  151.868045] RSP: 002b:00007fff9d737188 EFLAGS: 00000246 ORIG_RAX: 
>> 0000000000000010
>> [  151.869995] RAX: ffffffffffffffda RBX: 000055cbf323bd80 RCX: 
>> 00007f542e763e07
>> [  151.872086] RDX: 00007fff9d7371d0 RSI: 00000000c020660b RDI: 
>> 000000000000000a
>> [  151.874127] RBP: 0000000000000548 R08: 000000000000002b R09: 
>> 0000000000000052
>> [  151.876246] R10: 000000000000000a R11: 0000000000000246 R12: 
>> 00007fff9d7371a0
>> [  151.878292] R13: 000055cbf28aa147 R14: ffffffffffffffff R15: 
>> 00000000000000a9
>> [  151.880487] ---[ end trace 135594ab8f41eada ]---
>> [  151.882586] BTRFS warning (device sda1): unhandled fiemap cache 
>> detected: offset=phys$35798867968 len1072 flags=0x2008
>>
>>
>> 喵!
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Holger Hoffstätte June 18, 2017, 11:56 a.m. UTC | #19
On 06/18/17 13:23, Qu Wenruo wrote:
> Well, still no good news.
> 
> I created file extents which returns 0x2008 and the last extent with 0x9.
> But still failed to reproduce the error message.
> 
> BTW, I noticed that your output is a little strange.
> Normally we should have "offset=%llu phys=%llu len=%llu and flags=0x%x".
> 
> But your output seems a little out of shape.
> And further more, the len (if it's correct) is not aligned even to 512 bytes.
> 
> Seems something went wrong totally.

That seems to be a mail decoding/display problem at your side.
Adam's original mail contained:

  offset=0 phys=2435798867968 len=131072 flags=0x2008

So length looks fine.

-h
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adam Borowski June 18, 2017, 1:42 p.m. UTC | #20
On Sun, Jun 18, 2017 at 07:23:00PM +0800, Qu Wenruo wrote:
> > > [   39.726215] BTRFS warning (device sda1): unhandled fiemap cache
> > > detected: offset=phys$35798867968 len1072 flags=0x2008

> > > [  151.882586] BTRFS warning (device sda1): unhandled fiemap cache
> > > detected: offset=phys$35798867968 len1072 flags=0x2008


> I created file extents which returns 0x2008 and the last extent with 0x9.
> But still failed to reproduce the error message.
> 
> BTW, I noticed that your output is a little strange.
> Normally we should have "offset=%llu phys=%llu len=%llu and flags=0x%x".
> 
> But your output seems a little out of shape.
> And further more, the len (if it's correct) is not aligned even to 512
> bytes.

[24031.309852] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=0 phys=2012381351936 len=131072 flags=0x2008

Eh?  All the numbers in my mails not only match that printf format but also
are nice round numbers; len being always 128KB (not in just what I posted,
also in all other occurences).

But, in your replies there's '$'blahblab and 0x13"1072".

> Seems something went wrong totally.

Perhaps there's some mail client borkage?  I see it ok in my Sent mbox but
it could have been corrupted in transit or on your side.

Here's my current dmesg: http://sprunge.us/ajag


Meow!
Qu Wenruo June 21, 2017, 8:13 a.m. UTC | #21
At 06/18/2017 09:42 PM, Adam Borowski wrote:
> On Sun, Jun 18, 2017 at 07:23:00PM +0800, Qu Wenruo wrote:
>>>> [   39.726215] BTRFS warning (device sda1): unhandled fiemap cache
>>>> detected: offset=phys$35798867968 len1072 flags=0x2008
> 
>>>> [  151.882586] BTRFS warning (device sda1): unhandled fiemap cache
>>>> detected: offset=phys$35798867968 len1072 flags=0x2008
> 
> 
>> I created file extents which returns 0x2008 and the last extent with 0x9.
>> But still failed to reproduce the error message.
>>
>> BTW, I noticed that your output is a little strange.
>> Normally we should have "offset=%llu phys=%llu len=%llu and flags=0x%x".
>>
>> But your output seems a little out of shape.
>> And further more, the len (if it's correct) is not aligned even to 512
>> bytes.
> 
> [24031.309852] BTRFS warning (device sda1): unhandled fiemap cache detected: offset=0 phys=2012381351936 len=131072 flags=0x2008
> 
> Eh?  All the numbers in my mails not only match that printf format but also
> are nice round numbers; len being always 128KB (not in just what I posted,
> also in all other occurences).

Well the new line seems pretty normal now.

I'll create a debug patch for you to get more info to pin down the bug.

> 
> But, in your replies there's '$'blahblab and 0x13"1072".
> 
>> Seems something went wrong totally.
> 
> Perhaps there's some mail client borkage?  I see it ok in my Sent mbox but
> it could have been corrupted in transit or on your side.
> 
> Here's my current dmesg: http://sprunge.us/ajag

In my work mailbox, everything is just as fine as yours.

But my gmx mailbox gives the wrong output even the message has the same 
message id (<20170617145725.y3wsex73aaf3vcje@angband.pl>).

I'd reduce the usage of my gmx mailbox.

Thanks,
Qu

> 
> 
> Meow!
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 28e8192..c4cb65d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4353,6 +4353,123 @@  static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	return NULL;
 }
 
+/*
+ * To cache previous fiemap extent
+ *
+ * Will be used for merging fiemap extent
+ */
+struct fiemap_cache {
+	u64 offset;
+	u64 phys;
+	u64 len;
+	u32 flags;
+	bool cached;
+};
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge current fiemap extent specified by @offset, @phys,
+ * @len and @flags with cached one.
+ * And only when we fails to merge, cached one will be submitted as
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int submit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+				struct fiemap_cache *cache,
+				u64 offset, u64 phys, u64 len, u32 flags)
+{
+	int ret = 0;
+
+	if (!cache->cached)
+		goto assign;
+
+	/*
+	 * Sanity check, extent_fiemap() should have ensured that new
+	 * fiemap extent won't overlap with cahced one.
+	 * Not recoverable.
+	 *
+	 * NOTE: Physical address can overlap, due to compression
+	 */
+	if (cache->offset + cache->len > offset) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	/*
+	 * Only merges fiemap extents if
+	 * 1) Their logical addresses are continuous
+	 *
+	 * 2) Their physical addresses are continuous
+	 *    So truly compressed (physical size smaller than logical size)
+	 *    extents won't get merged with each other
+	 *
+	 * 3) Share same flags except FIEMAP_EXTENT_LAST
+	 *    So regular extent won't get merged with prealloc extent
+	 */
+	if (cache->offset + cache->len  == offset &&
+	    cache->phys + cache->len == phys  &&
+	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
+			(flags & ~FIEMAP_EXTENT_LAST)) {
+		cache->len += len;
+		cache->flags |= flags;
+		goto try_submit_last;
+	}
+
+	/* Not mergeable, need to submit cached one */
+	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+				      cache->len, cache->flags);
+	cache->cached = false;
+	if (ret)
+		return ret;
+assign:
+	cache->cached = true;
+	cache->offset = offset;
+	cache->phys = phys;
+	cache->len = len;
+	cache->flags = flags;
+try_submit_last:
+	if (cache->flags & FIEMAP_EXTENT_LAST) {
+		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
+				cache->phys, cache->len, cache->flags);
+		cache->cached = false;
+	}
+	return ret;
+}
+
+/*
+ * Sanity check for fiemap cache
+ *
+ * All fiemap cache should be submitted by submit_fiemap_extent()
+ * Iteration should be terminated either by last fiemap extent or
+ * fieinfo->fi_extents_max.
+ * So no cached fiemap should exist.
+ */
+static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
+			       struct fiemap_extent_info *fieinfo,
+			       struct fiemap_cache *cache)
+{
+	int ret;
+
+	if (!cache->cached)
+		return 0;
+
+	/* Small and recoverbale problem, only to info developer */
+#ifdef CONFIG_BTRFS_DEBUG
+	WARN_ON(1);
+#endif
+	btrfs_warn(fs_info,
+		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
+		   cache->offset, cache->phys, cache->len, cache->flags);
+	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+				      cache->len, cache->flags);
+	cache->cached = false;
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4370,6 +4487,7 @@  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct fiemap_cache cache = { 0 };
 	int end = 0;
 	u64 em_start = 0;
 	u64 em_len = 0;
@@ -4549,8 +4667,8 @@  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= FIEMAP_EXTENT_LAST;
 			end = 1;
 		}
-		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-					      em_len, flags);
+		ret = submit_fiemap_extent(fieinfo, &cache, em_start, disko,
+					   em_len, flags);
 		if (ret) {
 			if (ret == 1)
 				ret = 0;
@@ -4558,6 +4676,8 @@  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 	}
 out_free:
+	if (!ret)
+		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
 	free_extent_map(em);
 out:
 	btrfs_free_path(path);