diff mbox series

[v2,15/18] btrfs: disk-io: introduce subpage metadata validation check

Message ID 20201210063905.75727-16-wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: add read-only support for subpage sector size | expand

Commit Message

Qu Wenruo Dec. 10, 2020, 6:39 a.m. UTC
For subpage metadata validation check, there are some difference:
- Read must finish in one bvec
  Since we're just reading one subpage range in one page, it should
  never be split into two bios nor two bvecs.

- How to grab the existing eb
  Instead of grabbing eb using page->private, we have to go search radix
  tree as we don't have any direct pointer at hand.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/disk-io.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

Comments

kernel test robot Dec. 10, 2020, 1:24 p.m. UTC | #1
Hi Qu,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kdave/for-next]
[also build test ERROR on next-20201209]
[cannot apply to btrfs/next v5.10-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qu-Wenruo/btrfs-add-read-only-support-for-subpage-sector-size/20201210-144442
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
config: nds32-randconfig-r004-20201209 (attached as .config)
compiler: nds32le-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/e01cdf51d0d32647697616c0dd08f2cc3220bde4
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qu-Wenruo/btrfs-add-read-only-support-for-subpage-sector-size/20201210-144442
        git checkout e01cdf51d0d32647697616c0dd08f2cc3220bde4
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=nds32 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   nds32le-linux-ld: fs/btrfs/disk-io.o: in function `btrfs_validate_metadata_buffer':
   disk-io.c:(.text+0x4200): undefined reference to `__udivdi3'
>> nds32le-linux-ld: disk-io.c:(.text+0x4204): undefined reference to `__udivdi3'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Dec. 10, 2020, 1:39 p.m. UTC | #2
Hi Qu,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kdave/for-next]
[also build test ERROR on next-20201210]
[cannot apply to v5.10-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qu-Wenruo/btrfs-add-read-only-support-for-subpage-sector-size/20201210-144442
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
config: i386-randconfig-a013-20201209 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/e01cdf51d0d32647697616c0dd08f2cc3220bde4
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qu-Wenruo/btrfs-add-read-only-support-for-subpage-sector-size/20201210-144442
        git checkout e01cdf51d0d32647697616c0dd08f2cc3220bde4
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: fs/btrfs/disk-io.o: in function `validate_subpage_buffer':
>> fs/btrfs/disk-io.c:619: undefined reference to `__udivdi3'

vim +619 fs/btrfs/disk-io.c

   593	
   594	static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
   595					   int mirror)
   596	{
   597		struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
   598		struct extent_buffer *eb;
   599		int reads_done;
   600		int ret = 0;
   601	
   602		if (!IS_ALIGNED(start, fs_info->sectorsize) ||
   603		    !IS_ALIGNED(end - start + 1, fs_info->sectorsize) ||
   604		    !IS_ALIGNED(end - start + 1, fs_info->nodesize)) {
   605			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
   606			btrfs_err(fs_info, "invalid tree read bytenr");
   607			return -EUCLEAN;
   608		}
   609	
   610		/*
   611		 * We don't allow bio merge for subpage metadata read, so we should
   612		 * only get one eb for each endio hook.
   613		 */
   614		ASSERT(end == start + fs_info->nodesize - 1);
   615		ASSERT(PagePrivate(page));
   616	
   617		rcu_read_lock();
   618		eb = radix_tree_lookup(&fs_info->buffer_radix,
 > 619				       start / fs_info->sectorsize);
   620		rcu_read_unlock();
   621	
   622		/*
   623		 * When we are reading one tree block, eb must have been
   624		 * inserted into the radix tree. If not something is wrong.
   625		 */
   626		if (!eb) {
   627			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
   628			btrfs_err(fs_info,
   629				"can't find extent buffer for bytenr %llu",
   630				start);
   631			return -EUCLEAN;
   632		}
   633		/*
   634		 * The pending IO might have been the only thing that kept
   635		 * this buffer in memory.  Make sure we have a ref for all
   636		 * this other checks
   637		 */
   638		atomic_inc(&eb->refs);
   639	
   640		reads_done = atomic_dec_and_test(&eb->io_pages);
   641		/* Subpage read must finish in page read */
   642		ASSERT(reads_done);
   643	
   644		eb->read_mirror = mirror;
   645		if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
   646			ret = -EIO;
   647			goto err;
   648		}
   649		ret = validate_extent_buffer(eb);
   650		if (ret < 0)
   651			goto err;
   652	
   653		if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
   654			btree_readahead_hook(eb, ret);
   655	
   656		set_extent_buffer_uptodate(eb);
   657	
   658		free_extent_buffer(eb);
   659		return ret;
   660	err:
   661		/*
   662		 * our io error hook is going to dec the io pages
   663		 * again, we have to make sure it has something to
   664		 * decrement
   665		 */
   666		atomic_inc(&eb->io_pages);
   667		clear_extent_buffer_uptodate(eb);
   668		free_extent_buffer(eb);
   669		return ret;
   670	}
   671	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Nikolay Borisov Dec. 14, 2020, 10:21 a.m. UTC | #3
On 10.12.20 г. 8:39 ч., Qu Wenruo wrote:
> For subpage metadata validation check, there are some difference:
> - Read must finish in one bvec
>   Since we're just reading one subpage range in one page, it should
>   never be split into two bios nor two bvecs.
> 
> - How to grab the existing eb
>   Instead of grabbing eb using page->private, we have to go search radix
>   tree as we don't have any direct pointer at hand.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/disk-io.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 82 insertions(+)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index b6c03a8b0c72..adda76895058 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -591,6 +591,84 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  	return ret;
>  }
>  
> +static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
> +				   int mirror)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
> +	struct extent_buffer *eb;
> +	int reads_done;
> +	int ret = 0;
> +
> +	if (!IS_ALIGNED(start, fs_info->sectorsize) ||

That's guaranteed by the allocator.

> +	    !IS_ALIGNED(end - start + 1, fs_info->sectorsize) ||
That's guaranteed by the fact that  nodesize is a multiple of sectorsize.

> +	    !IS_ALIGNED(end - start + 1, fs_info->nodesize)) {

And that's also guaranteed that the size of an eb is always a nodesize.
Also aren't those checks already performed by the tree-checker during
write? Just remove this as it adds noise.

> +		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));> +		btrfs_err(fs_info, "invalid tree read bytenr");
> +		return -EUCLEAN;
> +	}
> +
> +	/*
> +	 * We don't allow bio merge for subpage metadata read, so we should
> +	 * only get one eb for each endio hook.
> +	 */
> +	ASSERT(end == start + fs_info->nodesize - 1);
> +	ASSERT(PagePrivate(page));
> +
> +	rcu_read_lock();
> +	eb = radix_tree_lookup(&fs_info->buffer_radix,
> +			       start / fs_info->sectorsize);

This division op likely produces the kernel robot's warning. It could be
written to use >> fs_info->sectorsize_bits. Furthermore this usage of
radix tree + rcu without acquiring the refs is unsafe as per my
explanation of, essentially, identical issue in patch 12 and our offline
chat about it.

> +	rcu_read_unlock();
> +
> +	/*
> +	 * When we are reading one tree block, eb must have been
> +	 * inserted into the radix tree. If not something is wrong.
> +	 */
> +	if (!eb) {
> +		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
> +		btrfs_err(fs_info,
> +			"can't find extent buffer for bytenr %llu",
> +			start);
> +		return -EUCLEAN;
> +	}

That's impossible to execute and such a failure will result in a crash
so just remove this code.

> +	/*
> +	 * The pending IO might have been the only thing that kept
> +	 * this buffer in memory.  Make sure we have a ref for all
> +	 * this other checks
> +	 */
> +	atomic_inc(&eb->refs);
> +
> +	reads_done = atomic_dec_and_test(&eb->io_pages);
> +	/* Subpage read must finish in page read */
> +	ASSERT(reads_done);

Just ASSERT(atomic_dec_and_test(&eb->io_pages)). Again, for subpage I
think that's a bit much since it only has 1 page so it's guaranteed that
it will always be true.
> +
> +	eb->read_mirror = mirror;
> +	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
> +		ret = -EIO;
> +		goto err;
> +	}
> +	ret = validate_extent_buffer(eb);
> +	if (ret < 0)
> +		goto err;
> +
> +	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
> +		btree_readahead_hook(eb, ret);
> +
> +	set_extent_buffer_uptodate(eb);
> +
> +	free_extent_buffer(eb);
> +	return ret;
> +err:
> +	/*
> +	 * our io error hook is going to dec the io pages
> +	 * again, we have to make sure it has something to
> +	 * decrement
> +	 */

That comment is slightly ambiguous - it's not the io error hook that
does the decrement but end_bio_extent_readpage. Just rewrite the comment
to :

"end_bio_extent_readpage decrements io_pages in case of error, make sure
it has ...."

> +	atomic_inc(&eb->io_pages);
> +	clear_extent_buffer_uptodate(eb);
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
>  int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
>  				   struct page *page, u64 start, u64 end,
>  				   int mirror)
> @@ -600,6 +678,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
>  	int reads_done;
>  
>  	ASSERT(page->private);
> +
> +	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
> +		return validate_subpage_buffer(page, start, end, mirror);

nit: validate_metadata_buffer is called in only once place so I'm
wondering won't it make it more readable if this check is lifted to its
sole caller so that when reading end_bio_extent_readpage it's apparent
what's going on. Though it's apparent that the nesting in the caller
will get somewhat unwieldy so won't be pressing hard for this.
> +
>  	eb = (struct extent_buffer *)page->private;
>  
>  
>
Qu Wenruo Dec. 14, 2020, 10:50 a.m. UTC | #4
On 2020/12/14 下午6:21, Nikolay Borisov wrote:
>
>
> On 10.12.20 г. 8:39 ч., Qu Wenruo wrote:
>> For subpage metadata validation check, there are some difference:
>> - Read must finish in one bvec
>>    Since we're just reading one subpage range in one page, it should
>>    never be split into two bios nor two bvecs.
>>
>> - How to grab the existing eb
>>    Instead of grabbing eb using page->private, we have to go search radix
>>    tree as we don't have any direct pointer at hand.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/disk-io.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 82 insertions(+)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index b6c03a8b0c72..adda76895058 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -591,6 +591,84 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>>   	return ret;
>>   }
>>
>> +static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
>> +				   int mirror)
>> +{
>> +	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
>> +	struct extent_buffer *eb;
>> +	int reads_done;
>> +	int ret = 0;
>> +
>> +	if (!IS_ALIGNED(start, fs_info->sectorsize) ||
>
> That's guaranteed by the allocator.
>
>> +	    !IS_ALIGNED(end - start + 1, fs_info->sectorsize) ||
> That's guaranteed by the fact that  nodesize is a multiple of sectorsize.
>
>> +	    !IS_ALIGNED(end - start + 1, fs_info->nodesize)) {
>
> And that's also guaranteed that the size of an eb is always a nodesize.
> Also aren't those checks already performed by the tree-checker during
> write? Just remove this as it adds noise.
>
>> +		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));> +		btrfs_err(fs_info, "invalid tree read bytenr");
>> +		return -EUCLEAN;
>> +	}
>> +
>> +	/*
>> +	 * We don't allow bio merge for subpage metadata read, so we should
>> +	 * only get one eb for each endio hook.
>> +	 */
>> +	ASSERT(end == start + fs_info->nodesize - 1);
>> +	ASSERT(PagePrivate(page));
>> +
>> +	rcu_read_lock();
>> +	eb = radix_tree_lookup(&fs_info->buffer_radix,
>> +			       start / fs_info->sectorsize);
>
> This division op likely produces the kernel robot's warning. It could be
> written to use >> fs_info->sectorsize_bits. Furthermore this usage of
> radix tree + rcu without acquiring the refs is unsafe as per my
> explanation of, essentially, identical issue in patch 12 and our offline
> chat about it.

Another relic I forgot in the long update history, nice find.

>
>> +	rcu_read_unlock();
>> +
>> +	/*
>> +	 * When we are reading one tree block, eb must have been
>> +	 * inserted into the radix tree. If not something is wrong.
>> +	 */
>> +	if (!eb) {
>> +		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
>> +		btrfs_err(fs_info,
>> +			"can't find extent buffer for bytenr %llu",
>> +			start);
>> +		return -EUCLEAN;
>> +	}
>
> That's impossible to execute and such a failure will result in a crash
> so just remove this code.
>
>> +	/*
>> +	 * The pending IO might have been the only thing that kept
>> +	 * this buffer in memory.  Make sure we have a ref for all
>> +	 * this other checks
>> +	 */
>> +	atomic_inc(&eb->refs);
>> +
>> +	reads_done = atomic_dec_and_test(&eb->io_pages);
>> +	/* Subpage read must finish in page read */
>> +	ASSERT(reads_done);
>
> Just ASSERT(atomic_dec_and_test(&eb->io_pages)). Again, for subpage I
> think that's a bit much since it only has 1 page so it's guaranteed that
> it will always be true.

IIRC ASSERT() won't execute whatever in it for non debug build.
Thus ASSERT(atomic_*) would cause non-debug kernel not to decrease the
io_pages and hangs the system.

Exactly the pitfall I'm thinking of.

Thanks,
Qu

>> +
>> +	eb->read_mirror = mirror;
>> +	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
>> +		ret = -EIO;
>> +		goto err;
>> +	}
>> +	ret = validate_extent_buffer(eb);
>> +	if (ret < 0)
>> +		goto err;
>> +
>> +	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
>> +		btree_readahead_hook(eb, ret);
>> +
>> +	set_extent_buffer_uptodate(eb);
>> +
>> +	free_extent_buffer(eb);
>> +	return ret;
>> +err:
>> +	/*
>> +	 * our io error hook is going to dec the io pages
>> +	 * again, we have to make sure it has something to
>> +	 * decrement
>> +	 */
>
> That comment is slightly ambiguous - it's not the io error hook that
> does the decrement but end_bio_extent_readpage. Just rewrite the comment
> to :
>
> "end_bio_extent_readpage decrements io_pages in case of error, make sure
> it has ...."
>
>> +	atomic_inc(&eb->io_pages);
>> +	clear_extent_buffer_uptodate(eb);
>> +	free_extent_buffer(eb);
>> +	return ret;
>> +}
>> +
>>   int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
>>   				   struct page *page, u64 start, u64 end,
>>   				   int mirror)
>> @@ -600,6 +678,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
>>   	int reads_done;
>>
>>   	ASSERT(page->private);
>> +
>> +	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
>> +		return validate_subpage_buffer(page, start, end, mirror);
>
> nit: validate_metadata_buffer is called in only once place so I'm
> wondering won't it make it more readable if this check is lifted to its
> sole caller so that when reading end_bio_extent_readpage it's apparent
> what's going on. Though it's apparent that the nesting in the caller
> will get somewhat unwieldy so won't be pressing hard for this.
>> +
>>   	eb = (struct extent_buffer *)page->private;
>>
>>
>>
Nikolay Borisov Dec. 14, 2020, 11:17 a.m. UTC | #5
On 14.12.20 г. 12:50 ч., Qu Wenruo wrote:
> 
> IIRC ASSERT() won't execute whatever in it for non debug build.
> Thus ASSERT(atomic_*) would cause non-debug kernel not to decrease the
> io_pages and hangs the system.

Nope: 

  3362 #ifdef CONFIG_BTRFS_ASSERT                                                      
     1 __cold __noreturn                                                               
     2 static inline void assertfail(const char *expr, const char *file, int line)     
     3 {                                                                               
     4         pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);           
     5         BUG();                                                                  
     6 }                                                                               
     7                                                                                 
     8 #define ASSERT(expr)                                            \               
     9         (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))        
    10                                                                                 
    11 #else                                                                           
    12 static inline void assertfail(const char *expr, const char* file, int line) { } 
    13 #define ASSERT(expr)    (void)(expr)               <-- expression is evaluated.                   
    14 #endif
Qu Wenruo Dec. 14, 2020, 11:32 a.m. UTC | #6
On 2020/12/14 下午7:17, Nikolay Borisov wrote:
>
>
> On 14.12.20 г. 12:50 ч., Qu Wenruo wrote:
>>
>> IIRC ASSERT() won't execute whatever in it for non debug build.
>> Thus ASSERT(atomic_*) would cause non-debug kernel not to decrease the
>> io_pages and hangs the system.
>
> Nope:
>
>    3362 #ifdef CONFIG_BTRFS_ASSERT
>       1 __cold __noreturn
>       2 static inline void assertfail(const char *expr, const char *file, int line)
>       3 {
>       4         pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
>       5         BUG();
>       6 }
>       7
>       8 #define ASSERT(expr)                                            \
>       9         (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
>      10
>      11 #else
>      12 static inline void assertfail(const char *expr, const char* file, int line) { }
>      13 #define ASSERT(expr)    (void)(expr)               <-- expression is evaluated.
>      14 #endif
>
Wow, that's too tricky and maybe that's the reason why Josef is
complaining about the ASSERT()s slows down the system.

In fact, from the assert(3) man page, we're doing things differently
than user space at least:

   If the macro NDEBUG is defined at the moment <assert.h> was last
   included, the macro assert() generates no code, and hence does nothing
   at all.

So I'm confused, what's the proper way to do ASSERT()?

Thanks,
Qu
Nikolay Borisov Dec. 14, 2020, 12:40 p.m. UTC | #7
On 14.12.20 г. 13:32 ч., Qu Wenruo wrote:
> 
> 
> On 2020/12/14 下午7:17, Nikolay Borisov wrote:
>>
>>
>> On 14.12.20 г. 12:50 ч., Qu Wenruo wrote:
>>>
>>> IIRC ASSERT() won't execute whatever in it for non debug build.
>>> Thus ASSERT(atomic_*) would cause non-debug kernel not to decrease the
>>> io_pages and hangs the system.
>>
>> Nope:
>>
>>    3362 #ifdef CONFIG_BTRFS_ASSERT
>>       1 __cold __noreturn
>>       2 static inline void assertfail(const char *expr, const char
>> *file, int line)
>>       3 {
>>       4         pr_err("assertion failed: %s, in %s:%d\n", expr, file,
>> line);
>>       5         BUG();
>>       6 }
>>       7
>>       8 #define ASSERT(expr)                                            \
>>       9         (likely(expr) ? (void)0 : assertfail(#expr, __FILE__,
>> __LINE__))
>>      10
>>      11 #else
>>      12 static inline void assertfail(const char *expr, const char*
>> file, int line) { }
>>      13 #define ASSERT(expr)    (void)(expr)               <--
>> expression is evaluated.
>>      14 #endif
>>
> Wow, that's too tricky and maybe that's the reason why Josef is
> complaining about the ASSERT()s slows down the system.
> 
> In fact, from the assert(3) man page, we're doing things differently
> than user space at least:
> 
>   If the macro NDEBUG is defined at the moment <assert.h> was last
>   included, the macro assert() generates no code, and hence does nothing
>   at all.
> 
> So I'm confused, what's the proper way to do ASSERT()?

Well as it stands now, what I suggested would work. OTOH this really
puts forward the question why do we leave code around (well, the
compiler should really eliminate those redundant checks). Hm, David?



> 
> Thanks,
> Qu
>
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b6c03a8b0c72..adda76895058 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -591,6 +591,84 @@  static int validate_extent_buffer(struct extent_buffer *eb)
 	return ret;
 }
 
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+				   int mirror)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct extent_buffer *eb;
+	int reads_done;
+	int ret = 0;
+
+	if (!IS_ALIGNED(start, fs_info->sectorsize) ||
+	    !IS_ALIGNED(end - start + 1, fs_info->sectorsize) ||
+	    !IS_ALIGNED(end - start + 1, fs_info->nodesize)) {
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		btrfs_err(fs_info, "invalid tree read bytenr");
+		return -EUCLEAN;
+	}
+
+	/*
+	 * We don't allow bio merge for subpage metadata read, so we should
+	 * only get one eb for each endio hook.
+	 */
+	ASSERT(end == start + fs_info->nodesize - 1);
+	ASSERT(PagePrivate(page));
+
+	rcu_read_lock();
+	eb = radix_tree_lookup(&fs_info->buffer_radix,
+			       start / fs_info->sectorsize);
+	rcu_read_unlock();
+
+	/*
+	 * When we are reading one tree block, eb must have been
+	 * inserted into the radix tree. If not something is wrong.
+	 */
+	if (!eb) {
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		btrfs_err(fs_info,
+			"can't find extent buffer for bytenr %llu",
+			start);
+		return -EUCLEAN;
+	}
+	/*
+	 * The pending IO might have been the only thing that kept
+	 * this buffer in memory.  Make sure we have a ref for all
+	 * this other checks
+	 */
+	atomic_inc(&eb->refs);
+
+	reads_done = atomic_dec_and_test(&eb->io_pages);
+	/* Subpage read must finish in page read */
+	ASSERT(reads_done);
+
+	eb->read_mirror = mirror;
+	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+		ret = -EIO;
+		goto err;
+	}
+	ret = validate_extent_buffer(eb);
+	if (ret < 0)
+		goto err;
+
+	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+		btree_readahead_hook(eb, ret);
+
+	set_extent_buffer_uptodate(eb);
+
+	free_extent_buffer(eb);
+	return ret;
+err:
+	/*
+	 * our io error hook is going to dec the io pages
+	 * again, we have to make sure it has something to
+	 * decrement
+	 */
+	atomic_inc(&eb->io_pages);
+	clear_extent_buffer_uptodate(eb);
+	free_extent_buffer(eb);
+	return ret;
+}
+
 int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
 				   struct page *page, u64 start, u64 end,
 				   int mirror)
@@ -600,6 +678,10 @@  int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
 	int reads_done;
 
 	ASSERT(page->private);
+
+	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+		return validate_subpage_buffer(page, start, end, mirror);
+
 	eb = (struct extent_buffer *)page->private;