diff mbox series

[RFC,v2,02/25] ext4: convert to exclusive lock while inserting delalloc extents

Message ID 20240102123918.799062-3-yi.zhang@huaweicloud.com (mailing list archive)
State New
Headers show
Series ext4: use iomap for regular file's buffered IO path and enable large foilo | expand

Commit Message

Zhang Yi Jan. 2, 2024, 12:38 p.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

ext4_da_map_blocks() only hold i_data_sem in shared mode and i_rwsem
when inserting delalloc extents, it could be raced by another querying
path of ext4_map_blocks() without i_rwsem, .e.g buffered read path.
Suppose we buffered read a file containing just a hole, and without any
cached extents tree, then it is raced by another delayed buffered write
to the same area or the near area belongs to the same hole, and the new
delalloc extent could be overwritten to a hole extent.

 pread()                           pwrite()
  filemap_read_folio()
   ext4_mpage_readpages()
    ext4_map_blocks()
     down_read(i_data_sem)
     ext4_ext_determine_hole()
     //find hole
     ext4_ext_put_gap_in_cache()
      ext4_es_find_extent_range()
      //no delalloc extent
                                    ext4_da_map_blocks()
                                     down_read(i_data_sem)
                                     ext4_insert_delayed_block()
                                     //insert delalloc extent
      ext4_es_insert_extent()
      //overwrite delalloc extent to hole

This race could lead to inconsistent delalloc extents tree and
incorrect reserved space counter. Fix this by converting to hold
i_data_sem in exclusive mode when adding a new delalloc extent in
ext4_da_map_blocks().

Cc: stable@vger.kernel.org
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

Comments

Jan Kara Jan. 3, 2024, 10:03 a.m. UTC | #1
On Tue 02-01-24 20:38:55, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> ext4_da_map_blocks() only hold i_data_sem in shared mode and i_rwsem
> when inserting delalloc extents, it could be raced by another querying
> path of ext4_map_blocks() without i_rwsem, .e.g buffered read path.
> Suppose we buffered read a file containing just a hole, and without any
> cached extents tree, then it is raced by another delayed buffered write
> to the same area or the near area belongs to the same hole, and the new
> delalloc extent could be overwritten to a hole extent.
> 
>  pread()                           pwrite()
>   filemap_read_folio()
>    ext4_mpage_readpages()
>     ext4_map_blocks()
>      down_read(i_data_sem)
>      ext4_ext_determine_hole()
>      //find hole
>      ext4_ext_put_gap_in_cache()
>       ext4_es_find_extent_range()
>       //no delalloc extent
>                                     ext4_da_map_blocks()
>                                      down_read(i_data_sem)
>                                      ext4_insert_delayed_block()
>                                      //insert delalloc extent
>       ext4_es_insert_extent()
>       //overwrite delalloc extent to hole
> 
> This race could lead to inconsistent delalloc extents tree and
> incorrect reserved space counter. Fix this by converting to hold
> i_data_sem in exclusive mode when adding a new delalloc extent in
> ext4_da_map_blocks().
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> Suggested-by: Jan Kara <jack@suse.cz>

Looks good to me! Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/inode.c | 25 +++++++++++--------------
>  1 file changed, 11 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5b0d3075be12..142c67f5c7fc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1703,10 +1703,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  
>  	/* Lookup extent status tree firstly */
>  	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
> -		if (ext4_es_is_hole(&es)) {
> -			down_read(&EXT4_I(inode)->i_data_sem);
> +		if (ext4_es_is_hole(&es))
>  			goto add_delayed;
> -		}
>  
>  		/*
>  		 * Delayed extent could be allocated by fallocate.
> @@ -1748,8 +1746,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
>  	else
>  		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
> -	if (retval < 0)
> -		goto out_unlock;
> +	if (retval < 0) {
> +		up_read(&EXT4_I(inode)->i_data_sem);
> +		return retval;
> +	}
>  	if (retval > 0) {
>  		unsigned int status;
>  
> @@ -1765,24 +1765,21 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
>  		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
>  				      map->m_pblk, status);
> -		goto out_unlock;
> +		up_read(&EXT4_I(inode)->i_data_sem);
> +		return retval;
>  	}
> +	up_read(&EXT4_I(inode)->i_data_sem);
>  
>  add_delayed:
> -	/*
> -	 * XXX: __block_prepare_write() unmaps passed block,
> -	 * is it OK?
> -	 */
> +	down_write(&EXT4_I(inode)->i_data_sem);
>  	retval = ext4_insert_delayed_block(inode, map->m_lblk);
> +	up_write(&EXT4_I(inode)->i_data_sem);
>  	if (retval)
> -		goto out_unlock;
> +		return retval;
>  
>  	map_bh(bh, inode->i_sb, invalid_block);
>  	set_buffer_new(bh);
>  	set_buffer_delay(bh);
> -
> -out_unlock:
> -	up_read((&EXT4_I(inode)->i_data_sem));
>  	return retval;
>  }
>  
> -- 
> 2.39.2
>
diff mbox series

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b0d3075be12..142c67f5c7fc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1703,10 +1703,8 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
-		if (ext4_es_is_hole(&es)) {
-			down_read(&EXT4_I(inode)->i_data_sem);
+		if (ext4_es_is_hole(&es))
 			goto add_delayed;
-		}
 
 		/*
 		 * Delayed extent could be allocated by fallocate.
@@ -1748,8 +1746,10 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
 	else
 		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
-	if (retval < 0)
-		goto out_unlock;
+	if (retval < 0) {
+		up_read(&EXT4_I(inode)->i_data_sem);
+		return retval;
+	}
 	if (retval > 0) {
 		unsigned int status;
 
@@ -1765,24 +1765,21 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 				      map->m_pblk, status);
-		goto out_unlock;
+		up_read(&EXT4_I(inode)->i_data_sem);
+		return retval;
 	}
+	up_read(&EXT4_I(inode)->i_data_sem);
 
 add_delayed:
-	/*
-	 * XXX: __block_prepare_write() unmaps passed block,
-	 * is it OK?
-	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
 	retval = ext4_insert_delayed_block(inode, map->m_lblk);
+	up_write(&EXT4_I(inode)->i_data_sem);
 	if (retval)
-		goto out_unlock;
+		return retval;
 
 	map_bh(bh, inode->i_sb, invalid_block);
 	set_buffer_new(bh);
 	set_buffer_delay(bh);
-
-out_unlock:
-	up_read((&EXT4_I(inode)->i_data_sem));
 	return retval;
 }