diff mbox series

[v10,06/14] btrfs: optionally extend i_size in cow_file_range_inline()

Message ID a00b59623219c8a07f2c22f80ef1466d0f182d77.1629234193.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show
Series btrfs: add ioctls and send/receive support for reading/writing compressed data | expand

Commit Message

Omar Sandoval Aug. 17, 2021, 9:06 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Currently, an inline extent is always created after i_size is extended
from btrfs_dirty_pages(). However, for encoded writes, we only want to
update i_size after we successfully created the inline extent. Add an
update_i_size parameter to cow_file_range_inline() and
insert_inline_extent() and pass in the size of the extent rather than
determining it from i_size. Since the start parameter is always passed
as 0, get rid of it and simplify the logic in these two functions. While
we're here, let's document the requirements for creating an inline
extent.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/inode.c | 100 +++++++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 52 deletions(-)

Comments

Nikolay Borisov Aug. 20, 2021, 8:51 a.m. UTC | #1
On 18.08.21 г. 0:06, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Currently, an inline extent is always created after i_size is extended
> from btrfs_dirty_pages(). However, for encoded writes, we only want to
> update i_size after we successfully created the inline extent. Add an
> update_i_size parameter to cow_file_range_inline() and
> insert_inline_extent() and pass in the size of the extent rather than
> determining it from i_size. Since the start parameter is always passed
> as 0, get rid of it and simplify the logic in these two functions. While
> we're here, let's document the requirements for creating an inline
> extent.
> 
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/btrfs/inode.c | 100 +++++++++++++++++++++++------------------------
>  1 file changed, 48 insertions(+), 52 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 708d8ab098bc..0b5ff14aa7fd 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -236,9 +236,10 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
>  static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  				struct btrfs_path *path, bool extent_inserted,
>  				struct btrfs_root *root, struct inode *inode,
> -				u64 start, size_t size, size_t compressed_size,
> +				size_t size, size_t compressed_size,
>  				int compress_type,
> -				struct page **compressed_pages)
> +				struct page **compressed_pages,
> +				bool update_i_size)
>  {
>  	struct extent_buffer *leaf;
>  	struct page *page = NULL;
> @@ -247,7 +248,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  	struct btrfs_file_extent_item *ei;
>  	int ret;
>  	size_t cur_size = size;
> -	unsigned long offset;
> +	u64 i_size;
>  
>  	ASSERT((compressed_size > 0 && compressed_pages) ||
>  	       (compressed_size == 0 && !compressed_pages));
> @@ -260,7 +261,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  		size_t datasize;
>  
>  		key.objectid = btrfs_ino(BTRFS_I(inode));
> -		key.offset = start;
> +		key.offset = 0;
>  		key.type = BTRFS_EXTENT_DATA_KEY;
>  
>  		datasize = btrfs_file_extent_calc_inline_size(cur_size);
> @@ -297,12 +298,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  		btrfs_set_file_extent_compression(leaf, ei,
>  						  compress_type);
>  	} else {
> -		page = find_get_page(inode->i_mapping,
> -				     start >> PAGE_SHIFT);
> +		page = find_get_page(inode->i_mapping, 0);
>  		btrfs_set_file_extent_compression(leaf, ei, 0);
>  		kaddr = kmap_atomic(page);
> -		offset = offset_in_page(start);
> -		write_extent_buffer(leaf, kaddr + offset, ptr, size);
> +		write_extent_buffer(leaf, kaddr, ptr, size);
>  		kunmap_atomic(kaddr);
>  		put_page(page);
>  	}
> @@ -313,8 +312,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  	 * We align size to sectorsize for inline extents just for simplicity
>  	 * sake.
>  	 */
> -	size = ALIGN(size, root->fs_info->sectorsize);
> -	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
> +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
> +					ALIGN(size, root->fs_info->sectorsize));
>  	if (ret)
>  		goto fail;
>  
> @@ -327,7 +326,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>  	 * before we unlock the pages.  Otherwise we
>  	 * could end up racing with unlink.
>  	 */
> -	BTRFS_I(inode)->disk_i_size = inode->i_size;
> +	i_size = i_size_read(inode);
> +	if (update_i_size && size > i_size) {
> +		i_size_write(inode, size);
> +		i_size = size;
> +	}
> +	BTRFS_I(inode)->disk_i_size = i_size;
> +
>  fail:
>  	return ret;
>  }
> @@ -338,35 +343,31 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>   * does the checks required to make sure the data is small enough
>   * to fit as an inline extent.
>   */
> -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
> -					  u64 end, size_t compressed_size,
> +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
> +					  size_t compressed_size,
>  					  int compress_type,
> -					  struct page **compressed_pages)
> +					  struct page **compressed_pages,
> +					  bool update_i_size)
>  {
>  	struct btrfs_drop_extents_args drop_args = { 0 };
>  	struct btrfs_root *root = inode->root;
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	struct btrfs_trans_handle *trans;
> -	u64 isize = i_size_read(&inode->vfs_inode);
> -	u64 actual_end = min(end + 1, isize);
> -	u64 inline_len = actual_end - start;
> -	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
> -	u64 data_len = inline_len;
> +	u64 data_len = compressed_size ? compressed_size : size;
>  	int ret;
>  	struct btrfs_path *path;
>  
> -	if (compressed_size)
> -		data_len = compressed_size;
> -
> -	if (start > 0 ||
> -	    actual_end > fs_info->sectorsize ||
> +	/*
> +	 * We can create an inline extent if it ends at or beyond the current
> +	 * i_size, is no larger than a sector (decompressed), and the (possibly
> +	 * compressed) data fits in a leaf and the configured maximum inline
> +	 * size.
> +	 */

Urgh, just some days ago Qu was talking about how awkward it is to have
mixed extents in a file. And now, AFAIU, you are making them more likely
since now they can be created not just at the beginning of the file but
also after i_size write. While this won't be a problem in and of itself
it goes just the opposite way of us trying to shrink the possible cases
when we can have mixed extents. Qu, what is your take on that?

<snip>
Qu Wenruo Aug. 20, 2021, 9:13 a.m. UTC | #2
On 2021/8/20 下午4:51, Nikolay Borisov wrote:
>
>
> On 18.08.21 г. 0:06, Omar Sandoval wrote:
>> From: Omar Sandoval <osandov@fb.com>
>>
>> Currently, an inline extent is always created after i_size is extended
>> from btrfs_dirty_pages(). However, for encoded writes, we only want to
>> update i_size after we successfully created the inline extent.

To me, the idea of writing first and then updating isize is just going to
cause tons of inline extent related problems.

The current example is falloc, which only updates the isize after the
falloc finishes.

This behavior has already bothered me quite a lot, as it can easily
create mixed inline and regular extents.

Can't we remember the old isize (with proper locking), enlarge isize
(with holes filled), and then do the write?

If something goes wrong, we truncate the isize back to its old value.

>> Add an
>> update_i_size parameter to cow_file_range_inline() and
>> insert_inline_extent() and pass in the size of the extent rather than
>> determining it from i_size. Since the start parameter is always passed
>> as 0, get rid of it and simplify the logic in these two functions. While
>> we're here, let's document the requirements for creating an inline
>> extent.
>>
>> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
>> Signed-off-by: Omar Sandoval <osandov@fb.com>
>> ---
>>   fs/btrfs/inode.c | 100 +++++++++++++++++++++++------------------------
>>   1 file changed, 48 insertions(+), 52 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 708d8ab098bc..0b5ff14aa7fd 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -236,9 +236,10 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
>>   static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   				struct btrfs_path *path, bool extent_inserted,
>>   				struct btrfs_root *root, struct inode *inode,
>> -				u64 start, size_t size, size_t compressed_size,
>> +				size_t size, size_t compressed_size,
>>   				int compress_type,
>> -				struct page **compressed_pages)
>> +				struct page **compressed_pages,
>> +				bool update_i_size)
>>   {
>>   	struct extent_buffer *leaf;
>>   	struct page *page = NULL;
>> @@ -247,7 +248,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   	struct btrfs_file_extent_item *ei;
>>   	int ret;
>>   	size_t cur_size = size;
>> -	unsigned long offset;
>> +	u64 i_size;
>>
>>   	ASSERT((compressed_size > 0 && compressed_pages) ||
>>   	       (compressed_size == 0 && !compressed_pages));
>> @@ -260,7 +261,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   		size_t datasize;
>>
>>   		key.objectid = btrfs_ino(BTRFS_I(inode));
>> -		key.offset = start;
>> +		key.offset = 0;
>>   		key.type = BTRFS_EXTENT_DATA_KEY;
>>
>>   		datasize = btrfs_file_extent_calc_inline_size(cur_size);
>> @@ -297,12 +298,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   		btrfs_set_file_extent_compression(leaf, ei,
>>   						  compress_type);
>>   	} else {
>> -		page = find_get_page(inode->i_mapping,
>> -				     start >> PAGE_SHIFT);
>> +		page = find_get_page(inode->i_mapping, 0);
>>   		btrfs_set_file_extent_compression(leaf, ei, 0);
>>   		kaddr = kmap_atomic(page);
>> -		offset = offset_in_page(start);
>> -		write_extent_buffer(leaf, kaddr + offset, ptr, size);
>> +		write_extent_buffer(leaf, kaddr, ptr, size);
>>   		kunmap_atomic(kaddr);
>>   		put_page(page);
>>   	}
>> @@ -313,8 +312,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   	 * We align size to sectorsize for inline extents just for simplicity
>>   	 * sake.
>>   	 */
>> -	size = ALIGN(size, root->fs_info->sectorsize);
>> -	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
>> +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
>> +					ALIGN(size, root->fs_info->sectorsize));
>>   	if (ret)
>>   		goto fail;
>>
>> @@ -327,7 +326,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>   	 * before we unlock the pages.  Otherwise we
>>   	 * could end up racing with unlink.
>>   	 */
>> -	BTRFS_I(inode)->disk_i_size = inode->i_size;
>> +	i_size = i_size_read(inode);
>> +	if (update_i_size && size > i_size) {
>> +		i_size_write(inode, size);
>> +		i_size = size;
>> +	}
>> +	BTRFS_I(inode)->disk_i_size = i_size;
>> +
>>   fail:
>>   	return ret;
>>   }
>> @@ -338,35 +343,31 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
>>    * does the checks required to make sure the data is small enough
>>    * to fit as an inline extent.
>>    */
>> -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
>> -					  u64 end, size_t compressed_size,
>> +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
>> +					  size_t compressed_size,
>>   					  int compress_type,
>> -					  struct page **compressed_pages)
>> +					  struct page **compressed_pages,
>> +					  bool update_i_size)
>>   {
>>   	struct btrfs_drop_extents_args drop_args = { 0 };
>>   	struct btrfs_root *root = inode->root;
>>   	struct btrfs_fs_info *fs_info = root->fs_info;
>>   	struct btrfs_trans_handle *trans;
>> -	u64 isize = i_size_read(&inode->vfs_inode);
>> -	u64 actual_end = min(end + 1, isize);
>> -	u64 inline_len = actual_end - start;
>> -	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
>> -	u64 data_len = inline_len;
>> +	u64 data_len = compressed_size ? compressed_size : size;
>>   	int ret;
>>   	struct btrfs_path *path;
>>
>> -	if (compressed_size)
>> -		data_len = compressed_size;
>> -
>> -	if (start > 0 ||
>> -	    actual_end > fs_info->sectorsize ||
>> +	/*
>> +	 * We can create an inline extent if it ends at or beyond the current
>> +	 * i_size, is no larger than a sector (decompressed), and the (possibly
>> +	 * compressed) data fits in a leaf and the configured maximum inline
>> +	 * size.
>> +	 */
>
> Urgh, just some days ago Qu was talking about how awkward it is to have
> mixed extents in a file. And now, AFAIU, you are making them more likely
> since now they can be created not just at the beginning of the file but
> also after i_size write. While this won't be a problem in and of itself
> it goes just the opposite way of us trying to shrink the possible cases
> when we can have mixed extents.

Tree-checker should reject such inline extent at non-zero offset.

> Qu what is your take on that?

My question is, why do encoded writes need to bother with inline extents at all?

My intuition is that such encoded writes should not create inline
extents at all.

Or is there any special use case involved for encoded writes?

Thanks,
Qu


>
> <snip>
>
Omar Sandoval Aug. 20, 2021, 6:11 p.m. UTC | #3
On Fri, Aug 20, 2021 at 05:13:34PM +0800, Qu Wenruo wrote:
> 
> 
> On 2021/8/20 下午4:51, Nikolay Borisov wrote:
> > 
> > 
> > On 18.08.21 г. 0:06, Omar Sandoval wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > > 
> > > Currently, an inline extent is always created after i_size is extended
> > > from btrfs_dirty_pages(). However, for encoded writes, we only want to
> > > update i_size after we successfully created the inline extent.
> 
> To me, the idea of write first then update isize is just going to cause
> tons of inline extent related prblems.
> 
> The current example is falloc, which only update the isize after the
> falloc finishes.
> 
> This behavior has already bothered me quite a lot, as it can easily
> create mixed inline and regular extents.

Do you have an example of how this would happen? I have the inode and
extent bits locked during an encoded write, and I see that fallocate
does the same.

> Can't we remember the old isize (with proper locking), enlarge isize
> (with holes filled), do the write.
> 
> If something wrong happened, we truncate the isize back to its old isize.
> 
> > > Add an
> > > update_i_size parameter to cow_file_range_inline() and
> > > insert_inline_extent() and pass in the size of the extent rather than
> > > determining it from i_size. Since the start parameter is always passed
> > > as 0, get rid of it and simplify the logic in these two functions. While
> > > we're here, let's document the requirements for creating an inline
> > > extent.
> > > 
> > > Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > ---
> > >   fs/btrfs/inode.c | 100 +++++++++++++++++++++++------------------------
> > >   1 file changed, 48 insertions(+), 52 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > > index 708d8ab098bc..0b5ff14aa7fd 100644
> > > --- a/fs/btrfs/inode.c
> > > +++ b/fs/btrfs/inode.c
> > > @@ -236,9 +236,10 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
> > >   static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   				struct btrfs_path *path, bool extent_inserted,
> > >   				struct btrfs_root *root, struct inode *inode,
> > > -				u64 start, size_t size, size_t compressed_size,
> > > +				size_t size, size_t compressed_size,
> > >   				int compress_type,
> > > -				struct page **compressed_pages)
> > > +				struct page **compressed_pages,
> > > +				bool update_i_size)
> > >   {
> > >   	struct extent_buffer *leaf;
> > >   	struct page *page = NULL;
> > > @@ -247,7 +248,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   	struct btrfs_file_extent_item *ei;
> > >   	int ret;
> > >   	size_t cur_size = size;
> > > -	unsigned long offset;
> > > +	u64 i_size;
> > > 
> > >   	ASSERT((compressed_size > 0 && compressed_pages) ||
> > >   	       (compressed_size == 0 && !compressed_pages));
> > > @@ -260,7 +261,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   		size_t datasize;
> > > 
> > >   		key.objectid = btrfs_ino(BTRFS_I(inode));
> > > -		key.offset = start;
> > > +		key.offset = 0;
> > >   		key.type = BTRFS_EXTENT_DATA_KEY;
> > > 
> > >   		datasize = btrfs_file_extent_calc_inline_size(cur_size);
> > > @@ -297,12 +298,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   		btrfs_set_file_extent_compression(leaf, ei,
> > >   						  compress_type);
> > >   	} else {
> > > -		page = find_get_page(inode->i_mapping,
> > > -				     start >> PAGE_SHIFT);
> > > +		page = find_get_page(inode->i_mapping, 0);
> > >   		btrfs_set_file_extent_compression(leaf, ei, 0);
> > >   		kaddr = kmap_atomic(page);
> > > -		offset = offset_in_page(start);
> > > -		write_extent_buffer(leaf, kaddr + offset, ptr, size);
> > > +		write_extent_buffer(leaf, kaddr, ptr, size);
> > >   		kunmap_atomic(kaddr);
> > >   		put_page(page);
> > >   	}
> > > @@ -313,8 +312,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   	 * We align size to sectorsize for inline extents just for simplicity
> > >   	 * sake.
> > >   	 */
> > > -	size = ALIGN(size, root->fs_info->sectorsize);
> > > -	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
> > > +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
> > > +					ALIGN(size, root->fs_info->sectorsize));
> > >   	if (ret)
> > >   		goto fail;
> > > 
> > > @@ -327,7 +326,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >   	 * before we unlock the pages.  Otherwise we
> > >   	 * could end up racing with unlink.
> > >   	 */
> > > -	BTRFS_I(inode)->disk_i_size = inode->i_size;
> > > +	i_size = i_size_read(inode);
> > > +	if (update_i_size && size > i_size) {
> > > +		i_size_write(inode, size);
> > > +		i_size = size;
> > > +	}
> > > +	BTRFS_I(inode)->disk_i_size = i_size;
> > > +
> > >   fail:
> > >   	return ret;
> > >   }
> > > @@ -338,35 +343,31 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
> > >    * does the checks required to make sure the data is small enough
> > >    * to fit as an inline extent.
> > >    */
> > > -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
> > > -					  u64 end, size_t compressed_size,
> > > +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
> > > +					  size_t compressed_size,
> > >   					  int compress_type,
> > > -					  struct page **compressed_pages)
> > > +					  struct page **compressed_pages,
> > > +					  bool update_i_size)
> > >   {
> > >   	struct btrfs_drop_extents_args drop_args = { 0 };
> > >   	struct btrfs_root *root = inode->root;
> > >   	struct btrfs_fs_info *fs_info = root->fs_info;
> > >   	struct btrfs_trans_handle *trans;
> > > -	u64 isize = i_size_read(&inode->vfs_inode);
> > > -	u64 actual_end = min(end + 1, isize);
> > > -	u64 inline_len = actual_end - start;
> > > -	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
> > > -	u64 data_len = inline_len;
> > > +	u64 data_len = compressed_size ? compressed_size : size;
> > >   	int ret;
> > >   	struct btrfs_path *path;
> > > 
> > > -	if (compressed_size)
> > > -		data_len = compressed_size;
> > > -
> > > -	if (start > 0 ||
> > > -	    actual_end > fs_info->sectorsize ||
> > > +	/*
> > > +	 * We can create an inline extent if it ends at or beyond the current
> > > +	 * i_size, is no larger than a sector (decompressed), and the (possibly
> > > +	 * compressed) data fits in a leaf and the configured maximum inline
> > > +	 * size.
> > > +	 */
> > 
> > Urgh, just some days ago Qu was talking about how awkward it is to have
> > mixed extents in a file. And now, AFAIU, you are making them more likely
> > since now they can be created not just at the beginning of the file but
> > also after i_size write. While this won't be a problem in and of itself
> > it goes just the opposite way of us trying to shrink the possible cases
> > when we can have mixed extents.
> 
> Tree-checker should reject such inline extent at non-zero offset.

This change does not allow creating inline extents at a non-zero offset.

> > Qu what is your take on that?
> 
> My question is, why encoded write needs to bother the inline extents at all?
> 
> My intuition of such encoded write is, it should not create inline
> extents at all.
> 
> Or is there any special use-case involved for encoded write?

We create compressed inline extents with normal writes. We should be
able to send and receive them without converting them into regular
extents.
Qu Wenruo Aug. 21, 2021, 1:11 a.m. UTC | #4
On 2021/8/21 上午2:11, Omar Sandoval wrote:
> On Fri, Aug 20, 2021 at 05:13:34PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2021/8/20 下午4:51, Nikolay Borisov wrote:
>>>
>>>
>>> On 18.08.21 г. 0:06, Omar Sandoval wrote:
>>>> From: Omar Sandoval <osandov@fb.com>
>>>>
>>>> Currently, an inline extent is always created after i_size is extended
>>>> from btrfs_dirty_pages(). However, for encoded writes, we only want to
>>>> update i_size after we successfully created the inline extent.
>>
>> To me, the idea of write first then update isize is just going to cause
>> tons of inline extent related prblems.
>>
>> The current example is falloc, which only update the isize after the
>> falloc finishes.
>>
>> This behavior has already bothered me quite a lot, as it can easily
>> create mixed inline and regular extents.
>
> Do you have an example of how this would happen? I have the inode and
> extent bits locked during an encoded write, and I see that fallocate
> does the same.

xfs_io -f -c "pwrite 0 1K" -c "sync" -c "falloc 0 4k" -c "pwrite 4k 4k"

The [0, 1K) will be written as inline without doubt.

Then we go to falloc, which will try to zero the range [1K, 4K), but it
doesn't increase the isize.
Thus the page [0, 4k) will still be written back as inline, since isize
is still 1K.

Later [4K, 8K) will be written back as regular, causing mixed extents.

>
>> Can't we remember the old isize (with proper locking), enlarge isize
>> (with holes filled), do the write.
>>
>> If something wrong happened, we truncate the isize back to its old isize.
>>
[...]
>>>
>>> Urgh, just some days ago Qu was talking about how awkward it is to have
>>> mixed extents in a file. And now, AFAIU, you are making them more likely
>>> since now they can be created not just at the beginning of the file but
>>> also after i_size write. While this won't be a problem in and of itself
>>> it goes just the opposite way of us trying to shrink the possible cases
>>> when we can have mixed extents.
>>
>> Tree-checker should reject such inline extent at non-zero offset.
>
> This change does not allow creating inline extents at a non-zero offset.
>
>>> Qu what is your take on that?
>>
>> My question is, why encoded write needs to bother the inline extents at all?
>>
>> My intuition of such encoded write is, it should not create inline
>> extents at all.
>>
>> Or is there any special use-case involved for encoded write?
>
> We create compressed inline extents with normal writes. We should be
> able to send and receive them without converting them into regular
> extents.
>
But my first impression of any encoded write is that it should work
like DIO, thus everything should be sectorsize aligned.

Then how could they create an inline extent? An inline extent is only
possible when the isize is smaller than sectorsize.

Thanks,
Qu
Omar Sandoval Aug. 23, 2021, 6:16 p.m. UTC | #5
On Sat, Aug 21, 2021 at 09:11:26AM +0800, Qu Wenruo wrote:
> 
> 
> On 2021/8/21 上午2:11, Omar Sandoval wrote:
> > On Fri, Aug 20, 2021 at 05:13:34PM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > On 2021/8/20 下午4:51, Nikolay Borisov wrote:
> > > > 
> > > > 
> > > > On 18.08.21 г. 0:06, Omar Sandoval wrote:
> > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > 
> > > > > Currently, an inline extent is always created after i_size is extended
> > > > > from btrfs_dirty_pages(). However, for encoded writes, we only want to
> > > > > update i_size after we successfully created the inline extent.
> > > 
> > > To me, the idea of write first then update isize is just going to cause
> > > tons of inline extent related prblems.
> > > 
> > > The current example is falloc, which only update the isize after the
> > > falloc finishes.
> > > 
> > > This behavior has already bothered me quite a lot, as it can easily
> > > create mixed inline and regular extents.
> > 
> > Do you have an example of how this would happen? I have the inode and
> > extent bits locked during an encoded write, and I see that fallocate
> > does the same.
> 
> xfs_io -f -c "pwrite 0 1K" -c "sync" -c "falloc 0 4k" -c "pwrite 4k 4k"
> 
> The [0, 1K) will be written as inline without doubt.
> 
> Then we go to falloc, it will try to zero the range [1K, 4K), but it
> doesn't increase the isize.
> Thus the page [0, 4k) will still be written back as inline, since isize
> is still 1K.
> 
> Later [4K, 8K) will be written back as regular, causing mixed extents.

I'll have to read fallocate more closely to follow what's going on here
and figure out if it applies to encoded writes. Please help me out if
you see how this would be an issue with encoded writes.

> > > Can't we remember the old isize (with proper locking), enlarge isize
> > > (with holes filled), do the write.
> > > 
> > > If something wrong happened, we truncate the isize back to its old isize.
> > > 
> [...]
> > > > 
> > > > Urgh, just some days ago Qu was talking about how awkward it is to have
> > > > mixed extents in a file. And now, AFAIU, you are making them more likely
> > > > since now they can be created not just at the beginning of the file but
> > > > also after i_size write. While this won't be a problem in and of itself
> > > > it goes just the opposite way of us trying to shrink the possible cases
> > > > when we can have mixed extents.
> > > 
> > > Tree-checker should reject such inline extent at non-zero offset.
> > 
> > This change does not allow creating inline extents at a non-zero offset.
> > 
> > > > Qu what is your take on that?
> > > 
> > > My question is, why encoded write needs to bother the inline extents at all?
> > > 
> > > My intuition of such encoded write is, it should not create inline
> > > extents at all.
> > > 
> > > Or is there any special use-case involved for encoded write?
> > 
> > We create compressed inline extents with normal writes. We should be
> > able to send and receive them without converting them into regular
> > extents.
> > 
> But my first impression for any encoded write is that, they should work
> like DIO, thus everything should be sectorsize aligned.
> 
> Then why could they create inline extent? As inline extent can only be
> possible when the isize is smaller than sectorsize.

ENCODED_WRITE is not defined as "O_DIRECT, but encoded". It happens to
have some resemblance to O_DIRECT because we have alignment requirements
for new extents and because we bypass the page cache, but there's no
reason to copy arbitrary restrictions from O_DIRECT. If someone is using
ENCODED_WRITE to write compressed data, then they care about space
efficiency, so we should make efficient use of inline extents.
Qu Wenruo Aug. 23, 2021, 11:32 p.m. UTC | #6
On 2021/8/24 上午2:16, Omar Sandoval wrote:
> On Sat, Aug 21, 2021 at 09:11:26AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2021/8/21 上午2:11, Omar Sandoval wrote:
>>> On Fri, Aug 20, 2021 at 05:13:34PM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> On 2021/8/20 下午4:51, Nikolay Borisov wrote:
>>>>>
>>>>>
>>>>> On 18.08.21 г. 0:06, Omar Sandoval wrote:
>>>>>> From: Omar Sandoval <osandov@fb.com>
>>>>>>
>>>>>> Currently, an inline extent is always created after i_size is extended
>>>>>> from btrfs_dirty_pages(). However, for encoded writes, we only want to
>>>>>> update i_size after we successfully created the inline extent.
>>>>
>>>> To me, the idea of write first then update isize is just going to cause
>>>> tons of inline extent related prblems.
>>>>
>>>> The current example is falloc, which only update the isize after the
>>>> falloc finishes.
>>>>
>>>> This behavior has already bothered me quite a lot, as it can easily
>>>> create mixed inline and regular extents.
>>>
>>> Do you have an example of how this would happen? I have the inode and
>>> extent bits locked during an encoded write, and I see that fallocate
>>> does the same.
>>
>> xfs_io -f -c "pwrite 0 1K" -c "sync" -c "falloc 0 4k" -c "pwrite 4k 4k"
>>
>> The [0, 1K) will be written as inline without doubt.
>>
>> Then we go to falloc, it will try to zero the range [1K, 4K), but it
>> doesn't increase the isize.
>> Thus the page [0, 4k) will still be written back as inline, since isize
>> is still 1K.
>>
>> Later [4K, 8K) will be written back as regular, causing mixed extents.
>
> I'll have to read fallocate more closely to follow what's going on here
> and figure out if it applies to encoded writes. Please help me out if
> you see how this would be an issue with encoded writes.

This won't cause anything wrong, as long as the encoded writes follow the
existing inline extent requirement (always at offset 0).

Otherwise, the read path could be affected, since it would have to handle
inline extents at non-zero offsets.

>
>>>> Can't we remember the old isize (with proper locking), enlarge isize
>>>> (with holes filled), do the write.
>>>>
>>>> If something wrong happened, we truncate the isize back to its old isize.
>>>>
>> [...]
>>>>>
>>>>> Urgh, just some days ago Qu was talking about how awkward it is to have
>>>>> mixed extents in a file. And now, AFAIU, you are making them more likely
>>>>> since now they can be created not just at the beginning of the file but
>>>>> also after i_size write. While this won't be a problem in and of itself
>>>>> it goes just the opposite way of us trying to shrink the possible cases
>>>>> when we can have mixed extents.
>>>>
>>>> Tree-checker should reject such inline extent at non-zero offset.
>>>
>>> This change does not allow creating inline extents at a non-zero offset.
>>>
>>>>> Qu what is your take on that?
>>>>
>>>> My question is, why encoded write needs to bother the inline extents at all?
>>>>
>>>> My intuition of such encoded write is, it should not create inline
>>>> extents at all.
>>>>
>>>> Or is there any special use-case involved for encoded write?
>>>
>>> We create compressed inline extents with normal writes. We should be
>>> able to send and receive them without converting them into regular
>>> extents.
>>>
>> But my first impression for any encoded write is that, they should work
>> like DIO, thus everything should be sectorsize aligned.
>>
>> Then why could they create inline extent? As inline extent can only be
>> possible when the isize is smaller than sectorsize.
>
> ENCODED_WRITE is not defined as "O_DIRECT, but encoded". It happens to
> have some resemblance to O_DIRECT because we have alignment requirements
> for new extents and because we bypass the page cache, but there's no
> reason to copy arbitrary restrictions from O_DIRECT. If someone is using
> ENCODED_WRITE to write compressed data, then they care about space
> efficiency, so we should make efficient use of inline extents.
>
Then as long as the inline extent requirement of offset 0 is still
followed, I'll be fine with that.

But inline extents at a non-zero offset? That looks like a much larger
change, and may affect the read path.

So I'd prefer we keep the offset 0 requirement for inline extents, and
find a better way to work around this.

Thanks,
Qu
Omar Sandoval Aug. 23, 2021, 11:46 p.m. UTC | #7
On Tue, Aug 24, 2021 at 07:32:06AM +0800, Qu Wenruo wrote:
> 
> 
> On 2021/8/24 上午2:16, Omar Sandoval wrote:
> > On Sat, Aug 21, 2021 at 09:11:26AM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > On 2021/8/21 上午2:11, Omar Sandoval wrote:
> > > > On Fri, Aug 20, 2021 at 05:13:34PM +0800, Qu Wenruo wrote:
> > > > > 
> > > > > 
> > > > > On 2021/8/20 下午4:51, Nikolay Borisov wrote:
> > > > > > 
> > > > > > 
> > > > > > On 18.08.21 г. 0:06, Omar Sandoval wrote:
> > > > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > > > 
> > > > > > > Currently, an inline extent is always created after i_size is extended
> > > > > > > from btrfs_dirty_pages(). However, for encoded writes, we only want to
> > > > > > > update i_size after we successfully created the inline extent.
> > > > > 
> > > > > To me, the idea of write first then update isize is just going to cause
> > > > > > tons of inline extent related problems.
> > > > > 
> > > > > > The current example is falloc, which only updates the isize after the
> > > > > falloc finishes.
> > > > > 
> > > > > This behavior has already bothered me quite a lot, as it can easily
> > > > > create mixed inline and regular extents.
> > > > 
> > > > Do you have an example of how this would happen? I have the inode and
> > > > extent bits locked during an encoded write, and I see that fallocate
> > > > does the same.
> > > 
> > > xfs_io -f -c "pwrite 0 1K" -c "sync" -c "falloc 0 4k" -c "pwrite 4k 4k"
> > > 
> > > The [0, 1K) will be written as inline without doubt.
> > > 
> > > Then we go to falloc, it will try to zero the range [1K, 4K), but it
> > > doesn't increase the isize.
> > > Thus the page [0, 4k) will still be written back as inline, since isize
> > > is still 1K.
> > > 
> > > Later [4K, 8K) will be written back as regular, causing mixed extents.
> > 
> > I'll have to read fallocate more closely to follow what's going on here
> > and figure out if it applies to encoded writes. Please help me out if
> > you see how this would be an issue with encoded writes.
> 
> This won't cause anything wrong, if the encoded writes follow the
> existing inline extents requirement (always at offset 0).
> 
> Otherwise, the read path could be affected to handle inlined extent at
> non-zero offset.
> 
> > 
> > > > > Can't we remember the old isize (with proper locking), enlarge isize
> > > > > (with holes filled), do the write.
> > > > > 
> > > > > If something wrong happened, we truncate the isize back to its old isize.
> > > > > 
> > > [...]
> > > > > > 
> > > > > > Urgh, just some days ago Qu was talking about how awkward it is to have
> > > > > > mixed extents in a file. And now, AFAIU, you are making them more likely
> > > > > > since now they can be created not just at the beginning of the file but
> > > > > > also after i_size write. While this won't be a problem in and of itself
> > > > > > it goes just the opposite way of us trying to shrink the possible cases
> > > > > > when we can have mixed extents.
> > > > > 
> > > > > Tree-checker should reject such inline extent at non-zero offset.
> > > > 
> > > > This change does not allow creating inline extents at a non-zero offset.
> > > > 
> > > > > > Qu what is your take on that?
> > > > > 
> > > > > My question is, why encoded write needs to bother the inline extents at all?
> > > > > 
> > > > > My intuition of such encoded write is, it should not create inline
> > > > > extents at all.
> > > > > 
> > > > > Or is there any special use-case involved for encoded write?
> > > > 
> > > > We create compressed inline extents with normal writes. We should be
> > > > able to send and receive them without converting them into regular
> > > > extents.
> > > > 
> > > But my first impression for any encoded write is that, they should work
> > > like DIO, thus everything should be sectorsize aligned.
> > > 
> > > Then why could they create inline extent? As inline extent can only be
> > > possible when the isize is smaller than sectorsize.
> > 
> > ENCODED_WRITE is not defined as "O_DIRECT, but encoded". It happens to
> > have some resemblance to O_DIRECT because we have alignment requirements
> > for new extents and because we bypass the page cache, but there's no
> > reason to copy arbitrary restrictions from O_DIRECT. If someone is using
> > ENCODED_WRITE to write compressed data, then they care about space
> > efficiency, so we should make efficient use of inline extents.
> > 
> Then as long as the inline extent requirement for 0 offset is still
> followed, I'll be fine with that.
> 
> But for non-zero offset inline extent? It looks like a much larger
> change, and may affect read path.
> 
> So I'd prefer we keep the 0 offset requirement for inline extent, and
> find a better way to work around.

Ah, okay. I didn't get rid of the 0 offset requirement and I have no
plans to. In fact, this patch kind of does the opposite: it gets rid of
the start parameter to cow_file_range_inline() because it doesn't make
sense for it to ever be anything other than 0 (and we're already
checking that start == 0 in the callers).
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 708d8ab098bc..0b5ff14aa7fd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -236,9 +236,10 @@  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_path *path, bool extent_inserted,
 				struct btrfs_root *root, struct inode *inode,
-				u64 start, size_t size, size_t compressed_size,
+				size_t size, size_t compressed_size,
 				int compress_type,
-				struct page **compressed_pages)
+				struct page **compressed_pages,
+				bool update_i_size)
 {
 	struct extent_buffer *leaf;
 	struct page *page = NULL;
@@ -247,7 +248,7 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *ei;
 	int ret;
 	size_t cur_size = size;
-	unsigned long offset;
+	u64 i_size;
 
 	ASSERT((compressed_size > 0 && compressed_pages) ||
 	       (compressed_size == 0 && !compressed_pages));
@@ -260,7 +261,7 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		size_t datasize;
 
 		key.objectid = btrfs_ino(BTRFS_I(inode));
-		key.offset = start;
+		key.offset = 0;
 		key.type = BTRFS_EXTENT_DATA_KEY;
 
 		datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -297,12 +298,10 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_file_extent_compression(leaf, ei,
 						  compress_type);
 	} else {
-		page = find_get_page(inode->i_mapping,
-				     start >> PAGE_SHIFT);
+		page = find_get_page(inode->i_mapping, 0);
 		btrfs_set_file_extent_compression(leaf, ei, 0);
 		kaddr = kmap_atomic(page);
-		offset = offset_in_page(start);
-		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		write_extent_buffer(leaf, kaddr, ptr, size);
 		kunmap_atomic(kaddr);
 		put_page(page);
 	}
@@ -313,8 +312,8 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	 * We align size to sectorsize for inline extents just for simplicity
 	 * sake.
 	 */
-	size = ALIGN(size, root->fs_info->sectorsize);
-	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+					ALIGN(size, root->fs_info->sectorsize));
 	if (ret)
 		goto fail;
 
@@ -327,7 +326,13 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	 * before we unlock the pages.  Otherwise we
 	 * could end up racing with unlink.
 	 */
-	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	i_size = i_size_read(inode);
+	if (update_i_size && size > i_size) {
+		i_size_write(inode, size);
+		i_size = size;
+	}
+	BTRFS_I(inode)->disk_i_size = i_size;
+
 fail:
 	return ret;
 }
@@ -338,35 +343,31 @@  static int insert_inline_extent(struct btrfs_trans_handle *trans,
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
-					  u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+					  size_t compressed_size,
 					  int compress_type,
-					  struct page **compressed_pages)
+					  struct page **compressed_pages,
+					  bool update_i_size)
 {
 	struct btrfs_drop_extents_args drop_args = { 0 };
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
-	u64 isize = i_size_read(&inode->vfs_inode);
-	u64 actual_end = min(end + 1, isize);
-	u64 inline_len = actual_end - start;
-	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
-	u64 data_len = inline_len;
+	u64 data_len = compressed_size ? compressed_size : size;
 	int ret;
 	struct btrfs_path *path;
 
-	if (compressed_size)
-		data_len = compressed_size;
-
-	if (start > 0 ||
-	    actual_end > fs_info->sectorsize ||
+	/*
+	 * We can create an inline extent if it ends at or beyond the current
+	 * i_size, is no larger than a sector (decompressed), and the (possibly
+	 * compressed) data fits in a leaf and the configured maximum inline
+	 * size.
+	 */
+	if (size < i_size_read(&inode->vfs_inode) ||
+	    size > fs_info->sectorsize ||
 	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
-	    (!compressed_size &&
-	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
-	    end + 1 < isize ||
-	    data_len > fs_info->max_inline) {
+	    data_len > fs_info->max_inline)
 		return 1;
-	}
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -380,30 +381,21 @@  static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
 	trans->block_rsv = &inode->block_rsv;
 
 	drop_args.path = path;
-	drop_args.start = start;
-	drop_args.end = aligned_end;
+	drop_args.start = 0;
+	drop_args.end = fs_info->sectorsize;
 	drop_args.drop_cache = true;
 	drop_args.replace_extent = true;
-
-	if (compressed_size && compressed_pages)
-		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
-		   compressed_size);
-	else
-		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
-		    inline_len);
-
+	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
-	if (isize > actual_end)
-		inline_len = min_t(u64, isize, actual_end);
-	ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
-				   root, &inode->vfs_inode, start,
-				   inline_len, compressed_size,
-				   compress_type, compressed_pages);
+	ret = insert_inline_extent(trans, path, drop_args.extent_inserted, root,
+				   &inode->vfs_inode, size, compressed_size,
+				   compress_type, compressed_pages,
+				   update_i_size);
 	if (ret && ret != -ENOSPC) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -412,7 +404,7 @@  static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
 		goto out;
 	}
 
-	btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && ret != -ENOSPC) {
 		btrfs_abort_transaction(trans, ret);
@@ -695,14 +687,15 @@  static noinline int compress_file_range(struct async_chunk *async_chunk)
 			/* we didn't compress the entire range, try
 			 * to make an uncompressed inline extent.
 			 */
-			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
 						    0, BTRFS_COMPRESS_NONE,
-						    NULL);
+						    NULL, false);
 		} else {
 			/* try making a compressed inline extent */
-			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
 						    total_compressed,
-						    compress_type, pages);
+						    compress_type, pages,
+						    false);
 		}
 		if (ret <= 0) {
 			unsigned long clear_flags = EXTENT_DELALLOC |
@@ -1098,9 +1091,12 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 	 * So here we skip inline extent creation completely.
 	 */
 	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+				       end + 1);
+
 		/* lets try to make an inline extent */
-		ret = cow_file_range_inline(inode, start, end, 0,
-					    BTRFS_COMPRESS_NONE, NULL);
+		ret = cow_file_range_inline(inode, actual_end, 0,
+					    BTRFS_COMPRESS_NONE, NULL, false);
 		if (ret == 0) {
 			/*
 			 * We use DO_ACCOUNTING here because we need the