diff mbox

btrfs: qgroups, properly handle no reservations

Message ID 20180221201921.17944-1-jeffm@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jeff Mahoney Feb. 21, 2018, 8:19 p.m. UTC
From: Jeff Mahoney <jeffm@suse.com>

There are several places where we call btrfs_qgroup_reserve_meta and
assume that a return value of 0 means that the reservation was successful.

Later, we use the original bytes value passed to that call to free
bytes during error handling or to pass the number of bytes reserved to
the caller.

This patch returns -ENODATA when we don't perform a reservation so that
callers can make the distinction.  This also lets call sites not
necessarily care whether qgroups are enabled.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/extent-tree.c | 33 ++++++++++++++++-----------------
 fs/btrfs/qgroup.c      |  4 ++--
 fs/btrfs/transaction.c |  5 ++++-
 3 files changed, 22 insertions(+), 20 deletions(-)

Comments

Qu Wenruo Feb. 22, 2018, 1:36 a.m. UTC | #1
On 2018年02月22日 04:19, jeffm@suse.com wrote:
> From: Jeff Mahoney <jeffm@suse.com>
> 
> There are several places where we call btrfs_qgroup_reserve_meta and
> assume that a return value of 0 means that the reservation was successful.
> 
> Later, we use the original bytes value passed to that call to free
> bytes during error handling or to pass the number of bytes reserved to
> the caller.
> 
> This patch returns -ENODATA when we don't perform a reservation so that
> callers can make the distinction.  This also lets call sites not
> necessarily care whether qgroups are enabled.

IMHO if we don't need to reserve, returning 0 seems good enough.
Caller doesn't really need to care if it has reserved some bytes.

Or is there any special case where we need to distinguish such case?

Thanks,
Qu

> 
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/extent-tree.c | 33 ++++++++++++++++-----------------
>  fs/btrfs/qgroup.c      |  4 ++--
>  fs/btrfs/transaction.c |  5 ++++-
>  3 files changed, 22 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index c1618ab9fecf..2d5e963fae88 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5988,20 +5988,18 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
>  				     u64 *qgroup_reserved,
>  				     bool use_global_rsv)
>  {
> -	u64 num_bytes;
>  	int ret;
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +	/* One for parent inode, two for dir entries */
> +	u64 num_bytes = 3 * fs_info->nodesize;
>  
> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> -		/* One for parent inode, two for dir entries */
> -		num_bytes = 3 * fs_info->nodesize;
> -		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
> -		if (ret)
> -			return ret;
> -	} else {
> +	ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
> +	if (ret == -ENODATA) {
>  		num_bytes = 0;
> -	}
> +		ret = 0;
> +	} else if (ret)
> +		return ret;
>  
>  	*qgroup_reserved = num_bytes;
>  
> @@ -6057,6 +6055,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
>  	int ret = 0;
>  	bool delalloc_lock = true;
> +	u64 qgroup_reserved;
>  
>  	/* If we are a free space inode we need to not flush since we will be in
>  	 * the middle of a transaction commit.  We also don't need the delalloc
> @@ -6090,17 +6089,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>  	spin_unlock(&inode->lock);
>  
> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> -		ret = btrfs_qgroup_reserve_meta(root,
> -				nr_extents * fs_info->nodesize, true);
> -		if (ret)
> -			goto out_fail;
> -	}
> +	qgroup_reserved = nr_extents * fs_info->nodesize;
> +	ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, true);
> +	if (ret == -ENODATA) {
> +		ret = 0;
> +		qgroup_reserved = 0;
> +	} if (ret)
> +		goto out_fail;
>  
>  	ret = btrfs_inode_rsv_refill(inode, flush);
>  	if (unlikely(ret)) {
> -		btrfs_qgroup_free_meta(root,
> -				       nr_extents * fs_info->nodesize);
> +		btrfs_qgroup_free_meta(root, qgroup_reserved);
>  		goto out_fail;
>  	}
>  
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index aa259d6986e1..5d9e011243c6 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -3025,7 +3025,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
>  
>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
>  	    !is_fstree(root->objectid) || num_bytes == 0)
> -		return 0;
> +		return -ENODATA;
>  
>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
>  	trace_qgroup_meta_reserve(root, (s64)num_bytes);
> @@ -3057,7 +3057,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  
>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
> -	    !is_fstree(root->objectid))
> +	    !is_fstree(root->objectid) || num_bytes == 0)
>  		return;
>  
>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 04f07144b45c..ab67b73bd7fa 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -510,7 +510,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>  		qgroup_reserved = num_items * fs_info->nodesize;
>  		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
>  						enforce_qgroups);
> -		if (ret)
> +		if (ret == -ENODATA) {
> +			ret = 0;
> +			qgroup_reserved = 0;
> +		} else if (ret)
>  			return ERR_PTR(ret);
>  
>  		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
>
Jeff Mahoney Feb. 22, 2018, 1:50 a.m. UTC | #2
On 2/21/18 8:36 PM, Qu Wenruo wrote:
> 
> 
> On 2018年02月22日 04:19, jeffm@suse.com wrote:
>> From: Jeff Mahoney <jeffm@suse.com>
>>
>> There are several places where we call btrfs_qgroup_reserve_meta and
>> assume that a return value of 0 means that the reservation was successful.
>>
>> Later, we use the original bytes value passed to that call to free
>> bytes during error handling or to pass the number of bytes reserved to
>> the caller.
>>
>> This patch returns -ENODATA when we don't perform a reservation so that
>> callers can make the distinction.  This also lets call sites not
>> necessarily care whether qgroups are enabled.
> 
> IMHO if we don't need to reserve, returning 0 seems good enough.
> Caller doesn't really need to care if it has reserved some bytes.
> 
> Or is there any special case where we need to distinguish such case?

Anywhere where the reservation takes place prior to the transaction
starting, which is pretty much everywhere.  We wait until transaction
commit to flip the bit to turn on quotas, which means that if a
transaction commits that enables quotas lands in between the reservation
being take and any error handling that involves freeing the reservation,
we'll end up with an underflow.

This is the first patch of a series I'm working on, but it can stand
alone.  The rest is the patch set I mentioned when we talked a few
months ago where the lifetimes of reservations are incorrect.  We can't
just drop all the reservations at the end of the transaction because 1)
the lifetime of some reservations can cross transactions and 2) because
especially in the start_transaction case, we do the reservation prior to
waiting to join the transaction.  So if the transaction we're waiting on
commits, our reservation goes away with it but we continue on as if we
still have it.

-Jeff

> Thanks,
> Qu
> 
>>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>> ---
>>  fs/btrfs/extent-tree.c | 33 ++++++++++++++++-----------------
>>  fs/btrfs/qgroup.c      |  4 ++--
>>  fs/btrfs/transaction.c |  5 ++++-
>>  3 files changed, 22 insertions(+), 20 deletions(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index c1618ab9fecf..2d5e963fae88 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -5988,20 +5988,18 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
>>  				     u64 *qgroup_reserved,
>>  				     bool use_global_rsv)
>>  {
>> -	u64 num_bytes;
>>  	int ret;
>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>>  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>> +	/* One for parent inode, two for dir entries */
>> +	u64 num_bytes = 3 * fs_info->nodesize;
>>  
>> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
>> -		/* One for parent inode, two for dir entries */
>> -		num_bytes = 3 * fs_info->nodesize;
>> -		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
>> -		if (ret)
>> -			return ret;
>> -	} else {
>> +	ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
>> +	if (ret == -ENODATA) {
>>  		num_bytes = 0;
>> -	}
>> +		ret = 0;
>> +	} else if (ret)
>> +		return ret;
>>  
>>  	*qgroup_reserved = num_bytes;
>>  
>> @@ -6057,6 +6055,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>>  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
>>  	int ret = 0;
>>  	bool delalloc_lock = true;
>> +	u64 qgroup_reserved;
>>  
>>  	/* If we are a free space inode we need to not flush since we will be in
>>  	 * the middle of a transaction commit.  We also don't need the delalloc
>> @@ -6090,17 +6089,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>>  	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>>  	spin_unlock(&inode->lock);
>>  
>> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
>> -		ret = btrfs_qgroup_reserve_meta(root,
>> -				nr_extents * fs_info->nodesize, true);
>> -		if (ret)
>> -			goto out_fail;
>> -	}
>> +	qgroup_reserved = nr_extents * fs_info->nodesize;
>> +	ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, true);
>> +	if (ret == -ENODATA) {
>> +		ret = 0;
>> +		qgroup_reserved = 0;
>> +	} if (ret)
>> +		goto out_fail;
>>  
>>  	ret = btrfs_inode_rsv_refill(inode, flush);
>>  	if (unlikely(ret)) {
>> -		btrfs_qgroup_free_meta(root,
>> -				       nr_extents * fs_info->nodesize);
>> +		btrfs_qgroup_free_meta(root, qgroup_reserved);
>>  		goto out_fail;
>>  	}
>>  
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index aa259d6986e1..5d9e011243c6 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -3025,7 +3025,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
>>  
>>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
>>  	    !is_fstree(root->objectid) || num_bytes == 0)
>> -		return 0;
>> +		return -ENODATA;
>>  
>>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
>>  	trace_qgroup_meta_reserve(root, (s64)num_bytes);
>> @@ -3057,7 +3057,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>>  
>>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
>> -	    !is_fstree(root->objectid))
>> +	    !is_fstree(root->objectid) || num_bytes == 0)
>>  		return;
>>  
>>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>> index 04f07144b45c..ab67b73bd7fa 100644
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -510,7 +510,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>>  		qgroup_reserved = num_items * fs_info->nodesize;
>>  		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
>>  						enforce_qgroups);
>> -		if (ret)
>> +		if (ret == -ENODATA) {
>> +			ret = 0;
>> +			qgroup_reserved = 0;
>> +		} else if (ret)
>>  			return ERR_PTR(ret);
>>  
>>  		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
>>
>
Qu Wenruo Feb. 22, 2018, 2:05 a.m. UTC | #3
On 2018年02月22日 09:50, Jeff Mahoney wrote:
> On 2/21/18 8:36 PM, Qu Wenruo wrote:
>>
>>
>> On 2018年02月22日 04:19, jeffm@suse.com wrote:
>>> From: Jeff Mahoney <jeffm@suse.com>
>>>
>>> There are several places where we call btrfs_qgroup_reserve_meta and
>>> assume that a return value of 0 means that the reservation was successful.
>>>
>>> Later, we use the original bytes value passed to that call to free
>>> bytes during error handling or to pass the number of bytes reserved to
>>> the caller.
>>>
>>> This patch returns -ENODATA when we don't perform a reservation so that
>>> callers can make the distinction.  This also lets call sites not
>>> necessarily care whether qgroups are enabled.
>>
>> IMHO if we don't need to reserve, returning 0 seems good enough.
>> Caller doesn't really need to care if it has reserved some bytes.
>>
>> Or is there any special case where we need to distinguish such case?
> 
> Anywhere where the reservation takes place prior to the transaction
> starting, which is pretty much everywhere.  We wait until transaction
> commit to flip the bit to turn on quotas, which means that if a
> transaction commits that enables quotas lands in between the reservation
> being take and any error handling that involves freeing the reservation,
> we'll end up with an underflow.

So the same case as btrfs_qgroup_reserve_data().

In that case we could use ret > 0 to indicates the real bytes we
reserved, instead of -ENODATA which normally means error.

> 
> This is the first patch of a series I'm working on, but it can stand
> alone.  The rest is the patch set I mentioned when we talked a few
> months ago where the lifetimes of reservations are incorrect.  We can't
> just drop all the reservations at the end of the transaction because 1)
> the lifetime of some reservations can cross transactions and 2) because
> especially in the start_transaction case, we do the reservation prior to
> waiting to join the transaction.  So if the transaction we're waiting on
> commits, our reservation goes away with it but we continue on as if we
> still have it.

Right, the same problem I also addressed in patchset "[PATCH v2 00/10]
Use split qgroup rsv type".

In 6th patch, "[PATCH v2 06/10] btrfs: qgroup: Use
root->qgroup_meta_rsv_* to record qgroup meta reserved space" qgroup
meta reserve will only be increased if we succeeded in reserving
metadata, so later free won't underflow that number.

Thanks,
Qu


> 
> -Jeff
> 
>> Thanks,
>> Qu
>>
>>>
>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>>> ---
>>>  fs/btrfs/extent-tree.c | 33 ++++++++++++++++-----------------
>>>  fs/btrfs/qgroup.c      |  4 ++--
>>>  fs/btrfs/transaction.c |  5 ++++-
>>>  3 files changed, 22 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>> index c1618ab9fecf..2d5e963fae88 100644
>>> --- a/fs/btrfs/extent-tree.c
>>> +++ b/fs/btrfs/extent-tree.c
>>> @@ -5988,20 +5988,18 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
>>>  				     u64 *qgroup_reserved,
>>>  				     bool use_global_rsv)
>>>  {
>>> -	u64 num_bytes;
>>>  	int ret;
>>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>>>  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>>> +	/* One for parent inode, two for dir entries */
>>> +	u64 num_bytes = 3 * fs_info->nodesize;
>>>  
>>> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
>>> -		/* One for parent inode, two for dir entries */
>>> -		num_bytes = 3 * fs_info->nodesize;
>>> -		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
>>> -		if (ret)
>>> -			return ret;
>>> -	} else {
>>> +	ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
>>> +	if (ret == -ENODATA) {
>>>  		num_bytes = 0;
>>> -	}
>>> +		ret = 0;
>>> +	} else if (ret)
>>> +		return ret;
>>>  
>>>  	*qgroup_reserved = num_bytes;
>>>  
>>> @@ -6057,6 +6055,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>>>  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
>>>  	int ret = 0;
>>>  	bool delalloc_lock = true;
>>> +	u64 qgroup_reserved;
>>>  
>>>  	/* If we are a free space inode we need to not flush since we will be in
>>>  	 * the middle of a transaction commit.  We also don't need the delalloc
>>> @@ -6090,17 +6089,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>>>  	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>>>  	spin_unlock(&inode->lock);
>>>  
>>> -	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
>>> -		ret = btrfs_qgroup_reserve_meta(root,
>>> -				nr_extents * fs_info->nodesize, true);
>>> -		if (ret)
>>> -			goto out_fail;
>>> -	}
>>> +	qgroup_reserved = nr_extents * fs_info->nodesize;
>>> +	ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, true);
>>> +	if (ret == -ENODATA) {
>>> +		ret = 0;
>>> +		qgroup_reserved = 0;
>>> +	} if (ret)
>>> +		goto out_fail;
>>>  
>>>  	ret = btrfs_inode_rsv_refill(inode, flush);
>>>  	if (unlikely(ret)) {
>>> -		btrfs_qgroup_free_meta(root,
>>> -				       nr_extents * fs_info->nodesize);
>>> +		btrfs_qgroup_free_meta(root, qgroup_reserved);
>>>  		goto out_fail;
>>>  	}
>>>  
>>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>>> index aa259d6986e1..5d9e011243c6 100644
>>> --- a/fs/btrfs/qgroup.c
>>> +++ b/fs/btrfs/qgroup.c
>>> @@ -3025,7 +3025,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
>>>  
>>>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
>>>  	    !is_fstree(root->objectid) || num_bytes == 0)
>>> -		return 0;
>>> +		return -ENODATA;
>>>  
>>>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
>>>  	trace_qgroup_meta_reserve(root, (s64)num_bytes);
>>> @@ -3057,7 +3057,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
>>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>>>  
>>>  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
>>> -	    !is_fstree(root->objectid))
>>> +	    !is_fstree(root->objectid) || num_bytes == 0)
>>>  		return;
>>>  
>>>  	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
>>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>>> index 04f07144b45c..ab67b73bd7fa 100644
>>> --- a/fs/btrfs/transaction.c
>>> +++ b/fs/btrfs/transaction.c
>>> @@ -510,7 +510,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>>>  		qgroup_reserved = num_items * fs_info->nodesize;
>>>  		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
>>>  						enforce_qgroups);
>>> -		if (ret)
>>> +		if (ret == -ENODATA) {
>>> +			ret = 0;
>>> +			qgroup_reserved = 0;
>>> +		} else if (ret)
>>>  			return ERR_PTR(ret);
>>>  
>>>  		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
>>>
>>
> 
>
David Sterba March 7, 2018, 4:02 p.m. UTC | #4
On Thu, Feb 22, 2018 at 10:05:36AM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年02月22日 09:50, Jeff Mahoney wrote:
> > On 2/21/18 8:36 PM, Qu Wenruo wrote:
> >>
> >>
> >> On 2018年02月22日 04:19, jeffm@suse.com wrote:
> >>> From: Jeff Mahoney <jeffm@suse.com>
> >>>
> >>> There are several places where we call btrfs_qgroup_reserve_meta and
> >>> assume that a return value of 0 means that the reservation was successful.
> >>>
> >>> Later, we use the original bytes value passed to that call to free
> >>> bytes during error handling or to pass the number of bytes reserved to
> >>> the caller.
> >>>
> >>> This patch returns -ENODATA when we don't perform a reservation so that
> >>> callers can make the distinction.  This also lets call sites not
> >>> necessarily care whether qgroups are enabled.
> >>
> >> IMHO if we don't need to reserve, returning 0 seems good enough.
> >> Caller doesn't really need to care if it has reserved some bytes.
> >>
> >> Or is there any special case where we need to distinguish such case?
> > 
> > Anywhere where the reservation takes place prior to the transaction
> > starting, which is pretty much everywhere.  We wait until transaction
> > commit to flip the bit to turn on quotas, which means that if a
> > transaction commits that enables quotas lands in between the reservation
> > being take and any error handling that involves freeing the reservation,
> > we'll end up with an underflow.
> 
> So the same case as btrfs_qgroup_reserve_data().
> 
> In that case we could use ret > 0 to indicates the real bytes we
> reserved, instead of -ENODATA which normally means error.
> 
> > 
> > This is the first patch of a series I'm working on, but it can stand
> > alone.  The rest is the patch set I mentioned when we talked a few
> > months ago where the lifetimes of reservations are incorrect.  We can't
> > just drop all the reservations at the end of the transaction because 1)
> > the lifetime of some reservations can cross transactions and 2) because
> > especially in the start_transaction case, we do the reservation prior to
> > waiting to join the transaction.  So if the transaction we're waiting on
> > commits, our reservation goes away with it but we continue on as if we
> > still have it.
> 
> Right, the same problem I also addressed in patchset "[PATCH v2 00/10]
> Use split qgroup rsv type".
> 
> In 6th patch, "[PATCH v2 06/10] btrfs: qgroup: Use
> root->qgroup_meta_rsv_* to record qgroup meta reserved space" qgroup
> meta reserve will only be increased if we succeeded in reserving
> metadata, so later free won't underflow that number.

What should we do now when there are 2 different fixes? Applying Jeff's
patch on top of your qgroup-types patches causes some conflicts that do
not seem to be difficult, but the end result might not work as expected.

The changes do not seem to be fundamentally conflicting, would it make
sense to merge both? The patchset has been in for-next for a while but I
don't run qgroup specific tests besides what's in fstests. Also the
patchset fixes more problems so I think we need to merge it at some
point and now it's a good time about deciding whether it'd go to 4.17.

I did a pass through the patches, there are some minor things to fix but
a review from somebody with qgroup knowledge would be still desirable.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo March 8, 2018, 1:02 a.m. UTC | #5
On 2018年03月08日 00:02, David Sterba wrote:
> On Thu, Feb 22, 2018 at 10:05:36AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2018年02月22日 09:50, Jeff Mahoney wrote:
>>> On 2/21/18 8:36 PM, Qu Wenruo wrote:
>>>>
>>>>
>>>> On 2018年02月22日 04:19, jeffm@suse.com wrote:
>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>>
>>>>> There are several places where we call btrfs_qgroup_reserve_meta and
>>>>> assume that a return value of 0 means that the reservation was successful.
>>>>>
>>>>> Later, we use the original bytes value passed to that call to free
>>>>> bytes during error handling or to pass the number of bytes reserved to
>>>>> the caller.
>>>>>
>>>>> This patch returns -ENODATA when we don't perform a reservation so that
>>>>> callers can make the distinction.  This also lets call sites not
>>>>> necessarily care whether qgroups are enabled.
>>>>
>>>> IMHO if we don't need to reserve, returning 0 seems good enough.
>>>> Caller doesn't really need to care if it has reserved some bytes.
>>>>
>>>> Or is there any special case where we need to distinguish such case?
>>>
>>> Anywhere where the reservation takes place prior to the transaction
>>> starting, which is pretty much everywhere.  We wait until transaction
>>> commit to flip the bit to turn on quotas, which means that if a
>>> transaction commits that enables quotas lands in between the reservation
>>> being take and any error handling that involves freeing the reservation,
>>> we'll end up with an underflow.
>>
>> So the same case as btrfs_qgroup_reserve_data().
>>
>> In that case we could use ret > 0 to indicates the real bytes we
>> reserved, instead of -ENODATA which normally means error.
>>
>>>
>>> This is the first patch of a series I'm working on, but it can stand
>>> alone.  The rest is the patch set I mentioned when we talked a few
>>> months ago where the lifetimes of reservations are incorrect.  We can't
>>> just drop all the reservations at the end of the transaction because 1)
>>> the lifetime of some reservations can cross transactions and 2) because
>>> especially in the start_transaction case, we do the reservation prior to
>>> waiting to join the transaction.  So if the transaction we're waiting on
>>> commits, our reservation goes away with it but we continue on as if we
>>> still have it.
>>
>> Right, the same problem I also addressed in patchset "[PATCH v2 00/10]
>> Use split qgroup rsv type".
>>
>> In 6th patch, "[PATCH v2 06/10] btrfs: qgroup: Use
>> root->qgroup_meta_rsv_* to record qgroup meta reserved space" qgroup
>> meta reserve will only be increased if we succeeded in reserving
>> metadata, so later free won't underflow that number.
> 
> What should we do now when there are 2 different fixes? Applying Jeff's
> patch on top of your qgroup-types patches causes some conflicts that do
> not seem to be difficult, but the end result might not work as expected.

Jeff's patch is a more pinpoint solution to handle metadata reservation
error, while mine is a generic one which adds another layer to catch
possible underflow.

I think both could co-exist.

The only thing I'm not a fan is the return value of -ENODATA.
Despite that it should be fine.

Thanks,
Qu

> 
> The changes do not seem to be fundamentally conflicting, would it make
> sense to merge both?
> The patchset has been in for-next for a while but I
> don't run qgroup specific tests besides what's in fstests. Also the
> patchset fixes more problems so I think we need to merge it at some
> point and now it's a good time about deciding whether it'd go to 4.17.
> 
> I did a pass through the patches, there are some minor things to fix but
> a review from somebody with qgroup knowledge would be still desirable.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
diff mbox

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c1618ab9fecf..2d5e963fae88 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5988,20 +5988,18 @@  int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     u64 *qgroup_reserved,
 				     bool use_global_rsv)
 {
-	u64 num_bytes;
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	/* One for parent inode, two for dir entries */
+	u64 num_bytes = 3 * fs_info->nodesize;
 
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-		/* One for parent inode, two for dir entries */
-		num_bytes = 3 * fs_info->nodesize;
-		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
-		if (ret)
-			return ret;
-	} else {
+	ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+	if (ret == -ENODATA) {
 		num_bytes = 0;
-	}
+		ret = 0;
+	} else if (ret)
+		return ret;
 
 	*qgroup_reserved = num_bytes;
 
@@ -6057,6 +6055,7 @@  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 	int ret = 0;
 	bool delalloc_lock = true;
+	u64 qgroup_reserved;
 
 	/* If we are a free space inode we need to not flush since we will be in
 	 * the middle of a transaction commit.  We also don't need the delalloc
@@ -6090,17 +6089,17 @@  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-		ret = btrfs_qgroup_reserve_meta(root,
-				nr_extents * fs_info->nodesize, true);
-		if (ret)
-			goto out_fail;
-	}
+	qgroup_reserved = nr_extents * fs_info->nodesize;
+	ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, true);
+	if (ret == -ENODATA) {
+		ret = 0;
+		qgroup_reserved = 0;
+	} if (ret)
+		goto out_fail;
 
 	ret = btrfs_inode_rsv_refill(inode, flush);
 	if (unlikely(ret)) {
-		btrfs_qgroup_free_meta(root,
-				       nr_extents * fs_info->nodesize);
+		btrfs_qgroup_free_meta(root, qgroup_reserved);
 		goto out_fail;
 	}
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aa259d6986e1..5d9e011243c6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3025,7 +3025,7 @@  int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
 	    !is_fstree(root->objectid) || num_bytes == 0)
-		return 0;
+		return -ENODATA;
 
 	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
 	trace_qgroup_meta_reserve(root, (s64)num_bytes);
@@ -3057,7 +3057,7 @@  void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-	    !is_fstree(root->objectid))
+	    !is_fstree(root->objectid) || num_bytes == 0)
 		return;
 
 	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04f07144b45c..ab67b73bd7fa 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -510,7 +510,10 @@  start_transaction(struct btrfs_root *root, unsigned int num_items,
 		qgroup_reserved = num_items * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
 						enforce_qgroups);
-		if (ret)
+		if (ret == -ENODATA) {
+			ret = 0;
+			qgroup_reserved = 0;
+		} else if (ret)
 			return ERR_PTR(ret);
 
 		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);