[v2] ocfs2: check the metadate alloc before marking extent written
diff mbox

Message ID 5A261299.6080403@huawei.com
State New
Headers show

Commit Message

zhendong chen Dec. 5, 2017, 3:29 a.m. UTC
We need to check the free number of the records in each loop to mark
extent written, because the last extent block may be changed through
many times marking extent written and the 'num_free_extents' also be
changed. In the worst case, the 'num_free_extents' may become less
than the beginning of the loop. So we should not estimate the free
number of the records at the beginning of the loop to mark extent
written.

Reported-by: John Lightsey <john@nixnuts.net>
Signed-off-by: Alex Chen <alex.chen@huawei.com>
Reviewed-by: Jun Piao <piaojun@huawei.com>
---
 fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 13 deletions(-)

Comments

Joseph Qi Dec. 5, 2017, 5:38 a.m. UTC | #1
On 17/12/5 11:29, alex chen wrote:
> We need to check the free number of the records in each loop to mark
> extent written, because the last extent block may be changed through
> many times marking extent written and the 'num_free_extents' also be
> changed. In the worst case, the 'num_free_extents' may become less
> than the beginning of the loop. So we should not estimate the free
> number of the records at the beginning of the loop to mark extent
> written.
> 
> Reported-by: John Lightsey <john@nixnuts.net>
> Signed-off-by: Alex Chen <alex.chen@huawei.com>
> Reviewed-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>

> ---
>  fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 64 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index d151632..7e1659d 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>  	return ret;
>  }
> 
> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et,
> +		struct ocfs2_alloc_context *meta_ac, int max_rec_needed)
> +{
> +	int status = 0, free_extents;
> +
> +	free_extents = ocfs2_num_free_extents(et);
> +	if (free_extents < 0) {
> +		status = free_extents;
> +		mlog_errno(status);
> +		return status;
> +	}
> +
> +	/*
> +	 * there are two cases which could cause us to EAGAIN in the
> +	 * we-need-more-metadata case:
> +	 * 1) we haven't reserved *any*
> +	 * 2) we are so fragmented, we've needed to add metadata too
> +	 *    many times.
> +	 */
> +	if (free_extents < max_rec_needed) {
> +		if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac)
> +				< ocfs2_extend_meta_needed(et->et_root_el)))
> +			status = 1;
> +	}
> +
> +	return status;
> +}
> +
> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2
>  static int ocfs2_dio_end_io_write(struct inode *inode,
>  				  struct ocfs2_dio_write_ctxt *dwc,
>  				  loff_t offset,
> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>  	struct ocfs2_extent_tree et;
>  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>  	struct ocfs2_inode_info *oi = OCFS2_I(inode);
> -	struct ocfs2_unwritten_extent *ue = NULL;
> +	struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue;
>  	struct buffer_head *di_bh = NULL;
>  	struct ocfs2_dinode *di;
> -	struct ocfs2_alloc_context *data_ac = NULL;
>  	struct ocfs2_alloc_context *meta_ac = NULL;
>  	handle_t *handle = NULL;
>  	loff_t end = offset + bytes;
> -	int ret = 0, credits = 0, locked = 0;
> +	int ret = 0, credits = 0, locked = 0, restart = 0;
> 
>  	ocfs2_init_dealloc_ctxt(&dealloc);
> 
> @@ -2328,10 +2356,10 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
> 
>  	di = (struct ocfs2_dinode *)di_bh->b_data;
> 
> +restart_all:
>  	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
> -
> -	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
> -				    &data_ac, &meta_ac);
> +	ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT,
> +				    NULL, &meta_ac);
>  	if (ret) {
>  		mlog_errno(ret);
>  		goto unlock;
> @@ -2343,7 +2371,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>  	if (IS_ERR(handle)) {
>  		ret = PTR_ERR(handle);
>  		mlog_errno(ret);
> -		goto unlock;
> +		goto free_ac;
>  	}
>  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
>  				      OCFS2_JOURNAL_ACCESS_WRITE);
> @@ -2352,7 +2380,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>  		goto commit;
>  	}
> 
> -	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
> +	list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) {
> +		ret = ocfs2_dio_should_restart(&et, meta_ac,
> +				OCFS2_MAX_REC_NEEDED_SPLIT * 2);
> +		if (ret < 0) {
> +			mlog_errno(ret);
> +			break;
> +		} else if (ret == 1) {
> +			restart = 1;
> +			break;
> +		}
> +
>  		ret = ocfs2_mark_extent_written(inode, &et, handle,
>  						ue->ue_cpos, 1,
>  						ue->ue_phys,
> @@ -2361,24 +2399,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>  			mlog_errno(ret);
>  			break;
>  		}
> +
> +		dwc->dw_zero_count--;
> +		list_del_init(&ue->ue_node);
> +		spin_lock(&oi->ip_lock);
> +		list_del_init(&ue->ue_ip_node);
> +		spin_unlock(&oi->ip_lock);
> +		kfree(ue);
>  	}
> 
> -	if (end > i_size_read(inode)) {
> +	if (!restart && end > i_size_read(inode)) {
>  		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>  		if (ret < 0)
>  			mlog_errno(ret);
>  	}
> +
>  commit:
>  	ocfs2_commit_trans(osb, handle);
> +free_ac:
> +	if (meta_ac) {
> +		ocfs2_free_alloc_context(meta_ac);
> +		meta_ac = NULL;
> +	}
> +	if (restart) {
> +		restart = 0;
> +		goto restart_all;
> +	}
>  unlock:
>  	up_write(&oi->ip_alloc_sem);
>  	ocfs2_inode_unlock(inode, 1);
>  	brelse(di_bh);
>  out:
> -	if (data_ac)
> -		ocfs2_free_alloc_context(data_ac);
> -	if (meta_ac)
> -		ocfs2_free_alloc_context(meta_ac);
>  	ocfs2_run_deallocs(osb, &dealloc);
>  	if (locked)
>  		inode_unlock(inode);
>
Changwei Ge Dec. 12, 2017, 1:05 a.m. UTC | #2
Hi Alex,

On 2017/12/5 11:31, alex chen wrote:
> We need to check the free number of the records in each loop to mark
> extent written, because the last extent block may be changed through
> many times marking extent written and the 'num_free_extents' also be
> changed. In the worst case, the 'num_free_extents' may become less
> than the beginning of the loop. So we should not estimate the free
> number of the records at the beginning of the loop to mark extent
> written.
> 
> Reported-by: John Lightsey <john@nixnuts.net>
> Signed-off-by: Alex Chen <alex.chen@huawei.com>
> Reviewed-by: Jun Piao <piaojun@huawei.com>
> ---
>   fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>   1 file changed, 64 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index d151632..7e1659d 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>   	return ret;
>   }
> 
> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et,
> +		struct ocfs2_alloc_context *meta_ac, int max_rec_needed)
> +{
> +	int status = 0, free_extents;
> +
> +	free_extents = ocfs2_num_free_extents(et);
> +	if (free_extents < 0) {
> +		status = free_extents;
> +		mlog_errno(status);
> +		return status;
> +	}
> +
> +	/*
> +	 * there are two cases which could cause us to EAGAIN in the
> +	 * we-need-more-metadata case:
> +	 * 1) we haven't reserved *any*
> +	 * 2) we are so fragmented, we've needed to add metadata too
> +	 *    many times.
> +	 */
> +	if (free_extents < max_rec_needed) {
> +		if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac)
> +				< ocfs2_extend_meta_needed(et->et_root_el)))
> +			status = 1;
> +	}
> +
> +	return status;
> +}
> +
> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2
>   static int ocfs2_dio_end_io_write(struct inode *inode,
>   				  struct ocfs2_dio_write_ctxt *dwc,
>   				  loff_t offset,
> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   	struct ocfs2_extent_tree et;
>   	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>   	struct ocfs2_inode_info *oi = OCFS2_I(inode);
> -	struct ocfs2_unwritten_extent *ue = NULL;
> +	struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue;
>   	struct buffer_head *di_bh = NULL;
>   	struct ocfs2_dinode *di;
> -	struct ocfs2_alloc_context *data_ac = NULL;
>   	struct ocfs2_alloc_context *meta_ac = NULL;
>   	handle_t *handle = NULL;
>   	loff_t end = offset + bytes;
> -	int ret = 0, credits = 0, locked = 0;
> +	int ret = 0, credits = 0, locked = 0, restart = 0;
> 
>   	ocfs2_init_dealloc_ctxt(&dealloc);
> 
> @@ -2328,10 +2356,10 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
> 
>   	di = (struct ocfs2_dinode *)di_bh->b_data;
> 
> +restart_all:
>   	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
> -
> -	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
> -				    &data_ac, &meta_ac);
> +	ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT,
> +				    NULL, &meta_ac);
>   	if (ret) {
>   		mlog_errno(ret);
>   		goto unlock;
> @@ -2343,7 +2371,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   	if (IS_ERR(handle)) {
>   		ret = PTR_ERR(handle);
>   		mlog_errno(ret);
> -		goto unlock;
> +		goto free_ac;
>   	}
>   	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
>   				      OCFS2_JOURNAL_ACCESS_WRITE);
> @@ -2352,7 +2380,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   		goto commit;
>   	}
> 
> -	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
> +	list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) {
> +		ret = ocfs2_dio_should_restart(&et, meta_ac,
> +				OCFS2_MAX_REC_NEEDED_SPLIT * 2);
> +		if (ret < 0) {
> +			mlog_errno(ret);
> +			break;
> +		} else if (ret == 1) {
> +			restart = 1;

If there isn't enough extent records here, you break this loop and also 
commit transaction.
After re-executing from _restart_all_, a new transaction is started.
So you separate 'before _restart_all_' and 'after _restart_all_' into 
different transactions.


> +			break;
> +		}
> +
>   		ret = ocfs2_mark_extent_written(inode, &et, handle,
>   						ue->ue_cpos, 1,
>   						ue->ue_phys,
> @@ -2361,24 +2399,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   			mlog_errno(ret);
>   			break;
>   		}
> +
> +		dwc->dw_zero_count--;
> +		list_del_init(&ue->ue_node);
> +		spin_lock(&oi->ip_lock);
> +		list_del_init(&ue->ue_ip_node);
> +		spin_unlock(&oi->ip_lock);
> +		kfree(ue);
>   	}
> 
> -	if (end > i_size_read(inode)) {
> +	if (!restart && end > i_size_read(inode)) {

Here you don't change inode size, however, obviously you already mark 
extents as *written*.

>   		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>   		if (ret < 0)
>   			mlog_errno(ret);
>   	}
> +
>   commit:
>   	ocfs2_commit_trans(osb, handle);

You commit transaction here and host just crashes at this point.
Then you will have a file with a smaller size than its real size.
Moreover, those space could not be declaimed.

Should we do more work on JBD2?

Thanks,
Changwei


> +free_ac:
> +	if (meta_ac) {
> +		ocfs2_free_alloc_context(meta_ac);
> +		meta_ac = NULL;
> +	}
> +	if (restart) {
> +		restart = 0;
> +		goto restart_all;
> +	}
>   unlock:
>   	up_write(&oi->ip_alloc_sem);
>   	ocfs2_inode_unlock(inode, 1);
>   	brelse(di_bh);
>   out:
> -	if (data_ac)
> -		ocfs2_free_alloc_context(data_ac);
> -	if (meta_ac)
> -		ocfs2_free_alloc_context(meta_ac);
>   	ocfs2_run_deallocs(osb, &dealloc);
>   	if (locked)
>   		inode_unlock(inode);
>
zhendong chen Dec. 13, 2017, 7:24 a.m. UTC | #3
Hi Changwei,

On 2017/12/12 9:05, Changwei Ge wrote:
> Hi Alex,
> 
> On 2017/12/5 11:31, alex chen wrote:
>> We need to check the free number of the records in each loop to mark
>> extent written, because the last extent block may be changed through
>> many times marking extent written and the 'num_free_extents' also be
>> changed. In the worst case, the 'num_free_extents' may become less
>> than the beginning of the loop. So we should not estimate the free
>> number of the records at the beginning of the loop to mark extent
>> written.
>>
>> Reported-by: John Lightsey <john@nixnuts.net>
>> Signed-off-by: Alex Chen <alex.chen@huawei.com>
>> Reviewed-by: Jun Piao <piaojun@huawei.com>
>> ---
>>   fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>>   1 file changed, 64 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index d151632..7e1659d 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
>> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>>   	return ret;
>>   }
>>
>> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et,
>> +		struct ocfs2_alloc_context *meta_ac, int max_rec_needed)
>> +{
>> +	int status = 0, free_extents;
>> +
>> +	free_extents = ocfs2_num_free_extents(et);
>> +	if (free_extents < 0) {
>> +		status = free_extents;
>> +		mlog_errno(status);
>> +		return status;
>> +	}
>> +
>> +	/*
>> +	 * there are two cases which could cause us to EAGAIN in the
>> +	 * we-need-more-metadata case:
>> +	 * 1) we haven't reserved *any*
>> +	 * 2) we are so fragmented, we've needed to add metadata too
>> +	 *    many times.
>> +	 */
>> +	if (free_extents < max_rec_needed) {
>> +		if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac)
>> +				< ocfs2_extend_meta_needed(et->et_root_el)))
>> +			status = 1;
>> +	}
>> +
>> +	return status;
>> +}
>> +
>> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2
>>   static int ocfs2_dio_end_io_write(struct inode *inode,
>>   				  struct ocfs2_dio_write_ctxt *dwc,
>>   				  loff_t offset,
>> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>   	struct ocfs2_extent_tree et;
>>   	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>   	struct ocfs2_inode_info *oi = OCFS2_I(inode);
>> -	struct ocfs2_unwritten_extent *ue = NULL;
>> +	struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue;
>>   	struct buffer_head *di_bh = NULL;
>>   	struct ocfs2_dinode *di;
>> -	struct ocfs2_alloc_context *data_ac = NULL;
>>   	struct ocfs2_alloc_context *meta_ac = NULL;
>>   	handle_t *handle = NULL;
>>   	loff_t end = offset + bytes;
>> -	int ret = 0, credits = 0, locked = 0;
>> +	int ret = 0, credits = 0, locked = 0, restart = 0;
>>
>>   	ocfs2_init_dealloc_ctxt(&dealloc);
>>
>> @@ -2328,10 +2356,10 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>
>>   	di = (struct ocfs2_dinode *)di_bh->b_data;
>>
>> +restart_all:
>>   	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>> -
>> -	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>> -				    &data_ac, &meta_ac);
>> +	ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT,
>> +				    NULL, &meta_ac);
>>   	if (ret) {
>>   		mlog_errno(ret);
>>   		goto unlock;
>> @@ -2343,7 +2371,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>   	if (IS_ERR(handle)) {
>>   		ret = PTR_ERR(handle);
>>   		mlog_errno(ret);
>> -		goto unlock;
>> +		goto free_ac;
>>   	}
>>   	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
>>   				      OCFS2_JOURNAL_ACCESS_WRITE);
>> @@ -2352,7 +2380,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>   		goto commit;
>>   	}
>>
>> -	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
>> +	list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) {
>> +		ret = ocfs2_dio_should_restart(&et, meta_ac,
>> +				OCFS2_MAX_REC_NEEDED_SPLIT * 2);
>> +		if (ret < 0) {
>> +			mlog_errno(ret);
>> +			break;
>> +		} else if (ret == 1) {
>> +			restart = 1;
> 
> If there isn't enough extent records here, you break this loop and also 
> commit transaction.
> After re-executing from _restart_all_, a new transaction is started.
> So you separate 'before _restart_all_' and 'after _restart_all_' into 
> different transactions.
> 
> 
>> +			break;
>> +		}
>> +
>>   		ret = ocfs2_mark_extent_written(inode, &et, handle,
>>   						ue->ue_cpos, 1,
>>   						ue->ue_phys,
>> @@ -2361,24 +2399,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>   			mlog_errno(ret);
>>   			break;
>>   		}
>> +
>> +		dwc->dw_zero_count--;
>> +		list_del_init(&ue->ue_node);
>> +		spin_lock(&oi->ip_lock);
>> +		list_del_init(&ue->ue_ip_node);
>> +		spin_unlock(&oi->ip_lock);
>> +		kfree(ue);
>>   	}
>>
>> -	if (end > i_size_read(inode)) {
>> +	if (!restart && end > i_size_read(inode)) {
> 
> Here you don't change inode size, however, obviously you already mark 
> extents as *written*.

Here we should not change inode size, otherwise, the user may read some of the data
he wrote and the other part of data is zero, which breaks the consistency of WRITE.
Here it just mark extents to written and do not allocated data spaces.

> 
>>   		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>>   		if (ret < 0)
>>   			mlog_errno(ret);
>>   	}
>> +
>>   commit:
>>   	ocfs2_commit_trans(osb, handle);
> 
> You commit transaction here and host just crashes at this point.
> Then you will have a file with a smaller size than its real size.
> Moreover, those space could not be declaimed.

The problem you described above is exist. But it is a new problem and is not introduced by this patch.
It is introduced in commit c15471f79506(ocfs2: fix sparse file & data ordering issue in direct io).
Those space was allocated in ocfs2_dio_wr_get_block()->ocfs2_write_begin_nolock()->
ocfs2_write_cluster_by_desc() and it could not be declaimed when any errors happen in
ocfs2_dio_end_io_write().
This patch corrects the calculation method of the free number of the records.

At present, I have no idea to solve the new problem you described above, I think you can send the patch
to solve this problem if you have a good idea.

Thanks,
Alex

> 
> Should we do more work on JBD2?
> 
> Thanks,
> Changwei
> 
> 
>> +free_ac:
>> +	if (meta_ac) {
>> +		ocfs2_free_alloc_context(meta_ac);
>> +		meta_ac = NULL;
>> +	}
>> +	if (restart) {
>> +		restart = 0;
>> +		goto restart_all;
>> +	}
>>   unlock:
>>   	up_write(&oi->ip_alloc_sem);
>>   	ocfs2_inode_unlock(inode, 1);
>>   	brelse(di_bh);
>>   out:
>> -	if (data_ac)
>> -		ocfs2_free_alloc_context(data_ac);
>> -	if (meta_ac)
>> -		ocfs2_free_alloc_context(meta_ac);
>>   	ocfs2_run_deallocs(osb, &dealloc);
>>   	if (locked)
>>   		inode_unlock(inode);
>>
> 
> 
> .
>
Changwei Ge Dec. 14, 2017, 12:56 a.m. UTC | #4
On 2017/12/13 15:25, alex chen wrote:
> Hi Changwei,
> 
> On 2017/12/12 9:05, Changwei Ge wrote:
>> Hi Alex,
>>
>> On 2017/12/5 11:31, alex chen wrote:
>>> We need to check the free number of the records in each loop to mark
>>> extent written, because the last extent block may be changed through
>>> many times marking extent written and the 'num_free_extents' also be
>>> changed. In the worst case, the 'num_free_extents' may become less
>>> than the beginning of the loop. So we should not estimate the free
>>> number of the records at the beginning of the loop to mark extent
>>> written.
>>>
>>> Reported-by: John Lightsey <john@nixnuts.net>
>>> Signed-off-by: Alex Chen <alex.chen@huawei.com>
>>> Reviewed-by: Jun Piao <piaojun@huawei.com>
>>> ---
>>>    fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>>>    1 file changed, 64 insertions(+), 13 deletions(-)
>>>

>>
>> If there isn't enough extent records here, you break this loop and also
>> commit transaction.
>> After re-executing from _restart_all_, a new transaction is started.
>> So you separate 'before _restart_all_' and 'after _restart_all_' into
>> different transactions.
>>
>>
>>>
>>> -	if (end > i_size_read(inode)) {
>>> +	if (!restart && end > i_size_read(inode)) {
>>
>> Here you don't change inode size, however, obviously you already mark
>> extents as *written*.
> 
> Here we should not change inode size, otherwise, the user may read some of the data
> he wrote and the other part of data is zero, which breaks the consistency of WRITE.
> Here it just mark extents to written and do not allocated data spaces.

Very true.

> 
>>
>>>    		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>>>    		if (ret < 0)
>>>    			mlog_errno(ret);
>>>    	}
>>> +
>>>    commit:
>>>    	ocfs2_commit_trans(osb, handle);
>>
>> You commit transaction here and host just crashes at this point.
>> Then you will have a file with a smaller size than its real size.
>> Moreover, those space could not be declaimed.
> 
> The problem you described above is exist. But it is a new problem and is not introduced by this patch.
> It is introduced in commit c15471f79506(ocfs2: fix sparse file & data ordering issue in direct io).
> Those space was allocated in ocfs2_dio_wr_get_block()->ocfs2_write_begin_nolock()->
> ocfs2_write_cluster_by_desc() and it could not be declaimed when any errors happen in
> ocfs2_dio_end_io_write().

Hi Alex,
Actually, 1)the space can be declaimed via ocfs2 journal recovery by other nodes or itself during its
next mount, since it also resided in orphan directory. 2)If deleting corresponding inode is done, I suppose,
unwritten cluster might be declaimed by _fsck_ with a *consistent* inode size.(not very sure on this point).

The original version of ocfs2_dio_end_io_write() before your patch being applied, it ties _mark_extent_written_
to _set_inode_size_ in an unique jbd2 transaction. So it can guarantee the consistency between changing them.

As for your patch, you split them into different transactions which breaks the consistency, since your patch
starts and commit a transaction once _restart_all_ is needed(2 times at least), but only the last jbd2 committed
transaction includes changing inode size.

I think your way to fix this issue is acceptable.

If you have no idea how to improve patch, I will try to compose a patch to do some help based on your patch.

Perhaps Andrew can fold them into a single one, it's up to Andrew.

Thanks,
Changwei

> This patch corrects the calculation method of the free number of the records.
> 
> At present, I have no idea to solve the new problem you described above, I think you can send the patch
> to solve this problem if you have a good idea.
> 
> Thanks,
> Alex
> 
>>
>> Should we do more work on JBD2?
>>
>> Thanks,
>> Changwei
>>
>>
>>> +free_ac:
>>> +	if (meta_ac) {
>>> +		ocfs2_free_alloc_context(meta_ac);
>>> +		meta_ac = NULL;
>>> +	}
>>> +	if (restart) {
>>> +		restart = 0;
>>> +		goto restart_all;
>>> +	}
>>>    unlock:
>>>    	up_write(&oi->ip_alloc_sem);

> 
>
zhendong chen Dec. 14, 2017, 3:01 a.m. UTC | #5
Hi Changwei,

On 2017/12/14 8:56, Changwei Ge wrote:
> On 2017/12/13 15:25, alex chen wrote:
>> Hi Changwei,
>>
>> On 2017/12/12 9:05, Changwei Ge wrote:
>>> Hi Alex,
>>>
>>> On 2017/12/5 11:31, alex chen wrote:
>>>> We need to check the free number of the records in each loop to mark
>>>> extent written, because the last extent block may be changed through
>>>> many times marking extent written and the 'num_free_extents' also be
>>>> changed. In the worst case, the 'num_free_extents' may become less
>>>> than the beginning of the loop. So we should not estimate the free
>>>> number of the records at the beginning of the loop to mark extent
>>>> written.
>>>>
>>>> Reported-by: John Lightsey <john@nixnuts.net>
>>>> Signed-off-by: Alex Chen <alex.chen@huawei.com>
>>>> Reviewed-by: Jun Piao <piaojun@huawei.com>
>>>> ---
>>>>    fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>>>>    1 file changed, 64 insertions(+), 13 deletions(-)
>>>>
> 
>>>
>>> If there isn't enough extent records here, you break this loop and also
>>> commit transaction.
>>> After re-executing from _restart_all_, a new transaction is started.
>>> So you separate 'before _restart_all_' and 'after _restart_all_' into
>>> different transactions.
>>>
>>>
>>>>
>>>> -	if (end > i_size_read(inode)) {
>>>> +	if (!restart && end > i_size_read(inode)) {
>>>
>>> Here you don't change inode size, however, obviously you already mark
>>> extents as *written*.
>>
>> Here we should not change inode size, otherwise, the user may read some of the data
>> he wrote and the other part of data is zero, which breaks the consistency of WRITE.
>> Here it just mark extents to written and do not allocated data spaces.
> 
> Very true.
> 
>>
>>>
>>>>    		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>>>>    		if (ret < 0)
>>>>    			mlog_errno(ret);
>>>>    	}
>>>> +
>>>>    commit:
>>>>    	ocfs2_commit_trans(osb, handle);
>>>
>>> You commit transaction here and host just crashes at this point.
>>> Then you will have a file with a smaller size than its real size.
>>> Moreover, those space could not be declaimed.
>>
>> The problem you described above is exist. But it is a new problem and is not introduced by this patch.
>> It is introduced in commit c15471f79506(ocfs2: fix sparse file & data ordering issue in direct io).
>> Those space was allocated in ocfs2_dio_wr_get_block()->ocfs2_write_begin_nolock()->
>> ocfs2_write_cluster_by_desc() and it could not be declaimed when any errors happen in
>> ocfs2_dio_end_io_write().
> 
> Hi Alex,
> Actually, 1)the space can be declaimed via ocfs2 journal recovery by other nodes or itself during its
> next mount, since it also resided in orphan directory. 2)If deleting corresponding inode is done, I suppose,
> unwritten cluster might be declaimed by _fsck_ with a *consistent* inode size.(not very sure on this point).
> 
I have a different understanding about this.
I think the space can be declaimed until the inode is deleted from orphan directory, Do you agree?
In ocfs2_dio_end_io_write(), we deleted the inode from orphan directory before calling ocfs2_lock_allocators()
and ocfs2_mark_extent_written. In other words, the space can't be declaimed when any errors happened in the
process of marking extent to written unless the file is deleted or we run fsck.ocfs2.

> The original version of ocfs2_dio_end_io_write() before your patch being applied, it ties _mark_extent_written_
> to _set_inode_size_ in an unique jbd2 transaction. So it can guarantee the consistency between changing them.
> 
Yes.

> As for your patch, you split them into different transactions which breaks the consistency, since your patch
> starts and commit a transaction once _restart_all_ is needed(2 times at least), but only the last jbd2 committed
> transaction includes changing inode size.
I have understood it. This is indeed a point that can be optimized. Do you have any good suggestions?

Thanks,
Alex
> 
> I think your way to fix this issue is acceptable.
> 
> If you have no idea how to improve patch, I will try to compose a patch to do some help based on your patch.
> 
I am honored to be with you to solve this problem.

> Perhaps Andrew can fold them into a single one, it's up to Andrew.
> 
> Thanks,
> Changwei
> 
>> This patch corrects the calculation method of the free number of the records.
>>
>> At present, I have no idea to solve the new problem you described above, I think you can send the patch
>> to solve this problem if you have a good idea.
>>
>> Thanks,
>> Alex
>>
>>>
>>> Should we do more work on JBD2?
>>>
>>> Thanks,
>>> Changwei
>>>
>>>
>>>> +free_ac:
>>>> +	if (meta_ac) {
>>>> +		ocfs2_free_alloc_context(meta_ac);
>>>> +		meta_ac = NULL;
>>>> +	}
>>>> +	if (restart) {
>>>> +		restart = 0;
>>>> +		goto restart_all;
>>>> +	}
>>>>    unlock:
>>>>    	up_write(&oi->ip_alloc_sem);
> 
>>
>>
> 
> 
> .
>
Changwei Ge Dec. 14, 2017, 6:01 a.m. UTC | #6
On 2017/12/14 11:04, alex chen wrote:
> Hi Changwei,
> 
> On 2017/12/14 8:56, Changwei Ge wrote:
>> On 2017/12/13 15:25, alex chen wrote:
>>> Hi Changwei,
>>>
>>> On 2017/12/12 9:05, Changwei Ge wrote:
>>>> Hi Alex,
>>>>
>>>> On 2017/12/5 11:31, alex chen wrote:
>>>>> We need to check the free number of the records in each loop to mark
>>>>> extent written, because the last extent block may be changed through
>>>>> many times marking extent written and the 'num_free_extents' also be
>>>>> changed. In the worst case, the 'num_free_extents' may become less
>>>>> than the beginning of the loop. So we should not estimate the free
>>>>> number of the records at the beginning of the loop to mark extent
>>>>> written.
>>>>>
>>>>> Reported-by: John Lightsey <john@nixnuts.net>
>>>>> Signed-off-by: Alex Chen <alex.chen@huawei.com>
>>>>> Reviewed-by: Jun Piao <piaojun@huawei.com>
>>>>> ---
>>>>>     fs/ocfs2/aops.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----------
>>>>>     1 file changed, 64 insertions(+), 13 deletions(-)
>>>>>
>>
>>>>
>>>> If there isn't enough extent records here, you break this loop and also
>>>> commit transaction.
>>>> After re-executing from _restart_all_, a new transaction is started.
>>>> So you separate 'before _restart_all_' and 'after _restart_all_' into
>>>> different transactions.
>>>>
>>>>
>>>>>
>>>>> -	if (end > i_size_read(inode)) {
>>>>> +	if (!restart && end > i_size_read(inode)) {
>>>>
>>>> Here you don't change inode size, however, obviously you already mark
>>>> extents as *written*.
>>>
>>> Here we should not change inode size, otherwise, the user may read some of the data
>>> he wrote and the other part of data is zero, which breaks the consistency of WRITE.
>>> Here it just mark extents to written and do not allocated data spaces.
>>
>> Very true.
>>
>>>
>>>>
>>>>>     		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
>>>>>     		if (ret < 0)
>>>>>     			mlog_errno(ret);
>>>>>     	}
>>>>> +
>>>>>     commit:
>>>>>     	ocfs2_commit_trans(osb, handle);
>>>>
>>>> You commit transaction here and host just crashes at this point.
>>>> Then you will have a file with a smaller size than its real size.
>>>> Moreover, those space could not be declaimed.
>>>
>>> The problem you described above is exist. But it is a new problem and is not introduced by this patch.
>>> It is introduced in commit c15471f79506(ocfs2: fix sparse file & data ordering issue in direct io).
>>> Those space was allocated in ocfs2_dio_wr_get_block()->ocfs2_write_begin_nolock()->
>>> ocfs2_write_cluster_by_desc() and it could not be declaimed when any errors happen in
>>> ocfs2_dio_end_io_write().
>>
>> Hi Alex,
>> Actually, 1)the space can be declaimed via ocfs2 journal recovery by other nodes or itself during its
>> next mount, since it also resided in orphan directory. 2)If deleting corresponding inode is done, I suppose,
>> unwritten cluster might be declaimed by _fsck_ with a *consistent* inode size.(not very sure on this point).
>>
> I have a different understanding about this.
> I think the space can be declaimed until the inode is deleted from orphan directory, Do you agree?
Hi Alex,

Um, I am going to emphasize my point.
If system crash happens,
an inode which resides in orphan directory will get a chance to declaim clusters via journal replay.
The replay operation can be performed from other 1)cluster member nodes or 2)itself during its next mount.
Before deleting inode from orphan directory, we don't have to depend fsck tool to declaim clusters.

But, after deleting inode from orphan directory, journal replay can't find these leaked space.
So we might need fsck tool to get those space back.

> In ocfs2_dio_end_io_write(), we deleted the inode from orphan directory before calling ocfs2_lock_allocators()
> and ocfs2_mark_extent_written. In other words, the space can't be declaimed when any errors happened in the
> process of marking extent to written unless the file is deleted or we run fsck.ocfs2.
> 
>> The original version of ocfs2_dio_end_io_write() before your patch being applied, it ties _mark_extent_written_
>> to _set_inode_size_ in an unique jbd2 transaction. So it can guarantee the consistency between changing them.
>>
> Yes.
> 
>> As for your patch, you split them into different transactions which breaks the consistency, since your patch
>> starts and commit a transaction once _restart_all_ is needed(2 times at least), but only the last jbd2 committed
>> transaction includes changing inode size.
> I have understood it. This is indeed a point that can be optimized. Do you have any good suggestions?

I will post a patch based on your fix.

> 
> Thanks,
> Alex
>>
>> I think your way to fix this issue is acceptable.
>>
>> If you have no idea how to improve patch, I will try to compose a patch to do some help based on your patch.
>>
> I am honored to be with you to solve this problem.

:) My pleasure, too.

Thanks,
Changwei

> 
>> Perhaps Andrew can fold them into a single one, it's up to Andrew.
>>
>> Thanks,
>> Changwei
>>
>>> This patch corrects the calculation method of the free number of the records.
>>>
>>> At present, I have no idea to solve the new problem you described above, I think you can send the patch
>>> to solve this problem if you have a good idea.
>>>
>>> Thanks,
>>> Alex
>>>
>>>>
>>>> Should we do more work on JBD2?
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>
>>>>> +free_ac:
>>>>> +	if (meta_ac) {
>>>>> +		ocfs2_free_alloc_context(meta_ac);
>>>>> +		meta_ac = NULL;
>>>>> +	}
>>>>> +	if (restart) {
>>>>> +		restart = 0;
>>>>> +		goto restart_all;
>>>>> +	}
>>>>>     unlock:
>>>>>     	up_write(&oi->ip_alloc_sem);
>>
>>>
>>>
>>
>>
>> .
>>
> 
>

Patch
diff mbox

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d151632..7e1659d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2272,6 +2272,35 @@  static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 	return ret;
 }

+static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et,
+		struct ocfs2_alloc_context *meta_ac, int max_rec_needed)
+{
+	int status = 0, free_extents;
+
+	free_extents = ocfs2_num_free_extents(et);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		return status;
+	}
+
+	/*
+	 * there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times.
+	 */
+	if (free_extents < max_rec_needed) {
+		if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac)
+				< ocfs2_extend_meta_needed(et->et_root_el)))
+			status = 1;
+	}
+
+	return status;
+}
+
+#define OCFS2_MAX_REC_NEEDED_SPLIT 2
 static int ocfs2_dio_end_io_write(struct inode *inode,
 				  struct ocfs2_dio_write_ctxt *dwc,
 				  loff_t offset,
@@ -2281,14 +2310,13 @@  static int ocfs2_dio_end_io_write(struct inode *inode,
 	struct ocfs2_extent_tree et;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_unwritten_extent *ue = NULL;
+	struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue;
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
 	loff_t end = offset + bytes;
-	int ret = 0, credits = 0, locked = 0;
+	int ret = 0, credits = 0, locked = 0, restart = 0;

 	ocfs2_init_dealloc_ctxt(&dealloc);

@@ -2328,10 +2356,10 @@  static int ocfs2_dio_end_io_write(struct inode *inode,

 	di = (struct ocfs2_dinode *)di_bh->b_data;

+restart_all:
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
-
-	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
-				    &data_ac, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT,
+				    NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		goto unlock;
@@ -2343,7 +2371,7 @@  static int ocfs2_dio_end_io_write(struct inode *inode,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto unlock;
+		goto free_ac;
 	}
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2352,7 +2380,17 @@  static int ocfs2_dio_end_io_write(struct inode *inode,
 		goto commit;
 	}

-	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+	list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) {
+		ret = ocfs2_dio_should_restart(&et, meta_ac,
+				OCFS2_MAX_REC_NEEDED_SPLIT * 2);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		} else if (ret == 1) {
+			restart = 1;
+			break;
+		}
+
 		ret = ocfs2_mark_extent_written(inode, &et, handle,
 						ue->ue_cpos, 1,
 						ue->ue_phys,
@@ -2361,24 +2399,37 @@  static int ocfs2_dio_end_io_write(struct inode *inode,
 			mlog_errno(ret);
 			break;
 		}
+
+		dwc->dw_zero_count--;
+		list_del_init(&ue->ue_node);
+		spin_lock(&oi->ip_lock);
+		list_del_init(&ue->ue_ip_node);
+		spin_unlock(&oi->ip_lock);
+		kfree(ue);
 	}

-	if (end > i_size_read(inode)) {
+	if (!restart && end > i_size_read(inode)) {
 		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
 		if (ret < 0)
 			mlog_errno(ret);
 	}
+
 commit:
 	ocfs2_commit_trans(osb, handle);
+free_ac:
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if (restart) {
+		restart = 0;
+		goto restart_all;
+	}
 unlock:
 	up_write(&oi->ip_alloc_sem);
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 out:
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
 	ocfs2_run_deallocs(osb, &dealloc);
 	if (locked)
 		inode_unlock(inode);