Message ID | 5A25F95B.5050600@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 17/12/5 09:41, alex chen wrote: > We need to check the free number of the records in each loop to mark > extent written, because the last extent block may be changed through > many times marking extent written and the 'num_free_extents' also be > changed. In the worst case, the 'num_free_extents' may become less > than the beginning of the loop. So we should not estimate the free > number of the records at the beginning of the loop to mark extent > written. > > Reported-by: John Lightsey <john@nixnuts.net> > Signed-off-by: Alex Chen <alex.chen@huawei.com> > Reviewed-by: Jun Piao <piaojun@huawei.com> > --- > fs/ocfs2/aops.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- > 1 file changed, 64 insertions(+), 12 deletions(-) > > diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c > index d151632..702aebc 100644 > --- a/fs/ocfs2/aops.c > +++ b/fs/ocfs2/aops.c > @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, > return ret; > } > > +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, > + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) > +{ > + int status = 0, free_extents; > + > + free_extents = ocfs2_num_free_extents(et); > + if (free_extents < 0) { > + status = free_extents; > + mlog_errno(status); > + return status; > + } > + > + /* > + * there are two cases which could cause us to EAGAIN in the > + * we-need-more-metadata case: > + * 1) we haven't reserved *any* > + * 2) we are so fragmented, we've needed to add metadata too > + * many times. > + */ > + if (free_extents < max_rec_needed) { > + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) > + < ocfs2_extend_meta_needed(et->et_root_el))) > + status = 1; > + } > + > + return status; > +} > + > +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 > static int ocfs2_dio_end_io_write(struct inode *inode, > struct ocfs2_dio_write_ctxt *dwc, > loff_t offset, > @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, > struct ocfs2_extent_tree et; > struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > struct ocfs2_inode_info *oi = OCFS2_I(inode); > - struct ocfs2_unwritten_extent *ue = NULL; > + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; > struct buffer_head *di_bh = NULL; > struct ocfs2_dinode *di; > - struct ocfs2_alloc_context *data_ac = NULL; > struct ocfs2_alloc_context *meta_ac = NULL; > handle_t *handle = NULL; > loff_t end = offset + bytes; > - int ret = 0, credits = 0, locked = 0; > + int ret = 0, credits = 0, locked = 0, restart = 0; > > ocfs2_init_dealloc_ctxt(&dealloc); > > @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, > Should we restart here? > ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); > > - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, > - &data_ac, &meta_ac); > +restart_all: > + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, > + NULL, &meta_ac); > if (ret) { > mlog_errno(ret); > goto unlock; > @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, > if (IS_ERR(handle)) { > ret = PTR_ERR(handle); > mlog_errno(ret); > - goto unlock; > + goto free_ac; I don't think it is a necessary change here. > } > ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, > OCFS2_JOURNAL_ACCESS_WRITE); > @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, > goto commit; > } > > - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { > + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { > + ret = ocfs2_dio_should_restart(&et, meta_ac, > + OCFS2_MAX_REC_NEEDED_SPLIT * 2); > + if (ret < 0) { > + mlog_errno(ret); > + break; > + } else if (ret == 1) { > + restart = 1; > + break; > + } > + > ret = ocfs2_mark_extent_written(inode, &et, handle, > ue->ue_cpos, 1, > ue->ue_phys, > @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, > mlog_errno(ret); > break; > } > + > + dwc->dw_zero_count--; > + list_del_init(&ue->ue_node); > + spin_lock(&oi->ip_lock); > + list_del_init(&ue->ue_ip_node); > + spin_unlock(&oi->ip_lock); > + kfree(ue); > } > > - if (end > i_size_read(inode)) { > + if (!restart && end > i_size_read(inode)) { > ret = ocfs2_set_inode_size(handle, inode, di_bh, end); > if (ret < 0) > mlog_errno(ret); > } > + > commit: > ocfs2_commit_trans(osb, handle); > +free_ac: > + if (meta_ac) { > + ocfs2_free_alloc_context(meta_ac); > + meta_ac = NULL; > + } > + if (restart) { > + restart = 0; > + goto restart_all; > + } > unlock: > up_write(&oi->ip_alloc_sem); > ocfs2_inode_unlock(inode, 1); > brelse(di_bh); > out: > - if (data_ac) > - ocfs2_free_alloc_context(data_ac); Agree to cleanup the unused data_ac. > - if (meta_ac) > - ocfs2_free_alloc_context(meta_ac);Just move the restart logic down is enough. Thanks, Joseph > ocfs2_run_deallocs(osb, &dealloc); > if (locked) > inode_unlock(inode); >
Hi Joseph, Thanks for your reviews. On 2017/12/5 10:21, Joseph Qi wrote: > > > On 17/12/5 09:41, alex chen wrote: >> We need to check the free number of the records in each loop to mark >> extent written, because the last extent block may be changed through >> many times marking extent written and the 'num_free_extents' also be >> changed. In the worst case, the 'num_free_extents' may become less >> than the beginning of the loop. So we should not estimate the free >> number of the records at the beginning of the loop to mark extent >> written. >> >> Reported-by: John Lightsey <john@nixnuts.net> >> Signed-off-by: Alex Chen <alex.chen@huawei.com> >> Reviewed-by: Jun Piao <piaojun@huawei.com> >> --- >> fs/ocfs2/aops.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- >> 1 file changed, 64 insertions(+), 12 deletions(-) >> >> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c >> index d151632..702aebc 100644 >> --- a/fs/ocfs2/aops.c >> +++ b/fs/ocfs2/aops.c >> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, >> return ret; >> } >> >> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, >> + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) >> +{ >> + int status = 0, free_extents; >> + >> + free_extents = ocfs2_num_free_extents(et); >> + if (free_extents < 0) { >> + status = free_extents; >> + mlog_errno(status); >> + return status; >> + } >> + >> + /* >> + * there are two cases which could cause us to EAGAIN in the >> + * we-need-more-metadata case: >> + * 1) we haven't reserved *any* >> + * 2) we are so fragmented, we've needed to add metadata too >> + * many times. >> + */ >> + if (free_extents < max_rec_needed) { >> + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) >> + < ocfs2_extend_meta_needed(et->et_root_el))) >> + status = 1; >> + } >> + >> + return status; >> +} >> + >> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 >> static int ocfs2_dio_end_io_write(struct inode *inode, >> struct ocfs2_dio_write_ctxt *dwc, >> loff_t offset, >> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >> struct ocfs2_extent_tree et; >> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); >> struct ocfs2_inode_info *oi = OCFS2_I(inode); >> - struct ocfs2_unwritten_extent *ue = NULL; >> + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; >> struct buffer_head *di_bh = NULL; >> struct ocfs2_dinode *di; >> - struct ocfs2_alloc_context *data_ac = NULL; >> struct ocfs2_alloc_context *meta_ac = NULL; >> handle_t *handle = NULL; >> loff_t end = offset + bytes; >> - int ret = 0, credits = 0, locked = 0; >> + int ret = 0, credits = 0, locked = 0, restart = 0; >> >> ocfs2_init_dealloc_ctxt(&dealloc); >> >> @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >> > Should we restart here? OK. we should restart before initialized the inode extent tree. > >> ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); >> >> - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, >> - &data_ac, &meta_ac); >> +restart_all: >> + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, >> + NULL, &meta_ac); >> if (ret) { >> mlog_errno(ret); >> goto unlock; >> @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >> if (IS_ERR(handle)) { >> ret = PTR_ERR(handle); >> mlog_errno(ret); >> - goto unlock; >> + goto free_ac; > I don't think it is a necessary change here. we need to free the 'meta_ac' which allocated in ocfs2_lock_allocators() when we start transaction failed. Thanks, Alex > >> } >> ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, >> OCFS2_JOURNAL_ACCESS_WRITE); >> @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >> goto commit; >> } >> >> - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { >> + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { >> + ret = ocfs2_dio_should_restart(&et, meta_ac, >> + OCFS2_MAX_REC_NEEDED_SPLIT * 2); >> + if (ret < 0) { >> + mlog_errno(ret); >> + break; >> + } else if (ret == 1) { >> + restart = 1; >> + break; >> + } >> + >> ret = ocfs2_mark_extent_written(inode, &et, handle, >> ue->ue_cpos, 1, >> ue->ue_phys, >> @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >> mlog_errno(ret); >> break; >> } >> + >> + dwc->dw_zero_count--; >> + list_del_init(&ue->ue_node); >> + spin_lock(&oi->ip_lock); >> + list_del_init(&ue->ue_ip_node); >> + spin_unlock(&oi->ip_lock); >> + kfree(ue); >> } >> >> - if (end > i_size_read(inode)) { >> + if (!restart && end > i_size_read(inode)) { >> ret = ocfs2_set_inode_size(handle, inode, di_bh, end); >> if (ret < 0) >> mlog_errno(ret); >> } >> + >> commit: >> ocfs2_commit_trans(osb, handle); >> +free_ac: >> + if (meta_ac) { >> + ocfs2_free_alloc_context(meta_ac); >> + meta_ac = NULL; >> + } >> + if (restart) { >> + restart = 0; >> + goto restart_all; >> + } >> unlock: >> up_write(&oi->ip_alloc_sem); >> ocfs2_inode_unlock(inode, 1); >> brelse(di_bh); >> out: >> - if (data_ac) >> - ocfs2_free_alloc_context(data_ac); > Agree to cleanup the unused data_ac. > >> - if (meta_ac) >> - ocfs2_free_alloc_context(meta_ac);Just move the restart logic down is enough. > > Thanks, > Joseph > >> ocfs2_run_deallocs(osb, &dealloc); >> if (locked) >> inode_unlock(inode); >> > > . >
On 17/12/5 10:36, alex chen wrote: > Hi Joseph, > > Thanks for your reviews. > > On 2017/12/5 10:21, Joseph Qi wrote: >> >> >> On 17/12/5 09:41, alex chen wrote: >>> We need to check the free number of the records in each loop to mark >>> extent written, because the last extent block may be changed through >>> many times marking extent written and the 'num_free_extents' also be >>> changed. In the worst case, the 'num_free_extents' may become less >>> than the beginning of the loop. So we should not estimate the free >>> number of the records at the beginning of the loop to mark extent >>> written. >>> >>> Reported-by: John Lightsey <john@nixnuts.net> >>> Signed-off-by: Alex Chen <alex.chen@huawei.com> >>> Reviewed-by: Jun Piao <piaojun@huawei.com> >>> --- >>> fs/ocfs2/aops.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- >>> 1 file changed, 64 insertions(+), 12 deletions(-) >>> >>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c >>> index d151632..702aebc 100644 >>> --- a/fs/ocfs2/aops.c >>> +++ b/fs/ocfs2/aops.c >>> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, >>> return ret; >>> } >>> >>> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, >>> + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) >>> +{ >>> + int status = 0, free_extents; >>> + >>> + free_extents = ocfs2_num_free_extents(et); >>> + if (free_extents < 0) { >>> + status = free_extents; >>> + mlog_errno(status); >>> + return status; >>> + } >>> + >>> + /* >>> + * there are two cases which could cause us to EAGAIN in the >>> + * we-need-more-metadata case: >>> + * 1) we haven't reserved *any* >>> + * 2) we are so fragmented, we've needed to add metadata too >>> + * many times. >>> + */ >>> + if (free_extents < max_rec_needed) { >>> + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) >>> + < ocfs2_extend_meta_needed(et->et_root_el))) >>> + status = 1; >>> + } >>> + >>> + return status; >>> +} >>> + >>> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 >>> static int ocfs2_dio_end_io_write(struct inode *inode, >>> struct ocfs2_dio_write_ctxt *dwc, >>> loff_t offset, >>> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>> struct ocfs2_extent_tree et; >>> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); >>> struct ocfs2_inode_info *oi = OCFS2_I(inode); >>> - struct ocfs2_unwritten_extent *ue = NULL; >>> + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; >>> struct buffer_head *di_bh = NULL; >>> struct ocfs2_dinode *di; >>> - struct ocfs2_alloc_context *data_ac = NULL; >>> struct ocfs2_alloc_context *meta_ac = NULL; >>> handle_t *handle = NULL; >>> loff_t end = offset + bytes; >>> - int ret = 0, credits = 0, locked = 0; >>> + int ret = 0, credits = 0, locked = 0, restart = 0; >>> >>> ocfs2_init_dealloc_ctxt(&dealloc); >>> >>> @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>> >> Should we restart here? > > OK. we should restart before initialized the inode extent tree. > >> >>> ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); >>> >>> - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, >>> - &data_ac, &meta_ac); >>> +restart_all: >>> + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, >>> + NULL, &meta_ac); >>> if (ret) { >>> mlog_errno(ret); >>> goto unlock; >>> @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>> if (IS_ERR(handle)) { >>> ret = PTR_ERR(handle); >>> mlog_errno(ret); >>> - goto unlock; >>> + goto free_ac; >> I don't think it is a necessary change here. > > we need to free the 'meta_ac' which allocated in ocfs2_lock_allocators() when we > start transaction failed. > As I described in the last mail, just move the restart check after the original free meta_ac check. > Thanks, > Alex >> >>> } >>> ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, >>> OCFS2_JOURNAL_ACCESS_WRITE); >>> @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>> goto commit; >>> } >>> >>> - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { >>> + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { >>> + ret = ocfs2_dio_should_restart(&et, meta_ac, >>> + OCFS2_MAX_REC_NEEDED_SPLIT * 2); >>> + if (ret < 0) { >>> + mlog_errno(ret); >>> + break; >>> + } else if (ret == 1) { >>> + restart = 1; >>> + break; >>> + } >>> + >>> ret = ocfs2_mark_extent_written(inode, &et, handle, >>> ue->ue_cpos, 1, >>> ue->ue_phys, >>> @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>> mlog_errno(ret); >>> break; >>> } >>> + >>> + dwc->dw_zero_count--; >>> + list_del_init(&ue->ue_node); >>> + spin_lock(&oi->ip_lock); >>> + list_del_init(&ue->ue_ip_node); >>> + spin_unlock(&oi->ip_lock); >>> + kfree(ue); >>> } >>> >>> - if (end > i_size_read(inode)) { >>> + if (!restart && end > i_size_read(inode)) { >>> ret = ocfs2_set_inode_size(handle, inode, di_bh, end); >>> if (ret < 0) >>> mlog_errno(ret); >>> } >>> + >>> commit: >>> ocfs2_commit_trans(osb, handle); >>> +free_ac: >>> + if (meta_ac) { >>> + ocfs2_free_alloc_context(meta_ac); >>> + meta_ac = NULL; >>> + } >>> + if (restart) { >>> + restart = 0; >>> + goto restart_all; >>> + } >>> unlock: >>> up_write(&oi->ip_alloc_sem); >>> ocfs2_inode_unlock(inode, 1); >>> brelse(di_bh); >>> out: >>> - if (data_ac) >>> - ocfs2_free_alloc_context(data_ac); >> Agree to cleanup the unused data_ac. >> >>> - if (meta_ac) >>> - ocfs2_free_alloc_context(meta_ac); Just move the restart logic down is enough. >> >> Thanks, >> Joseph >> >>> ocfs2_run_deallocs(osb, &dealloc); >>> if (locked) >>> inode_unlock(inode); >>> >> >> . >> >
On 2017/12/5 10:45, Joseph Qi wrote: > > > On 17/12/5 10:36, alex chen wrote: >> Hi Joseph, >> >> Thanks for your reviews. >> >> On 2017/12/5 10:21, Joseph Qi wrote: >>> >>> >>> On 17/12/5 09:41, alex chen wrote: >>>> We need to check the free number of the records in each loop to mark >>>> extent written, because the last extent block may be changed through >>>> many times marking extent written and the 'num_free_extents' also be >>>> changed. In the worst case, the 'num_free_extents' may become less >>>> than the beginning of the loop. So we should not estimate the free >>>> number of the records at the beginning of the loop to mark extent >>>> written. >>>> >>>> Reported-by: John Lightsey <john@nixnuts.net> >>>> Signed-off-by: Alex Chen <alex.chen@huawei.com> >>>> Reviewed-by: Jun Piao <piaojun@huawei.com> >>>> --- >>>> fs/ocfs2/aops.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- >>>> 1 file changed, 64 insertions(+), 12 deletions(-) >>>> >>>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c >>>> index d151632..702aebc 100644 >>>> --- a/fs/ocfs2/aops.c >>>> +++ b/fs/ocfs2/aops.c >>>> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, >>>> return ret; >>>> } >>>> >>>> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, >>>> + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) >>>> +{ >>>> + int status = 0, free_extents; >>>> + >>>> + free_extents = ocfs2_num_free_extents(et); >>>> + if (free_extents < 0) { >>>> + status = free_extents; >>>> + mlog_errno(status); >>>> + return status; >>>> + } >>>> + >>>> + /* >>>> + * there are two cases which could cause us to EAGAIN in the >>>> + * we-need-more-metadata case: >>>> + * 1) we haven't reserved *any* >>>> + * 2) we are so fragmented, we've needed to add metadata too >>>> + * many times. >>>> + */ >>>> + if (free_extents < max_rec_needed) { >>>> + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) >>>> + < ocfs2_extend_meta_needed(et->et_root_el))) >>>> + status = 1; >>>> + } >>>> + >>>> + return status; >>>> +} >>>> + >>>> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 >>>> static int ocfs2_dio_end_io_write(struct inode *inode, >>>> struct ocfs2_dio_write_ctxt *dwc, >>>> loff_t offset, >>>> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>> struct ocfs2_extent_tree et; >>>> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); >>>> struct ocfs2_inode_info *oi = OCFS2_I(inode); >>>> - struct ocfs2_unwritten_extent *ue = NULL; >>>> + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; >>>> struct buffer_head *di_bh = NULL; >>>> struct ocfs2_dinode *di; >>>> - struct ocfs2_alloc_context *data_ac = NULL; >>>> struct ocfs2_alloc_context *meta_ac = NULL; >>>> handle_t *handle = NULL; >>>> loff_t end = offset + bytes; >>>> - int ret = 0, credits = 0, locked = 0; >>>> + int ret = 0, credits = 0, locked = 0, restart = 0; >>>> >>>> ocfs2_init_dealloc_ctxt(&dealloc); >>>> >>>> @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>> >>> Should we restart here? >> >> OK. we should restart before initialized the inode extent tree. >> >>> >>>> ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); >>>> >>>> - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, >>>> - &data_ac, &meta_ac); >>>> +restart_all: >>>> + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, >>>> + NULL, &meta_ac); >>>> if (ret) { >>>> mlog_errno(ret); >>>> goto unlock; >>>> @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>> if (IS_ERR(handle)) { >>>> ret = PTR_ERR(handle); >>>> mlog_errno(ret); >>>> - goto unlock; >>>> + goto free_ac; >>> I don't think it is a necessary change here. >> >> we need to free the 'meta_ac' which allocated in ocfs2_lock_allocators() when we >> start transaction failed. >> > As I described in the last mail, just move the restart check after the > original free meta_ac check. > >> Thanks, >> Alex >>> >>>> } >>>> ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, >>>> OCFS2_JOURNAL_ACCESS_WRITE); >>>> @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>> goto commit; >>>> } >>>> >>>> - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { >>>> + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { >>>> + ret = ocfs2_dio_should_restart(&et, meta_ac, >>>> + OCFS2_MAX_REC_NEEDED_SPLIT * 2); >>>> + if (ret < 0) { >>>> + mlog_errno(ret); >>>> + break; >>>> + } else if (ret == 1) { >>>> + restart = 1; >>>> + break; >>>> + } >>>> + >>>> ret = ocfs2_mark_extent_written(inode, &et, handle, >>>> ue->ue_cpos, 1, >>>> ue->ue_phys, >>>> @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>> mlog_errno(ret); >>>> break; >>>> } >>>> + >>>> + dwc->dw_zero_count--; >>>> + list_del_init(&ue->ue_node); >>>> + spin_lock(&oi->ip_lock); >>>> + list_del_init(&ue->ue_ip_node); >>>> + spin_unlock(&oi->ip_lock); >>>> + kfree(ue); >>>> } >>>> >>>> - if (end > i_size_read(inode)) { >>>> + if (!restart && end > i_size_read(inode)) { >>>> ret = ocfs2_set_inode_size(handle, inode, di_bh, end); >>>> if (ret < 0) >>>> mlog_errno(ret); >>>> } >>>> + >>>> commit: >>>> ocfs2_commit_trans(osb, handle); >>>> +free_ac: >>>> + if (meta_ac) { >>>> + ocfs2_free_alloc_context(meta_ac); >>>> + meta_ac = NULL; >>>> + } >>>> + if (restart) { >>>> + restart = 0; >>>> + goto restart_all; >>>> + } >>>> unlock: >>>> up_write(&oi->ip_alloc_sem); >>>> ocfs2_inode_unlock(inode, 1); >>>> brelse(di_bh); >>>> out: >>>> - if (data_ac) >>>> - ocfs2_free_alloc_context(data_ac); >>> Agree to cleanup the unused data_ac. >>> >>>> - if (meta_ac) >>>> - ocfs2_free_alloc_context(meta_ac); > Just move the restart logic down is enough. If we check the restart here, we will unlock the 'ip_alloc_sem' and release the inode buffer, which we don't need to do. Thanks, Alex >>> >>> Thanks, >>> Joseph >>> >>>> ocfs2_run_deallocs(osb, &dealloc); >>>> if (locked) >>>> inode_unlock(inode); >>>> >>> >>> . >>> >> > > . >
On 17/12/5 11:09, alex chen wrote: > > > On 2017/12/5 10:45, Joseph Qi wrote: >> >> >> On 17/12/5 10:36, alex chen wrote: >>> Hi Joseph, >>> >>> Thanks for your reviews. >>> >>> On 2017/12/5 10:21, Joseph Qi wrote: >>>> >>>> >>>> On 17/12/5 09:41, alex chen wrote: >>>>> We need to check the free number of the records in each loop to mark >>>>> extent written, because the last extent block may be changed through >>>>> many times marking extent written and the 'num_free_extents' also be >>>>> changed. In the worst case, the 'num_free_extents' may become less >>>>> than the beginning of the loop. So we should not estimate the free >>>>> number of the records at the beginning of the loop to mark extent >>>>> written. >>>>> >>>>> Reported-by: John Lightsey <john@nixnuts.net> >>>>> Signed-off-by: Alex Chen <alex.chen@huawei.com> >>>>> Reviewed-by: Jun Piao <piaojun@huawei.com> >>>>> --- >>>>> fs/ocfs2/aops.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- >>>>> 1 file changed, 64 insertions(+), 12 deletions(-) >>>>> >>>>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c >>>>> index d151632..702aebc 100644 >>>>> --- a/fs/ocfs2/aops.c >>>>> +++ b/fs/ocfs2/aops.c >>>>> @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, >>>>> return ret; >>>>> } >>>>> >>>>> +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, >>>>> + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) >>>>> +{ >>>>> + int status = 0, free_extents; >>>>> + >>>>> + free_extents = ocfs2_num_free_extents(et); >>>>> + if (free_extents < 0) { >>>>> + status = free_extents; >>>>> + mlog_errno(status); >>>>> + return status; >>>>> + } >>>>> + >>>>> + /* >>>>> + * there are two cases which could cause us to EAGAIN in the >>>>> + * we-need-more-metadata case: >>>>> + * 1) we haven't reserved *any* >>>>> + * 2) we are so fragmented, we've needed to add metadata too >>>>> + * many times. >>>>> + */ >>>>> + if (free_extents < max_rec_needed) { >>>>> + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) >>>>> + < ocfs2_extend_meta_needed(et->et_root_el))) >>>>> + status = 1; >>>>> + } >>>>> + >>>>> + return status; >>>>> +} >>>>> + >>>>> +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 >>>>> static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> struct ocfs2_dio_write_ctxt *dwc, >>>>> loff_t offset, >>>>> @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> struct ocfs2_extent_tree et; >>>>> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); >>>>> struct ocfs2_inode_info *oi = OCFS2_I(inode); >>>>> - struct ocfs2_unwritten_extent *ue = NULL; >>>>> + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; >>>>> struct buffer_head *di_bh = NULL; >>>>> struct ocfs2_dinode *di; >>>>> - struct ocfs2_alloc_context *data_ac = NULL; >>>>> struct ocfs2_alloc_context *meta_ac = NULL; >>>>> handle_t *handle = NULL; >>>>> loff_t end = offset + bytes; >>>>> - int ret = 0, credits = 0, locked = 0; >>>>> + int ret = 0, credits = 0, locked = 0, restart = 0; >>>>> >>>>> ocfs2_init_dealloc_ctxt(&dealloc); >>>>> >>>>> @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> >>>> Should we restart here? >>> >>> OK. we should restart before initialized the inode extent tree. >>> >>>> >>>>> ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); >>>>> >>>>> - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, >>>>> - &data_ac, &meta_ac); >>>>> +restart_all: >>>>> + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, >>>>> + NULL, &meta_ac); >>>>> if (ret) { >>>>> mlog_errno(ret); >>>>> goto unlock; >>>>> @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> if (IS_ERR(handle)) { >>>>> ret = PTR_ERR(handle); >>>>> mlog_errno(ret); >>>>> - goto unlock; >>>>> + goto free_ac; >>>> I don't think it is a necessary change here. >>> >>> we need to free the 'meta_ac' which allocated in ocfs2_lock_allocators() when we >>> start transaction failed. >>> >> As I described in the last mail, just move the restart check after the >> original free meta_ac check. >> >>> Thanks, >>> Alex >>>> >>>>> } >>>>> ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, >>>>> OCFS2_JOURNAL_ACCESS_WRITE); >>>>> @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> goto commit; >>>>> } >>>>> >>>>> - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { >>>>> + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { >>>>> + ret = ocfs2_dio_should_restart(&et, meta_ac, >>>>> + OCFS2_MAX_REC_NEEDED_SPLIT * 2); >>>>> + if (ret < 0) { >>>>> + mlog_errno(ret); >>>>> + break; >>>>> + } else if (ret == 1) { >>>>> + restart = 1; >>>>> + break; >>>>> + } >>>>> + >>>>> ret = ocfs2_mark_extent_written(inode, &et, handle, >>>>> ue->ue_cpos, 1, >>>>> ue->ue_phys, >>>>> @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, >>>>> mlog_errno(ret); >>>>> break; >>>>> } >>>>> + >>>>> + dwc->dw_zero_count--; >>>>> + list_del_init(&ue->ue_node); >>>>> + spin_lock(&oi->ip_lock); >>>>> + list_del_init(&ue->ue_ip_node); >>>>> + spin_unlock(&oi->ip_lock); >>>>> + kfree(ue); >>>>> } >>>>> >>>>> - if (end > i_size_read(inode)) { >>>>> + if (!restart && end > i_size_read(inode)) { >>>>> ret = ocfs2_set_inode_size(handle, inode, di_bh, end); >>>>> if (ret < 0) >>>>> mlog_errno(ret); >>>>> } >>>>> + >>>>> commit: >>>>> ocfs2_commit_trans(osb, handle); >>>>> +free_ac: >>>>> + if (meta_ac) { >>>>> + ocfs2_free_alloc_context(meta_ac); >>>>> + meta_ac = NULL; >>>>> + } >>>>> + if (restart) { >>>>> + restart = 0; >>>>> + goto restart_all; >>>>> + } >>>>> unlock: >>>>> up_write(&oi->ip_alloc_sem); >>>>> ocfs2_inode_unlock(inode, 1); >>>>> brelse(di_bh); >>>>> out: >>>>> - if (data_ac) >>>>> - ocfs2_free_alloc_context(data_ac); >>>> Agree to cleanup the unused data_ac. >>>> >>>>> - if (meta_ac) >>>>> - ocfs2_free_alloc_context(meta_ac); >> Just move the restart logic down is enough. > > If we check the restart here, we will unlock the 'ip_alloc_sem' and release the > inode buffer, which we don't need to do. > Oh, my mistake. You are right, we should restart under inode lock as well as ip_alloc_sem. > Thanks, > Alex >>>> >>>> Thanks, >>>> Joseph >>>> >>>>> ocfs2_run_deallocs(osb, &dealloc); >>>>> if (locked) >>>>> inode_unlock(inode); >>>>> >>>> >>>> . >>>> >>> >> >> . >> >
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d151632..702aebc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2272,6 +2272,35 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, return ret; } +static int ocfs2_dio_should_restart(struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, int max_rec_needed) +{ + int status = 0, free_extents; + + free_extents = ocfs2_num_free_extents(et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + return status; + } + + /* + * there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. + */ + if (free_extents < max_rec_needed) { + if (!meta_ac || (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) + status = 1; + } + + return status; +} + +#define OCFS2_MAX_REC_NEEDED_SPLIT 2 static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_dio_write_ctxt *dwc, loff_t offset, @@ -2281,14 +2310,13 @@ static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_extent_tree et; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_unwritten_extent *ue = NULL; + struct ocfs2_unwritten_extent *ue = NULL, *tmp_ue; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; loff_t end = offset + bytes; - int ret = 0, credits = 0, locked = 0; + int ret = 0, credits = 0, locked = 0, restart = 0; ocfs2_init_dealloc_ctxt(&dealloc); @@ -2330,8 +2358,9 @@ static int ocfs2_dio_end_io_write(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); - ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, - &data_ac, &meta_ac); +restart_all: + ret = ocfs2_lock_allocators(inode, &et, 0, OCFS2_MAX_REC_NEEDED_SPLIT, + NULL, &meta_ac); if (ret) { mlog_errno(ret); goto unlock; @@ -2343,7 +2372,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto unlock; + goto free_ac; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2352,7 +2381,17 @@ static int ocfs2_dio_end_io_write(struct inode *inode, goto commit; } - list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + list_for_each_entry_safe(ue, tmp_ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_dio_should_restart(&et, meta_ac, + OCFS2_MAX_REC_NEEDED_SPLIT * 2); + if (ret < 0) { + mlog_errno(ret); + break; + } else if (ret == 1) { + restart = 1; + break; + } + ret = ocfs2_mark_extent_written(inode, &et, handle, ue->ue_cpos, 1, ue->ue_phys, @@ -2361,24 +2400,37 @@ static int ocfs2_dio_end_io_write(struct inode *inode, mlog_errno(ret); break; } + + dwc->dw_zero_count--; + list_del_init(&ue->ue_node); + spin_lock(&oi->ip_lock); + list_del_init(&ue->ue_ip_node); + spin_unlock(&oi->ip_lock); + kfree(ue); } - if (end > i_size_read(inode)) { + if (!restart && end > i_size_read(inode)) { ret = ocfs2_set_inode_size(handle, inode, di_bh, end); if (ret < 0) mlog_errno(ret); } + commit: ocfs2_commit_trans(osb, handle); +free_ac: + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if (restart) { + restart = 0; + goto restart_all; + } unlock: up_write(&oi->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); brelse(di_bh); out: - if (data_ac) - ocfs2_free_alloc_context(data_ac); - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); if (locked) inode_unlock(inode);