diff mbox

[f2fs-dev,RFC,v4] f2fs: flush cp pack except cp pack 2 page at first

Message ID 9047C53C18267742AB12E43B65C7F9F70BCE500E@dggemi505-mbx.china.huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gao Xiang Jan. 31, 2018, 6:39 a.m. UTC
Previously, we attempt to flush the whole cp pack in a single bio,
however, when suddenly powering off at this time, we could get into
an extreme scenario that cp pack 1 page and cp pack 2 page are updated
and latest, but payload or current summaries are still partially
outdated. (see reliable write in the UFS specification)

This patch submits the whole cp pack except cp pack 2 page at first,
and then writes the cp pack 2 page with an extra independent
bio with pre-io barrier.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
---
Change log from v3:
  - further review comments are applied from Jaegeuk and Chao
  - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
  - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
Change log from v2:
  - Apply the review comments from Chao
Change log from v1:
  - Apply the review comments from Chao
  - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
     Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
     After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
 fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 21 deletions(-)

Comments

Chao Yu Jan. 31, 2018, 10:21 a.m. UTC | #1
On 2018/1/31 14:39, Gaoxiang (OS) wrote:
> Previously, we attempt to flush the whole cp pack in a single bio,
> however, when suddenly powering off at this time, we could get into
> an extreme scenario that cp pack 1 page and cp pack 2 page are updated
> and latest, but payload or current summaries are still partially
> outdated. (see reliable write in the UFS specification)
> 
> This patch submits the whole cp pack except cp pack 2 page at first,
> and then writes the cp pack 2 page with an extra independent
> bio with pre-io barrier.
> 
> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> Reviewed-by: Chao Yu <yuchao0@huawei.com>
> ---
> Change log from v3:
>   - further review comments are applied from Jaegeuk and Chao
>   - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
>   - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
> Change log from v2:
>   - Apply the review comments from Chao
> Change log from v1:
>   - Apply the review comments from Chao
>   - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
>      Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
>      After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
>  fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 46 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 14d2fed..916dc72 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -1158,6 +1158,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	spin_unlock_irqrestore(&sbi->cp_lock, flags);
>  }
>  
> +static void commit_checkpoint(struct f2fs_sb_info *sbi,
> +	void *src, block_t blk_addr)
> +{
> +	struct writeback_control wbc = {
> +		.for_reclaim = 0,
> +	};
> +
> +	/*
> +	 * pagevec_lookup_tag and lock_page again will take
> +	 * some extra time. Therefore, update_meta_pages and
> +	 * sync_meta_pages are combined in this function.
> +	 */
> +	struct page *page = grab_meta_page(sbi, blk_addr);
> +	int err;
> +
> +	memcpy(page_address(page), src, PAGE_SIZE);
> +	set_page_dirty(page);
> +
> +	f2fs_wait_on_page_writeback(page, META, true);
> +	f2fs_bug_on(sbi, PageWriteback(page));
> +	if (unlikely(!clear_page_dirty_for_io(page)))
> +		f2fs_bug_on(sbi, 1);
> +
> +	/* writeout cp pack 2 page */
> +	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
> +	f2fs_bug_on(sbi, err);
> +
> +	f2fs_put_page(page, 0);
> +
> +	/* submit checkpoint (with barrier if NOBARRIER is not set) */
> +	f2fs_submit_merged_write(sbi, META_FLUSH);
> +}
> +
>  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  {
>  	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
> @@ -1260,16 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		}
>  	}
>  
> -	/* need to wait for end_io results */
> -	wait_on_all_pages_writeback(sbi);
> -	if (unlikely(f2fs_cp_error(sbi)))
> -		return -EIO;
> -
> -	/* flush all device cache */
> -	err = f2fs_flush_device_cache(sbi);
> -	if (err)
> -		return err;
> -
>  	/* write out checkpoint buffer at block 0 */
>  	update_meta_page(sbi, ckpt, start_blk++);
>  
> @@ -1297,15 +1320,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  		start_blk += NR_CURSEG_NODE_TYPE;
>  	}
>  
> -	/* writeout checkpoint block */
> -	update_meta_page(sbi, ckpt, start_blk);
> -
> -	/* wait for previous submitted node/meta pages writeback */
> -	wait_on_all_pages_writeback(sbi);
> -
> -	if (unlikely(f2fs_cp_error(sbi)))
> -		return -EIO;
> -
>  	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
>  	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
>  
> @@ -1313,12 +1327,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	sbi->last_valid_block_count = sbi->total_valid_block_count;
>  	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
>  
> -	/* Here, we only have one bio having CP pack */
> -	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
> +	/* Here, we have one bio having CP pack except cp pack 2 page */
> +	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
> +
> +	/* flush all device cache */
> +	err = f2fs_flush_device_cache(sbi);
> +	if (err)
> +		return err;
>  
>  	/* wait for previous submitted meta pages writeback */
>  	wait_on_all_pages_writeback(sbi);

Move f2fs_flush_device_cache here? since meta area can cross the multiple
devices, we should make sure all metadata were in device cache at least, and
then trigger the flush.

>  
> +	if (unlikely(f2fs_cp_error(sbi)))
> +		return -EIO;
> +
> +	/* barrier and flush checkpoint cp pack 2 page if it can */
> +	commit_checkpoint(sbi, ckpt, start_blk);

Jaegeuk, are we really allow to make critical do_checkpoint which is on path of
fsync()/sync() be asynchronous?

Thanks,

> +
>  	release_ino_entry(sbi, false);
>  
>  	if (unlikely(f2fs_cp_error(sbi)))
>
Jaegeuk Kim Jan. 31, 2018, 10:28 p.m. UTC | #2
On 01/31, Chao Yu wrote:
> On 2018/1/31 14:39, Gaoxiang (OS) wrote:
> > Previously, we attempt to flush the whole cp pack in a single bio,
> > however, when suddenly powering off at this time, we could get into
> > an extreme scenario that cp pack 1 page and cp pack 2 page are updated
> > and latest, but payload or current summaries are still partially
> > outdated. (see reliable write in the UFS specification)
> > 
> > This patch submits the whole cp pack except cp pack 2 page at first,
> > and then writes the cp pack 2 page with an extra independent
> > bio with pre-io barrier.
> > 
> > Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> > Reviewed-by: Chao Yu <yuchao0@huawei.com>
> > ---
> > Change log from v3:
> >   - further review comments are applied from Jaegeuk and Chao
> >   - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
> >   - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
> > Change log from v2:
> >   - Apply the review comments from Chao
> > Change log from v1:
> >   - Apply the review comments from Chao
> >   - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
> >      Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
> >      After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
> >  fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
> >  1 file changed, 46 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> > index 14d2fed..916dc72 100644
> > --- a/fs/f2fs/checkpoint.c
> > +++ b/fs/f2fs/checkpoint.c
> > @@ -1158,6 +1158,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	spin_unlock_irqrestore(&sbi->cp_lock, flags);
> >  }
> >  
> > +static void commit_checkpoint(struct f2fs_sb_info *sbi,
> > +	void *src, block_t blk_addr)
> > +{
> > +	struct writeback_control wbc = {
> > +		.for_reclaim = 0,
> > +	};
> > +
> > +	/*
> > +	 * pagevec_lookup_tag and lock_page again will take
> > +	 * some extra time. Therefore, update_meta_pages and
> > +	 * sync_meta_pages are combined in this function.
> > +	 */
> > +	struct page *page = grab_meta_page(sbi, blk_addr);
> > +	int err;
> > +
> > +	memcpy(page_address(page), src, PAGE_SIZE);
> > +	set_page_dirty(page);
> > +
> > +	f2fs_wait_on_page_writeback(page, META, true);
> > +	f2fs_bug_on(sbi, PageWriteback(page));
> > +	if (unlikely(!clear_page_dirty_for_io(page)))
> > +		f2fs_bug_on(sbi, 1);
> > +
> > +	/* writeout cp pack 2 page */
> > +	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
> > +	f2fs_bug_on(sbi, err);
> > +
> > +	f2fs_put_page(page, 0);
> > +
> > +	/* submit checkpoint (with barrier if NOBARRIER is not set) */
> > +	f2fs_submit_merged_write(sbi, META_FLUSH);
> > +}
> > +
> >  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  {
> >  	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
> > @@ -1260,16 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  		}
> >  	}
> >  
> > -	/* need to wait for end_io results */
> > -	wait_on_all_pages_writeback(sbi);
> > -	if (unlikely(f2fs_cp_error(sbi)))
> > -		return -EIO;
> > -
> > -	/* flush all device cache */
> > -	err = f2fs_flush_device_cache(sbi);
> > -	if (err)
> > -		return err;
> > -
> >  	/* write out checkpoint buffer at block 0 */
> >  	update_meta_page(sbi, ckpt, start_blk++);
> >  
> > @@ -1297,15 +1320,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  		start_blk += NR_CURSEG_NODE_TYPE;
> >  	}
> >  
> > -	/* writeout checkpoint block */
> > -	update_meta_page(sbi, ckpt, start_blk);
> > -
> > -	/* wait for previous submitted node/meta pages writeback */
> > -	wait_on_all_pages_writeback(sbi);
> > -
> > -	if (unlikely(f2fs_cp_error(sbi)))
> > -		return -EIO;
> > -
> >  	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
> >  	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);

 - remove

> >  
> > @@ -1313,12 +1327,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	sbi->last_valid_block_count = sbi->total_valid_block_count;
> >  	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
> >  
> > -	/* Here, we only have one bio having CP pack */
> > -	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
> > +	/* Here, we have one bio having CP pack except cp pack 2 page */
> > +	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
> > +
> > +	/* flush all device cache */
> > +	err = f2fs_flush_device_cache(sbi);
> > +	if (err)
> > +		return err;
> >  
> >  	/* wait for previous submitted meta pages writeback */
> >  	wait_on_all_pages_writeback(sbi);
> 
> Move f2fs_flush_device_cache here? since meta area can cross the multiple
> devices, we should make sure all metadata were in device cache at least, and
> then trigger the flush.

Agreed, and need to flush, only if we have multiple devices.

> 
> >  
> > +	if (unlikely(f2fs_cp_error(sbi)))
> > +		return -EIO;
> > +
> > +	/* barrier and flush checkpoint cp pack 2 page if it can */
> > +	commit_checkpoint(sbi, ckpt, start_blk);
> 
> Jaegeuk, are we really allow to make critical do_checkpoint which is on path of
> fsync()/sync() be asynchronous?

Yeah, so we need to wait end_io on synchronous paths like f2fs_sync_fs(1).

> 
> Thanks,
> 
> > +
> >  	release_ino_entry(sbi, false);
> >  
> >  	if (unlikely(f2fs_cp_error(sbi)))
> >
Gao Xiang Jan. 31, 2018, 11:42 p.m. UTC | #3
Hi Jaegeuk and Chao,

On 2018/2/1 6:28, Jaegeuk Kim wrote:
> On 01/31, Chao Yu wrote:
>> On 2018/1/31 14:39, Gaoxiang (OS) wrote:
>>> Previously, we attempt to flush the whole cp pack in a single bio,
>>> however, when suddenly powering off at this time, we could get into
>>> an extreme scenario that cp pack 1 page and cp pack 2 page are updated
>>> and latest, but payload or current summaries are still partially
>>> outdated. (see reliable write in the UFS specification)
>>>
>>> This patch submits the whole cp pack except cp pack 2 page at first,
>>> and then writes the cp pack 2 page with an extra independent
>>> bio with pre-io barrier.
>>>
>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
>>> Reviewed-by: Chao Yu <yuchao0@huawei.com>
>>> ---
>>> Change log from v3:
>>>    - further review comments are applied from Jaegeuk and Chao
>>>    - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
>>>    - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
>>> Change log from v2:
>>>    - Apply the review comments from Chao
>>> Change log from v1:
>>>    - Apply the review comments from Chao
>>>    - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
>>>       Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
>>>       After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
>>>   fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
>>>   1 file changed, 46 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>> index 14d2fed..916dc72 100644
>>> --- a/fs/f2fs/checkpoint.c
>>> +++ b/fs/f2fs/checkpoint.c
>>> @@ -1158,6 +1158,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   	spin_unlock_irqrestore(&sbi->cp_lock, flags);
>>>   }
>>>   
>>> +static void commit_checkpoint(struct f2fs_sb_info *sbi,
>>> +	void *src, block_t blk_addr)
>>> +{
>>> +	struct writeback_control wbc = {
>>> +		.for_reclaim = 0,
>>> +	};
>>> +
>>> +	/*
>>> +	 * pagevec_lookup_tag and lock_page again will take
>>> +	 * some extra time. Therefore, update_meta_pages and
>>> +	 * sync_meta_pages are combined in this function.
>>> +	 */
>>> +	struct page *page = grab_meta_page(sbi, blk_addr);
>>> +	int err;
>>> +
>>> +	memcpy(page_address(page), src, PAGE_SIZE);
>>> +	set_page_dirty(page);
>>> +
>>> +	f2fs_wait_on_page_writeback(page, META, true);
>>> +	f2fs_bug_on(sbi, PageWriteback(page));
>>> +	if (unlikely(!clear_page_dirty_for_io(page)))
>>> +		f2fs_bug_on(sbi, 1);
>>> +
>>> +	/* writeout cp pack 2 page */
>>> +	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
>>> +	f2fs_bug_on(sbi, err);
>>> +
>>> +	f2fs_put_page(page, 0);
>>> +
>>> +	/* submit checkpoint (with barrier if NOBARRIER is not set) */
>>> +	f2fs_submit_merged_write(sbi, META_FLUSH);
>>> +}
>>> +
>>>   static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   {
>>>   	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
>>> @@ -1260,16 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   		}
>>>   	}
>>>   
>>> -	/* need to wait for end_io results */
>>> -	wait_on_all_pages_writeback(sbi);
>>> -	if (unlikely(f2fs_cp_error(sbi)))
>>> -		return -EIO;
>>> -
>>> -	/* flush all device cache */
>>> -	err = f2fs_flush_device_cache(sbi);
>>> -	if (err)
>>> -		return err;
>>> -
>>>   	/* write out checkpoint buffer at block 0 */
>>>   	update_meta_page(sbi, ckpt, start_blk++);
>>>   
>>> @@ -1297,15 +1320,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   		start_blk += NR_CURSEG_NODE_TYPE;
>>>   	}
>>>   
>>> -	/* writeout checkpoint block */
>>> -	update_meta_page(sbi, ckpt, start_blk);
>>> -
>>> -	/* wait for previous submitted node/meta pages writeback */
>>> -	wait_on_all_pages_writeback(sbi);
>>> -
>>> -	if (unlikely(f2fs_cp_error(sbi)))
>>> -		return -EIO;
>>> -
>>>   	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
>>>   	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
> 
>   - remove
> 
You mean remove
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
and
filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
or remove
filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);

Actually, I have no idea why do these two filemap_fdatawait_range stay 
here and what are these used and waited for in this place,
however I found it was modified recently and for many times, I guess 
they have some use.

>>>   
>>> @@ -1313,12 +1327,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>   	sbi->last_valid_block_count = sbi->total_valid_block_count;
>>>   	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
>>>   
>>> -	/* Here, we only have one bio having CP pack */
>>> -	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
>>> +	/* Here, we have one bio having CP pack except cp pack 2 page */
>>> +	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
>>> +
>>> +	/* flush all device cache */
>>> +	err = f2fs_flush_device_cache(sbi);
>>> +	if (err)
>>> +		return err;
>>>   
>>>   	/* wait for previous submitted meta pages writeback */
>>>   	wait_on_all_pages_writeback(sbi);
>>
>> Move f2fs_flush_device_cache here? since meta area can cross the multiple
>> devices, we should make sure all metadata were in device cache at least, and
>> then trigger the flush.
> 
> Agreed, and need to flush, only if we have multiple devices.
> 

OK, and it seems that
- current f2fs_flush_device_cache
    - for (i = 1; i < sbi->s_ndevs; i++) {
it would skip if it only has only one devices?

>>
>>>   
>>> +	if (unlikely(f2fs_cp_error(sbi)))
>>> +		return -EIO;
>>> +
>>> +	/* barrier and flush checkpoint cp pack 2 page if it can */
>>> +	commit_checkpoint(sbi, ckpt, start_blk);
>>
>> Jaegeuk, are we really allow to make critical do_checkpoint which is on path of
>> fsync()/sync() be asynchronous?
> 
> Yeah, so we need to wait end_io on synchronous paths like f2fs_sync_fs(1).
>

How about adding a CP_SYNC (or CP_ASYNC) control flag in cp_control.reason?
and which paths could be synchronous paths:
     sync, umount, recovery - synchronous, right?

Thanks,

>>
>> Thanks,
>>
>>> +
>>>   	release_ino_entry(sbi, false);
>>>   
>>>   	if (unlikely(f2fs_cp_error(sbi)))
>>>
Chao Yu Feb. 1, 2018, 1:56 p.m. UTC | #4
On 2018/2/1 6:28, Jaegeuk Kim wrote:
> On 01/31, Chao Yu wrote:
>> On 2018/1/31 14:39, Gaoxiang (OS) wrote:
>>> Previously, we attempt to flush the whole cp pack in a single bio,
>>> however, when suddenly powering off at this time, we could get into
>>> an extreme scenario that cp pack 1 page and cp pack 2 page are updated
>>> and latest, but payload or current summaries are still partially
>>> outdated. (see reliable write in the UFS specification)
>>>
>>> This patch submits the whole cp pack except cp pack 2 page at first,
>>> and then writes the cp pack 2 page with an extra independent
>>> bio with pre-io barrier.
>>>
>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
>>> Reviewed-by: Chao Yu <yuchao0@huawei.com>
>>> ---
>>> Change log from v3:
>>>   - further review comments are applied from Jaegeuk and Chao
>>>   - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
>>>   - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
>>> Change log from v2:
>>>   - Apply the review comments from Chao
>>> Change log from v1:
>>>   - Apply the review comments from Chao
>>>   - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
>>>      Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
>>>      After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
>>>  fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
>>>  1 file changed, 46 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>> index 14d2fed..916dc72 100644
>>> --- a/fs/f2fs/checkpoint.c
>>> +++ b/fs/f2fs/checkpoint.c
>>> @@ -1158,6 +1158,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  	spin_unlock_irqrestore(&sbi->cp_lock, flags);
>>>  }
>>>  
>>> +static void commit_checkpoint(struct f2fs_sb_info *sbi,
>>> +	void *src, block_t blk_addr)
>>> +{
>>> +	struct writeback_control wbc = {
>>> +		.for_reclaim = 0,
>>> +	};
>>> +
>>> +	/*
>>> +	 * pagevec_lookup_tag and lock_page again will take
>>> +	 * some extra time. Therefore, update_meta_pages and
>>> +	 * sync_meta_pages are combined in this function.
>>> +	 */
>>> +	struct page *page = grab_meta_page(sbi, blk_addr);
>>> +	int err;
>>> +
>>> +	memcpy(page_address(page), src, PAGE_SIZE);
>>> +	set_page_dirty(page);
>>> +
>>> +	f2fs_wait_on_page_writeback(page, META, true);
>>> +	f2fs_bug_on(sbi, PageWriteback(page));
>>> +	if (unlikely(!clear_page_dirty_for_io(page)))
>>> +		f2fs_bug_on(sbi, 1);
>>> +
>>> +	/* writeout cp pack 2 page */
>>> +	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
>>> +	f2fs_bug_on(sbi, err);
>>> +
>>> +	f2fs_put_page(page, 0);
>>> +
>>> +	/* submit checkpoint (with barrier if NOBARRIER is not set) */
>>> +	f2fs_submit_merged_write(sbi, META_FLUSH);
>>> +}
>>> +
>>>  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  {
>>>  	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
>>> @@ -1260,16 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  		}
>>>  	}
>>>  
>>> -	/* need to wait for end_io results */
>>> -	wait_on_all_pages_writeback(sbi);
>>> -	if (unlikely(f2fs_cp_error(sbi)))
>>> -		return -EIO;
>>> -
>>> -	/* flush all device cache */
>>> -	err = f2fs_flush_device_cache(sbi);
>>> -	if (err)
>>> -		return err;
>>> -
>>>  	/* write out checkpoint buffer at block 0 */
>>>  	update_meta_page(sbi, ckpt, start_blk++);
>>>  
>>> @@ -1297,15 +1320,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  		start_blk += NR_CURSEG_NODE_TYPE;
>>>  	}
>>>  
>>> -	/* writeout checkpoint block */
>>> -	update_meta_page(sbi, ckpt, start_blk);
>>> -
>>> -	/* wait for previous submitted node/meta pages writeback */
>>> -	wait_on_all_pages_writeback(sbi);
>>> -
>>> -	if (unlikely(f2fs_cp_error(sbi)))
>>> -		return -EIO;
>>> -
>>>  	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
>>>  	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
> 
>  - remove

Agreed.

> 
>>>  
>>> @@ -1313,12 +1327,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  	sbi->last_valid_block_count = sbi->total_valid_block_count;
>>>  	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
>>>  
>>> -	/* Here, we only have one bio having CP pack */
>>> -	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
>>> +	/* Here, we have one bio having CP pack except cp pack 2 page */
>>> +	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
>>> +
>>> +	/* flush all device cache */
>>> +	err = f2fs_flush_device_cache(sbi);
>>> +	if (err)
>>> +		return err;
>>>  
>>>  	/* wait for previous submitted meta pages writeback */
>>>  	wait_on_all_pages_writeback(sbi);
>>
>> Move f2fs_flush_device_cache here? since meta area can cross the multiple
>> devices, we should make sure all metadata were in device cache at least, and
>> then trigger the flush.
> 
> Agreed, and need to flush, only if we have multiple devices.

f2fs_flush_device_cache is designed as only flushing devices except the first
one when f2fs enables multiple device. Calling it directly will be OK. :)

> 
>>
>>>  
>>> +	if (unlikely(f2fs_cp_error(sbi)))
>>> +		return -EIO;
>>> +
>>> +	/* barrier and flush checkpoint cp pack 2 page if it can */
>>> +	commit_checkpoint(sbi, ckpt, start_blk);
>>
>> Jaegeuk, are we really allow to make critical do_checkpoint which is on path of
>> fsync()/sync() be asynchronous?
> 
> Yeah, so we need to wait end_io on synchronous paths like f2fs_sync_fs(1).

I think we should consider the case Xiang mentioned:

1. write async checkpoint #1 from gc;
2. write sync checkpoint #2 from sync_fs, end_io finished w/ error; -> cp #2 becomes corrupted
3. checkpoint #1's end_io finished w/ error; -> cp #1 becomes corrupted

Then, entire filesystem becomes corrupted now.

Thanks,

> 
>>
>> Thanks,
>>
>>> +
>>>  	release_ino_entry(sbi, false);
>>>  
>>>  	if (unlikely(f2fs_cp_error(sbi)))
>>>
Jaegeuk Kim Feb. 10, 2018, 1:59 a.m. UTC | #5
On 02/01, Gao Xiang wrote:
> Hi Jaegeuk and Chao,
> 
> On 2018/2/1 6:28, Jaegeuk Kim wrote:
> > On 01/31, Chao Yu wrote:
> > > On 2018/1/31 14:39, Gaoxiang (OS) wrote:
> > > > Previously, we attempt to flush the whole cp pack in a single bio,
> > > > however, when suddenly powering off at this time, we could get into
> > > > an extreme scenario that cp pack 1 page and cp pack 2 page are updated
> > > > and latest, but payload or current summaries are still partially
> > > > outdated. (see reliable write in the UFS specification)
> > > > 
> > > > This patch submits the whole cp pack except cp pack 2 page at first,
> > > > and then writes the cp pack 2 page with an extra independent
> > > > bio with pre-io barrier.
> > > > 
> > > > Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> > > > Reviewed-by: Chao Yu <yuchao0@huawei.com>
> > > > ---
> > > > Change log from v3:
> > > >    - further review comments are applied from Jaegeuk and Chao
> > > >    - Tested on this patch (without multiple-device): mount, boot Android with f2fs userdata and make fragment
> > > >    - If any problem with this patch or I miss something, please kindly share your comments, thanks :)
> > > > Change log from v2:
> > > >    - Apply the review comments from Chao
> > > > Change log from v1:
> > > >    - Apply the review comments from Chao
> > > >    - time data from "finish block_ops" to " finish checkpoint" (tested on ARM64 with TOSHIBA 128GB UFS):
> > > >       Before patch: 0.002273  0.001973  0.002789  0.005159  0.002050
> > > >       After patch: 0.002502  0.001624  0.002487  0.003049  0.002696
> > > >   fs/f2fs/checkpoint.c | 67 ++++++++++++++++++++++++++++++++++++----------------
> > > >   1 file changed, 46 insertions(+), 21 deletions(-)
> > > > 
> > > > diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> > > > index 14d2fed..916dc72 100644
> > > > --- a/fs/f2fs/checkpoint.c
> > > > +++ b/fs/f2fs/checkpoint.c
> > > > @@ -1158,6 +1158,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > > >   	spin_unlock_irqrestore(&sbi->cp_lock, flags);
> > > >   }
> > > > +static void commit_checkpoint(struct f2fs_sb_info *sbi,
> > > > +	void *src, block_t blk_addr)
> > > > +{
> > > > +	struct writeback_control wbc = {
> > > > +		.for_reclaim = 0,
> > > > +	};
> > > > +
> > > > +	/*
> > > > +	 * pagevec_lookup_tag and lock_page again will take
> > > > +	 * some extra time. Therefore, update_meta_pages and
> > > > +	 * sync_meta_pages are combined in this function.
> > > > +	 */
> > > > +	struct page *page = grab_meta_page(sbi, blk_addr);
> > > > +	int err;
> > > > +
> > > > +	memcpy(page_address(page), src, PAGE_SIZE);
> > > > +	set_page_dirty(page);
> > > > +
> > > > +	f2fs_wait_on_page_writeback(page, META, true);
> > > > +	f2fs_bug_on(sbi, PageWriteback(page));
> > > > +	if (unlikely(!clear_page_dirty_for_io(page)))
> > > > +		f2fs_bug_on(sbi, 1);
> > > > +
> > > > +	/* writeout cp pack 2 page */
> > > > +	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
> > > > +	f2fs_bug_on(sbi, err);
> > > > +
> > > > +	f2fs_put_page(page, 0);
> > > > +
> > > > +	/* submit checkpoint (with barrier if NOBARRIER is not set) */
> > > > +	f2fs_submit_merged_write(sbi, META_FLUSH);
> > > > +}
> > > > +
> > > >   static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > > >   {
> > > >   	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
> > > > @@ -1260,16 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > > >   		}
> > > >   	}
> > > > -	/* need to wait for end_io results */
> > > > -	wait_on_all_pages_writeback(sbi);
> > > > -	if (unlikely(f2fs_cp_error(sbi)))
> > > > -		return -EIO;
> > > > -
> > > > -	/* flush all device cache */
> > > > -	err = f2fs_flush_device_cache(sbi);
> > > > -	if (err)
> > > > -		return err;
> > > > -
> > > >   	/* write out checkpoint buffer at block 0 */
> > > >   	update_meta_page(sbi, ckpt, start_blk++);
> > > > @@ -1297,15 +1320,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > > >   		start_blk += NR_CURSEG_NODE_TYPE;
> > > >   	}
> > > > -	/* writeout checkpoint block */
> > > > -	update_meta_page(sbi, ckpt, start_blk);
> > > > -
> > > > -	/* wait for previous submitted node/meta pages writeback */
> > > > -	wait_on_all_pages_writeback(sbi);
> > > > -
> > > > -	if (unlikely(f2fs_cp_error(sbi)))
> > > > -		return -EIO;
> > > > -
> > > >   	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
> > > >   	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
> > 
> >   - remove
> > 
> You mean remove
> filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
> and
> filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
> or remove
> filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);

Yup, both of them.

> 
> Actually, I have no idea why do these two filemap_fdatawait_range stay here
> and what are these used and waited for in this place,
> however I found it was modified recently and for many times, I guess they
> have some use.
> 
> > > > @@ -1313,12 +1327,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > > >   	sbi->last_valid_block_count = sbi->total_valid_block_count;
> > > >   	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
> > > > -	/* Here, we only have one bio having CP pack */
> > > > -	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
> > > > +	/* Here, we have one bio having CP pack except cp pack 2 page */
> > > > +	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
> > > > +
> > > > +	/* flush all device cache */
> > > > +	err = f2fs_flush_device_cache(sbi);
> > > > +	if (err)
> > > > +		return err;
> > > >   	/* wait for previous submitted meta pages writeback */
> > > >   	wait_on_all_pages_writeback(sbi);
> > > 
> > > Move f2fs_flush_device_cache here? since meta area can cross the multiple
> > > devices, we should make sure all metadata were in device cache at least, and
> > > then trigger the flush.
> > 
> > Agreed, and need to flush, only if we have multiple devices.
> > 
> 
> OK, and it seems that
> - current f2fs_flush_device_cache
>    - for (i = 1; i < sbi->s_ndevs; i++) {
> it would skip if it only has only one devices?

Yup, right.

> 
> > > 
> > > > +	if (unlikely(f2fs_cp_error(sbi)))
> > > > +		return -EIO;
> > > > +
> > > > +	/* barrier and flush checkpoint cp pack 2 page if it can */
> > > > +	commit_checkpoint(sbi, ckpt, start_blk);
> > > 
> > > Jaegeuk, are we really allow to make critical do_checkpoint which is on path of
> > > fsync()/sync() be asynchronous?
> > 
> > Yeah, so we need to wait end_io on synchronous paths like f2fs_sync_fs(1).
> > 
> 
> How about adding a CP_SYNC (or CP_ASYNC) control flag in cp_control.reason?
> and which paths could be synchronous paths:
>     sync, umount, recovery - synchronous, right?
> 
> Thanks,
> 
> > > 
> > > Thanks,
> > > 
> > > > +
> > > >   	release_ino_entry(sbi, false);
> > > >   	if (unlikely(f2fs_cp_error(sbi)))
> > > >
diff mbox

Patch

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 14d2fed..916dc72 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1158,6 +1158,39 @@  static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	spin_unlock_irqrestore(&sbi->cp_lock, flags);
 }
 
+static void commit_checkpoint(struct f2fs_sb_info *sbi,
+	void *src, block_t blk_addr)
+{
+	struct writeback_control wbc = {
+		.for_reclaim = 0,
+	};
+
+	/*
+	 * pagevec_lookup_tag and lock_page again will take
+	 * some extra time. Therefore, update_meta_pages and
+	 * sync_meta_pages are combined in this function.
+	 */
+	struct page *page = grab_meta_page(sbi, blk_addr);
+	int err;
+
+	memcpy(page_address(page), src, PAGE_SIZE);
+	set_page_dirty(page);
+
+	f2fs_wait_on_page_writeback(page, META, true);
+	f2fs_bug_on(sbi, PageWriteback(page));
+	if (unlikely(!clear_page_dirty_for_io(page)))
+		f2fs_bug_on(sbi, 1);
+
+	/* writeout cp pack 2 page */
+	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
+	f2fs_bug_on(sbi, err);
+
+	f2fs_put_page(page, 0);
+
+	/* submit checkpoint (with barrier if NOBARRIER is not set) */
+	f2fs_submit_merged_write(sbi, META_FLUSH);
+}
+
 static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1260,16 +1293,6 @@  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		}
 	}
 
-	/* need to wait for end_io results */
-	wait_on_all_pages_writeback(sbi);
-	if (unlikely(f2fs_cp_error(sbi)))
-		return -EIO;
-
-	/* flush all device cache */
-	err = f2fs_flush_device_cache(sbi);
-	if (err)
-		return err;
-
 	/* write out checkpoint buffer at block 0 */
 	update_meta_page(sbi, ckpt, start_blk++);
 
@@ -1297,15 +1320,6 @@  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
 
-	/* writeout checkpoint block */
-	update_meta_page(sbi, ckpt, start_blk);
-
-	/* wait for previous submitted node/meta pages writeback */
-	wait_on_all_pages_writeback(sbi);
-
-	if (unlikely(f2fs_cp_error(sbi)))
-		return -EIO;
-
 	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
 	filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
 
@@ -1313,12 +1327,23 @@  static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	sbi->last_valid_block_count = sbi->total_valid_block_count;
 	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
 
-	/* Here, we only have one bio having CP pack */
-	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
+	/* Here, we have one bio having CP pack except cp pack 2 page */
+	sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+
+	/* flush all device cache */
+	err = f2fs_flush_device_cache(sbi);
+	if (err)
+		return err;
 
 	/* wait for previous submitted meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
+	/* barrier and flush checkpoint cp pack 2 page if it can */
+	commit_checkpoint(sbi, ckpt, start_blk);
+
 	release_ino_entry(sbi, false);
 
 	if (unlikely(f2fs_cp_error(sbi)))