diff mbox

[f2fs-dev,RFC] f2fs: clean up build_sit_entries

Message ID 9047C53C18267742AB12E43B65C7F9F70BCB4020@dggemi505-mbs.china.huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gao Xiang Dec. 17, 2017, 12:07 p.m. UTC
clean up some redundant, repeated code in build_sit_entries
and separate some logic into new functions:
 - build_discard_entries
 - build_sec_entries

Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"
are both taken out of the loops since they are unchangeable.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
---
 fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

Comments

Jaegeuk Kim Dec. 19, 2017, 11:40 p.m. UTC | #1
On 12/17, gaoxiang (P) wrote:
> clean up some redundant repeat code in build_sit_entries
> and seperate some logic into new functions:
>  - build_discard_entries
>  - build_sec_entries

This patch makes another big loop redundantly?

> 
> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"
> are all taken out of the loops they are unchangable.
> 
> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> ---
>  fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------
>  1 file changed, 37 insertions(+), 43 deletions(-)
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 40e1d20..bcd8a40 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)
>  	return restore_curseg_summaries(sbi);
>  }
>  
> +static void build_discard_entries(struct f2fs_sb_info *sbi)
> +{
> +	unsigned segno;
> +
> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {
> +		struct seg_entry *se = get_seg_entry(sbi, segno);
> +
> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))
> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
> +		else {
> +			memcpy(se->discard_map, se->cur_valid_map,
> +				SIT_VBLOCK_MAP_SIZE);
> +
> +			sbi->discard_blks += sbi->blocks_per_seg -
> +				se->valid_blocks;
> +		}
> +	}
> +}
> +
> +static void build_sec_entries(struct f2fs_sb_info *sbi)
> +{
> +	unsigned segno;
> +
> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)
> +		get_sec_entry(sbi, segno)->valid_blocks +=
> +			get_seg_entry(sbi, segno)->valid_blocks;
> +}
> +
>  static void build_sit_entries(struct f2fs_sb_info *sbi)
>  {
>  	struct sit_info *sit_i = SIT_I(sbi);
>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
>  	struct f2fs_journal *journal = curseg->journal;
> -	struct seg_entry *se;
>  	struct f2fs_sit_entry sit;
>  	int sit_blk_cnt = SIT_BLK_CNT(sbi);
>  	unsigned int i, start, end;
> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
>  			struct f2fs_sit_block *sit_blk;
>  			struct page *page;
>  
> -			se = &sit_i->sentries[start];
>  			page = get_current_sit_page(sbi, start);
>  			sit_blk = (struct f2fs_sit_block *)page_address(page);
>  			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
>  			f2fs_put_page(page, 1);
>  
>  			check_block_count(sbi, start, &sit);
> -			seg_info_from_raw_sit(se, &sit);
> -
> -			/* build discard map only one time */
> -			if (f2fs_discard_en(sbi)) {
> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> -					memset(se->discard_map, 0xff,
> -						SIT_VBLOCK_MAP_SIZE);
> -				} else {
> -					memcpy(se->discard_map,
> -						se->cur_valid_map,
> -						SIT_VBLOCK_MAP_SIZE);
> -					sbi->discard_blks +=
> -						sbi->blocks_per_seg -
> -						se->valid_blocks;
> -				}
> -			}
>  
> -			if (sbi->segs_per_sec > 1)
> -				get_sec_entry(sbi, start)->valid_blocks +=
> -							se->valid_blocks;
> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
>  		}
>  		start_blk += readed;
>  	} while (start_blk < sit_blk_cnt);
>  
>  	down_read(&curseg->journal_rwsem);
>  	for (i = 0; i < sits_in_cursum(journal); i++) {
> -		unsigned int old_valid_blocks;
> -
>  		start = le32_to_cpu(segno_in_journal(journal, i));
> -		se = &sit_i->sentries[start];
>  		sit = sit_in_journal(journal, i);
>  
> -		old_valid_blocks = se->valid_blocks;
> -
>  		check_block_count(sbi, start, &sit);
> -		seg_info_from_raw_sit(se, &sit);
> -
> -		if (f2fs_discard_en(sbi)) {
> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> -				memset(se->discard_map, 0xff,
> -							SIT_VBLOCK_MAP_SIZE);
> -			} else {
> -				memcpy(se->discard_map, se->cur_valid_map,
> -							SIT_VBLOCK_MAP_SIZE);
> -				sbi->discard_blks += old_valid_blocks -
> -							se->valid_blocks;
> -			}
> -		}
> -
> -		if (sbi->segs_per_sec > 1)
> -			get_sec_entry(sbi, start)->valid_blocks +=
> -				se->valid_blocks - old_valid_blocks;
> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
>  	}
>  	up_read(&curseg->journal_rwsem);
> +
> +	/* build discard map only one time */
> +	if (f2fs_discard_en(sbi))
> +		build_discard_entries(sbi);
> +
> +	if (sbi->segs_per_sec > 1)
> +		build_sec_entries(sbi);
>  }
>  
>  static void init_free_segmap(struct f2fs_sb_info *sbi)
> -- 
> 2.1.4
Gao Xiang Dec. 20, 2017, 1:36 a.m. UTC | #2
Hi Jaegeuk,

On 2017/12/20 7:40, Jaegeuk Kim wrote:
> On 12/17, gaoxiang (P) wrote:

>> clean up some redundant repeat code in build_sit_entries

>> and seperate some logic into new functions:

>>   - build_discard_entries

>>   - build_sec_entries

> 

> This patch makes another big loop redundantly?

>

"condition & jump" is the foundation of "if" statements and loops.
In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1 
are true, and I think, we have:
- the original approach:
	MAIN_SEGS   <-- loop condition
	MAIN_SEGS   <-- f2fs_discard_en(sbi)
	MAIN_SEGS   <-- sbi->segs_per_sec > 1

	sits_in_cursum(journal) <-- loop_condition
	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)
	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1
    it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.
- this patch:
	MAIN_SEGS   <-- loop condition
	sits_in_cursum(journal) <-- loop_condition

	MAIN_SEGS   <-- f2fs_discard_en(sbi)
	MAIN_SEGS   <-- sbi->segs_per_sec > 1
    it takes about 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.

and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 
1 are false, and we have:
- this patch
	MAIN_SEGS   <-- loop condition

	sits_in_cursum(journal) <-- loop_condition
See 
https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

In addition, this approach is easier to read than the original
one, so splitting the loop would be beneficial.

Thanks

>>

>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"

>> are all taken out of the loops they are unchangable.

>>

>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>

>> ---

>>   fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------

>>   1 file changed, 37 insertions(+), 43 deletions(-)

>>

>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c

>> index 40e1d20..bcd8a40 100644

>> --- a/fs/f2fs/segment.c

>> +++ b/fs/f2fs/segment.c

>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)

>>   	return restore_curseg_summaries(sbi);

>>   }

>>   

>> +static void build_discard_entries(struct f2fs_sb_info *sbi)

>> +{

>> +	unsigned segno;

>> +

>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {

>> +		struct seg_entry *se = get_seg_entry(sbi, segno);

>> +

>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))

>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);

>> +		else {

>> +			memcpy(se->discard_map, se->cur_valid_map,

>> +				SIT_VBLOCK_MAP_SIZE);

>> +

>> +			sbi->discard_blks += sbi->blocks_per_seg -

>> +				se->valid_blocks;

>> +		}

>> +	}

>> +}

>> +

>> +static void build_sec_entries(struct f2fs_sb_info *sbi)

>> +{

>> +	unsigned segno;

>> +

>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)

>> +		get_sec_entry(sbi, segno)->valid_blocks +=

>> +			get_seg_entry(sbi, segno)->valid_blocks;

>> +}

>> +

>>   static void build_sit_entries(struct f2fs_sb_info *sbi)

>>   {

>>   	struct sit_info *sit_i = SIT_I(sbi);

>>   	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);

>>   	struct f2fs_journal *journal = curseg->journal;

>> -	struct seg_entry *se;

>>   	struct f2fs_sit_entry sit;

>>   	int sit_blk_cnt = SIT_BLK_CNT(sbi);

>>   	unsigned int i, start, end;

>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)

>>   			struct f2fs_sit_block *sit_blk;

>>   			struct page *page;

>>   

>> -			se = &sit_i->sentries[start];

>>   			page = get_current_sit_page(sbi, start);

>>   			sit_blk = (struct f2fs_sit_block *)page_address(page);

>>   			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];

>>   			f2fs_put_page(page, 1);

>>   

>>   			check_block_count(sbi, start, &sit);

>> -			seg_info_from_raw_sit(se, &sit);

>> -

>> -			/* build discard map only one time */

>> -			if (f2fs_discard_en(sbi)) {

>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>> -					memset(se->discard_map, 0xff,

>> -						SIT_VBLOCK_MAP_SIZE);

>> -				} else {

>> -					memcpy(se->discard_map,

>> -						se->cur_valid_map,

>> -						SIT_VBLOCK_MAP_SIZE);

>> -					sbi->discard_blks +=

>> -						sbi->blocks_per_seg -

>> -						se->valid_blocks;

>> -				}

>> -			}

>>   

>> -			if (sbi->segs_per_sec > 1)

>> -				get_sec_entry(sbi, start)->valid_blocks +=

>> -							se->valid_blocks;

>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>   		}

>>   		start_blk += readed;

>>   	} while (start_blk < sit_blk_cnt);

>>   

>>   	down_read(&curseg->journal_rwsem);

>>   	for (i = 0; i < sits_in_cursum(journal); i++) {

>> -		unsigned int old_valid_blocks;

>> -

>>   		start = le32_to_cpu(segno_in_journal(journal, i));

>> -		se = &sit_i->sentries[start];

>>   		sit = sit_in_journal(journal, i);

>>   

>> -		old_valid_blocks = se->valid_blocks;

>> -

>>   		check_block_count(sbi, start, &sit);

>> -		seg_info_from_raw_sit(se, &sit);

>> -

>> -		if (f2fs_discard_en(sbi)) {

>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>> -				memset(se->discard_map, 0xff,

>> -							SIT_VBLOCK_MAP_SIZE);

>> -			} else {

>> -				memcpy(se->discard_map, se->cur_valid_map,

>> -							SIT_VBLOCK_MAP_SIZE);

>> -				sbi->discard_blks += old_valid_blocks -

>> -							se->valid_blocks;

>> -			}

>> -		}

>> -

>> -		if (sbi->segs_per_sec > 1)

>> -			get_sec_entry(sbi, start)->valid_blocks +=

>> -				se->valid_blocks - old_valid_blocks;

>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>   	}

>>   	up_read(&curseg->journal_rwsem);

>> +

>> +	/* build discard map only one time */

>> +	if (f2fs_discard_en(sbi))

>> +		build_discard_entries(sbi);

>> +

>> +	if (sbi->segs_per_sec > 1)

>> +		build_sec_entries(sbi);

>>   }

>>   

>>   static void init_free_segmap(struct f2fs_sb_info *sbi)

>> -- 

>> 2.1.4
Jaegeuk Kim Dec. 20, 2017, 2:31 a.m. UTC | #3
On 12/20, Gaoxiang (OS) wrote:
> Hi Jaegeuk,
> 
> On 2017/12/20 7:40, Jaegeuk Kim wrote:
> > On 12/17, gaoxiang (P) wrote:
> >> clean up some redundant repeat code in build_sit_entries
> >> and seperate some logic into new functions:
> >>   - build_discard_entries
> >>   - build_sec_entries
> > 
> > This patch makes another big loop redundantly?
> >
> "condition & jump" is the fundamental of "if" and loop.
> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1 
> are true, and I think, we have:
> - the original approach:
> 	MAIN_SEGS   <-- loop condition
> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)
> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1
> 
> 	sits_in_cursum(journal) <-- loop_condition
> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)
> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1
>     it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.
> - this patch:
> 	MAIN_SEGS   <-- loop condition
> 	sits_in_cursum(journal) <-- loop_condition
> 
> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)
> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1
>     it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.
> 
> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 
> 1 are false, and we have:
> - this patch
> 	MAIN_SEGS   <-- loop condition
> 
> 	sits_in_cursum(journal) <-- loop_condition
> See 
> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

Interesting!

What I can think of the worst case looks like:

In current code,
	for (N) {
		do_something;
		if (f2fs_discard_en())
			do _something;
		if (sbi->segs_per_sec > 1)
			do_something;
	}
	for (sits_in_cursum()) {
		do_something;
		if (f2fs_discard_en())
			do _something;
		if (sbi->segs_per_sec > 1)
			do_something;
	}
=> O(N + sits_in_cursum())

Once compiler unrolled the loop, we can expect most of jumps could be hit by
branch predictor, since the loop does not have many branches.

In the patch,
	for (N)
		do_something;
	for (sits_in_cursum())
		do_something;

	if (f2fs_discard_en()) {
		for (N)
			do_something;
	}
	if (sbi->segs_per_sec > 1) {
		for (N)
			do_something;
	}
=> O(3*N + sits_in_cursum())

BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split
the loops, as you describe.

In build_discard_entries(),
	for (N) {
		if (CP_TRIMMED_FLAG)
			do_something;
		else
			do_semething_else;
	}
Thanks,

> 
> In addtion, this approach is easier to read than the original 
> one, so splitting the loop would be beneficial.
> 
> Thanks
> 
> >>
> >> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"
> >> are all taken out of the loops they are unchangable.
> >>
> >> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> >> ---
> >>   fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------
> >>   1 file changed, 37 insertions(+), 43 deletions(-)
> >>
> >> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> >> index 40e1d20..bcd8a40 100644
> >> --- a/fs/f2fs/segment.c
> >> +++ b/fs/f2fs/segment.c
> >> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)
> >>   	return restore_curseg_summaries(sbi);
> >>   }
> >>   
> >> +static void build_discard_entries(struct f2fs_sb_info *sbi)
> >> +{
> >> +	unsigned segno;
> >> +
> >> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {
> >> +		struct seg_entry *se = get_seg_entry(sbi, segno);
> >> +
> >> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))
> >> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
> >> +		else {
> >> +			memcpy(se->discard_map, se->cur_valid_map,
> >> +				SIT_VBLOCK_MAP_SIZE);
> >> +
> >> +			sbi->discard_blks += sbi->blocks_per_seg -
> >> +				se->valid_blocks;
> >> +		}
> >> +	}
> >> +}
> >> +
> >> +static void build_sec_entries(struct f2fs_sb_info *sbi)
> >> +{
> >> +	unsigned segno;
> >> +
> >> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)
> >> +		get_sec_entry(sbi, segno)->valid_blocks +=
> >> +			get_seg_entry(sbi, segno)->valid_blocks;
> >> +}
> >> +
> >>   static void build_sit_entries(struct f2fs_sb_info *sbi)
> >>   {
> >>   	struct sit_info *sit_i = SIT_I(sbi);
> >>   	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
> >>   	struct f2fs_journal *journal = curseg->journal;
> >> -	struct seg_entry *se;
> >>   	struct f2fs_sit_entry sit;
> >>   	int sit_blk_cnt = SIT_BLK_CNT(sbi);
> >>   	unsigned int i, start, end;
> >> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
> >>   			struct f2fs_sit_block *sit_blk;
> >>   			struct page *page;
> >>   
> >> -			se = &sit_i->sentries[start];
> >>   			page = get_current_sit_page(sbi, start);
> >>   			sit_blk = (struct f2fs_sit_block *)page_address(page);
> >>   			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
> >>   			f2fs_put_page(page, 1);
> >>   
> >>   			check_block_count(sbi, start, &sit);
> >> -			seg_info_from_raw_sit(se, &sit);
> >> -
> >> -			/* build discard map only one time */
> >> -			if (f2fs_discard_en(sbi)) {
> >> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> >> -					memset(se->discard_map, 0xff,
> >> -						SIT_VBLOCK_MAP_SIZE);
> >> -				} else {
> >> -					memcpy(se->discard_map,
> >> -						se->cur_valid_map,
> >> -						SIT_VBLOCK_MAP_SIZE);
> >> -					sbi->discard_blks +=
> >> -						sbi->blocks_per_seg -
> >> -						se->valid_blocks;
> >> -				}
> >> -			}
> >>   
> >> -			if (sbi->segs_per_sec > 1)
> >> -				get_sec_entry(sbi, start)->valid_blocks +=
> >> -							se->valid_blocks;
> >> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
> >>   		}
> >>   		start_blk += readed;
> >>   	} while (start_blk < sit_blk_cnt);
> >>   
> >>   	down_read(&curseg->journal_rwsem);
> >>   	for (i = 0; i < sits_in_cursum(journal); i++) {
> >> -		unsigned int old_valid_blocks;
> >> -
> >>   		start = le32_to_cpu(segno_in_journal(journal, i));
> >> -		se = &sit_i->sentries[start];
> >>   		sit = sit_in_journal(journal, i);
> >>   
> >> -		old_valid_blocks = se->valid_blocks;
> >> -
> >>   		check_block_count(sbi, start, &sit);
> >> -		seg_info_from_raw_sit(se, &sit);
> >> -
> >> -		if (f2fs_discard_en(sbi)) {
> >> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> >> -				memset(se->discard_map, 0xff,
> >> -							SIT_VBLOCK_MAP_SIZE);
> >> -			} else {
> >> -				memcpy(se->discard_map, se->cur_valid_map,
> >> -							SIT_VBLOCK_MAP_SIZE);
> >> -				sbi->discard_blks += old_valid_blocks -
> >> -							se->valid_blocks;
> >> -			}
> >> -		}
> >> -
> >> -		if (sbi->segs_per_sec > 1)
> >> -			get_sec_entry(sbi, start)->valid_blocks +=
> >> -				se->valid_blocks - old_valid_blocks;
> >> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
> >>   	}
> >>   	up_read(&curseg->journal_rwsem);
> >> +
> >> +	/* build discard map only one time */
> >> +	if (f2fs_discard_en(sbi))
> >> +		build_discard_entries(sbi);
> >> +
> >> +	if (sbi->segs_per_sec > 1)
> >> +		build_sec_entries(sbi);
> >>   }
> >>   
> >>   static void init_free_segmap(struct f2fs_sb_info *sbi)
> >> -- 
> >> 2.1.4
Gao Xiang Dec. 20, 2017, 3:07 a.m. UTC | #4
Hi Jaegeuk,

On 2017/12/20 10:31, Jaegeuk Kim wrote:
> On 12/20, Gaoxiang (OS) wrote:

>> Hi Jaegeuk,

>>

>> On 2017/12/20 7:40, Jaegeuk Kim wrote:

>>> On 12/17, gaoxiang (P) wrote:

>>>> clean up some redundant repeat code in build_sit_entries

>>>> and seperate some logic into new functions:

>>>>    - build_discard_entries

>>>>    - build_sec_entries

>>>

>>> This patch makes another big loop redundantly?

>>>

>> "condition & jump" is the fundamental of "if" and loop.

>> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1

>> are true, and I think, we have:

>> - the original approach:

>> 	MAIN_SEGS   <-- loop condition

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)

>> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1

>>      it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.

>> - this patch:

>> 	MAIN_SEGS   <-- loop condition

>> 	sits_in_cursum(journal) <-- loop_condition

>>

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>      it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.

>>

>> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec >

>> 1 are false, and we have:

>> - this patch

>> 	MAIN_SEGS   <-- loop condition

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> See

>> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

> 

> Interesting!

> 

> What I can think of the worst case looks like:

> 

> In current code,

> 	for (N) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> 	for (sits_in_cursum()) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> => O(N + sits_in_cursum())


In the worst case it's 3 (1 -- exit condition < MAIN_SEGS, 2 -- f2fs_discard_en(), 3 --
sbi->segs_per_sec > 1) * N + 3*sits_in_cursum() condition & jump
instructions at runtime.

If we use Big O notation, yes, I think it is O(N + sits_in_cursum())

> 

> Once compiler unrolled the loop, we can expect most of jumps could be hit by

> branch predictor, since the loop does not have many branches.


The current code, the compiler could unroll the loop into, I guess
  	for (N) {
  		do_something;
        }
         if (f2fs_discard_en()) {
  		do _something;
        }
         if (sbi->segs_per_sec > 1) {
                do_something;
  	}
         ...
         for (sits_in_cursum()) {
                 do_something;
         }
         if (f2fs_discard_en()) {
                 do _something;
         }
         if (sbi->segs_per_sec > 1) {
                 do_something;
         }

Taking the branch predictor into consideration, I think the unrolled one is easier to predict
than the current rolled one. (as you say, the current one has more conditions in the loop
and an exit condition to predict)

> 

> In the patch,

> 	for (N)

> 		do_something;

> 	for (sits_in_cursum())

> 		do_something;

> 

> 	if (f2fs_discard_en()) {

> 		for (N)

> 			do_something;

> 	}

> 	if (sbi->segs_per_sec > 1) {

> 		for (N)

> 			do_something;

> 	}

> => O(3*N + sits_in_cursum())


If we use Big O notation, I think it is O(N + sits_in_cursum()) too.


> 

> BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split

> the loops, as you describe.

> 

> In build_discard_entries(),

> 	for (N) {

> 		if (CP_TRIMMED_FLAG)

> 			do_something;

> 		else

> 			do_semething_else;

> 	}

> Thanks,

> 


I think so, yet CP_TRIMMED_FLAG comes from a function argument, so the compiler
cannot know whether it is constant.

If we count the condition & jump instructions executed at runtime,
the unrolls above are beneficial, I think.

Thanks.

>>

>> In addtion, this approach is easier to read than the original

>> one, so splitting the loop would be beneficial.

>>

>> Thanks

>>

>>>>

>>>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"

>>>> are all taken out of the loops they are unchangable.

>>>>

>>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>

>>>> ---

>>>>    fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------

>>>>    1 file changed, 37 insertions(+), 43 deletions(-)

>>>>

>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c

>>>> index 40e1d20..bcd8a40 100644

>>>> --- a/fs/f2fs/segment.c

>>>> +++ b/fs/f2fs/segment.c

>>>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)

>>>>    	return restore_curseg_summaries(sbi);

>>>>    }

>>>>    

>>>> +static void build_discard_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {

>>>> +		struct seg_entry *se = get_seg_entry(sbi, segno);

>>>> +

>>>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))

>>>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);

>>>> +		else {

>>>> +			memcpy(se->discard_map, se->cur_valid_map,

>>>> +				SIT_VBLOCK_MAP_SIZE);

>>>> +

>>>> +			sbi->discard_blks += sbi->blocks_per_seg -

>>>> +				se->valid_blocks;

>>>> +		}

>>>> +	}

>>>> +}

>>>> +

>>>> +static void build_sec_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)

>>>> +		get_sec_entry(sbi, segno)->valid_blocks +=

>>>> +			get_seg_entry(sbi, segno)->valid_blocks;

>>>> +}

>>>> +

>>>>    static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    {

>>>>    	struct sit_info *sit_i = SIT_I(sbi);

>>>>    	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);

>>>>    	struct f2fs_journal *journal = curseg->journal;

>>>> -	struct seg_entry *se;

>>>>    	struct f2fs_sit_entry sit;

>>>>    	int sit_blk_cnt = SIT_BLK_CNT(sbi);

>>>>    	unsigned int i, start, end;

>>>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    			struct f2fs_sit_block *sit_blk;

>>>>    			struct page *page;

>>>>    

>>>> -			se = &sit_i->sentries[start];

>>>>    			page = get_current_sit_page(sbi, start);

>>>>    			sit_blk = (struct f2fs_sit_block *)page_address(page);

>>>>    			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];

>>>>    			f2fs_put_page(page, 1);

>>>>    

>>>>    			check_block_count(sbi, start, &sit);

>>>> -			seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -			/* build discard map only one time */

>>>> -			if (f2fs_discard_en(sbi)) {

>>>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -					memset(se->discard_map, 0xff,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -				} else {

>>>> -					memcpy(se->discard_map,

>>>> -						se->cur_valid_map,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -					sbi->discard_blks +=

>>>> -						sbi->blocks_per_seg -

>>>> -						se->valid_blocks;

>>>> -				}

>>>> -			}

>>>>    

>>>> -			if (sbi->segs_per_sec > 1)

>>>> -				get_sec_entry(sbi, start)->valid_blocks +=

>>>> -							se->valid_blocks;

>>>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    		}

>>>>    		start_blk += readed;

>>>>    	} while (start_blk < sit_blk_cnt);

>>>>    

>>>>    	down_read(&curseg->journal_rwsem);

>>>>    	for (i = 0; i < sits_in_cursum(journal); i++) {

>>>> -		unsigned int old_valid_blocks;

>>>> -

>>>>    		start = le32_to_cpu(segno_in_journal(journal, i));

>>>> -		se = &sit_i->sentries[start];

>>>>    		sit = sit_in_journal(journal, i);

>>>>    

>>>> -		old_valid_blocks = se->valid_blocks;

>>>> -

>>>>    		check_block_count(sbi, start, &sit);

>>>> -		seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -		if (f2fs_discard_en(sbi)) {

>>>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -				memset(se->discard_map, 0xff,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -			} else {

>>>> -				memcpy(se->discard_map, se->cur_valid_map,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -				sbi->discard_blks += old_valid_blocks -

>>>> -							se->valid_blocks;

>>>> -			}

>>>> -		}

>>>> -

>>>> -		if (sbi->segs_per_sec > 1)

>>>> -			get_sec_entry(sbi, start)->valid_blocks +=

>>>> -				se->valid_blocks - old_valid_blocks;

>>>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    	}

>>>>    	up_read(&curseg->journal_rwsem);

>>>> +

>>>> +	/* build discard map only one time */

>>>> +	if (f2fs_discard_en(sbi))

>>>> +		build_discard_entries(sbi);

>>>> +

>>>> +	if (sbi->segs_per_sec > 1)

>>>> +		build_sec_entries(sbi);

>>>>    }

>>>>    

>>>>    static void init_free_segmap(struct f2fs_sb_info *sbi)

>>>> -- 

>>>> 2.1.4
Gao Xiang Dec. 20, 2017, 3:12 a.m. UTC | #5
Hi Jaegeuk,

On 2017/12/20 10:31, Jaegeuk Kim wrote:
> On 12/20, Gaoxiang (OS) wrote:

>> Hi Jaegeuk,

>>

>> On 2017/12/20 7:40, Jaegeuk Kim wrote:

>>> On 12/17, gaoxiang (P) wrote:

>>>> clean up some redundant repeat code in build_sit_entries

>>>> and seperate some logic into new functions:

>>>>    - build_discard_entries

>>>>    - build_sec_entries

>>>

>>> This patch makes another big loop redundantly?

>>>

>> "condition & jump" is the fundamental of "if" and loop.

>> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1

>> are true, and I think, we have:

>> - the original approach:

>> 	MAIN_SEGS   <-- loop condition

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)

>> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1

>>      it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.

>> - this patch:

>> 	MAIN_SEGS   <-- loop condition

>> 	sits_in_cursum(journal) <-- loop_condition

>>

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>      it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.

>>

>> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec >

>> 1 are false, and we have:

>> - this patch

>> 	MAIN_SEGS   <-- loop condition

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> See

>> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

> 

> Interesting!

> 

> What I can think of the worst case looks like:

> 

> In current code,

> 	for (N) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> 	for (sits_in_cursum()) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> => O(N + sits_in_cursum())


In the worst case it's 3 (1 -- exit condition < MAIN_SEGS, 2 -- f2fs_discard_en(), 3 --
sbi->segs_per_sec > 1) * N + 3*sits_in_cursum() condition & jump
instructions at runtime.

If we use Big O notation, yes, I think it is O(N + sits_in_cursum())

> 

> Once compiler unrolled the loop, we can expect most of jumps could be hit by

> branch predictor, since the loop does not have many branches.


The current code, if the compiler decides to unroll the loop, it could unroll it into, I guess
  	for (N) {
  		do_something;
	}
	if (f2fs_discard_en()) {
		for(N) {
  			do _something;
		}
	}
	if (sbi->segs_per_sec > 1) {
		for(N) {
			do_something;
		}
	}
	...
	for (sits_in_cursum()) {
		do_something;
	}
	if (f2fs_discard_en()) {
		for (N) {
			do _something;
		}
	}
	if (sbi->segs_per_sec > 1) {
		for (N) {
			do_something;
		}
        	}
Taking the branch predictor into consideration, I think the unrolled one is easier to predict
than the current rolled one. (as you say, the current one has more conditions in the loop
and an exit condition to predict)

> 

> In the patch,

> 	for (N)

> 		do_something;

> 	for (sits_in_cursum())

> 		do_something;

> 

> 	if (f2fs_discard_en()) {

> 		for (N)

> 			do_something;

> 	}

> 	if (sbi->segs_per_sec > 1) {

> 		for (N)

> 			do_something;

> 	}

> => O(3*N + sits_in_cursum())


If we use Big O notation, I think it is O(N + sits_in_cursum()) too.


> 

> BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split

> the loops, as you describe.

> 

> In build_discard_entries(),

> 	for (N) {

> 		if (CP_TRIMMED_FLAG)

> 			do_something;

> 		else

> 			do_semething_else;

> 	}

> Thanks,

> 


I think so, yet CP_TRIMMED_FLAG comes from a function argument, so I think the compiler
cannot decide whether it is constant.

If we count the condition & jump instructions executed at runtime,
the unrolls above are beneficial, I think.

Thanks.

>>

>> In addtion, this approach is easier to read than the original

>> one, so splitting the loop would be beneficial.

>>

>> Thanks

>>

>>>>

>>>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"

>>>> are all taken out of the loops they are unchangable.

>>>>

>>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>

>>>> ---

>>>>    fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------

>>>>    1 file changed, 37 insertions(+), 43 deletions(-)

>>>>

>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c

>>>> index 40e1d20..bcd8a40 100644

>>>> --- a/fs/f2fs/segment.c

>>>> +++ b/fs/f2fs/segment.c

>>>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)

>>>>    	return restore_curseg_summaries(sbi);

>>>>    }

>>>>    

>>>> +static void build_discard_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {

>>>> +		struct seg_entry *se = get_seg_entry(sbi, segno);

>>>> +

>>>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))

>>>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);

>>>> +		else {

>>>> +			memcpy(se->discard_map, se->cur_valid_map,

>>>> +				SIT_VBLOCK_MAP_SIZE);

>>>> +

>>>> +			sbi->discard_blks += sbi->blocks_per_seg -

>>>> +				se->valid_blocks;

>>>> +		}

>>>> +	}

>>>> +}

>>>> +

>>>> +static void build_sec_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)

>>>> +		get_sec_entry(sbi, segno)->valid_blocks +=

>>>> +			get_seg_entry(sbi, segno)->valid_blocks;

>>>> +}

>>>> +

>>>>    static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    {

>>>>    	struct sit_info *sit_i = SIT_I(sbi);

>>>>    	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);

>>>>    	struct f2fs_journal *journal = curseg->journal;

>>>> -	struct seg_entry *se;

>>>>    	struct f2fs_sit_entry sit;

>>>>    	int sit_blk_cnt = SIT_BLK_CNT(sbi);

>>>>    	unsigned int i, start, end;

>>>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    			struct f2fs_sit_block *sit_blk;

>>>>    			struct page *page;

>>>>    

>>>> -			se = &sit_i->sentries[start];

>>>>    			page = get_current_sit_page(sbi, start);

>>>>    			sit_blk = (struct f2fs_sit_block *)page_address(page);

>>>>    			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];

>>>>    			f2fs_put_page(page, 1);

>>>>    

>>>>    			check_block_count(sbi, start, &sit);

>>>> -			seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -			/* build discard map only one time */

>>>> -			if (f2fs_discard_en(sbi)) {

>>>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -					memset(se->discard_map, 0xff,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -				} else {

>>>> -					memcpy(se->discard_map,

>>>> -						se->cur_valid_map,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -					sbi->discard_blks +=

>>>> -						sbi->blocks_per_seg -

>>>> -						se->valid_blocks;

>>>> -				}

>>>> -			}

>>>>    

>>>> -			if (sbi->segs_per_sec > 1)

>>>> -				get_sec_entry(sbi, start)->valid_blocks +=

>>>> -							se->valid_blocks;

>>>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    		}

>>>>    		start_blk += readed;

>>>>    	} while (start_blk < sit_blk_cnt);

>>>>    

>>>>    	down_read(&curseg->journal_rwsem);

>>>>    	for (i = 0; i < sits_in_cursum(journal); i++) {

>>>> -		unsigned int old_valid_blocks;

>>>> -

>>>>    		start = le32_to_cpu(segno_in_journal(journal, i));

>>>> -		se = &sit_i->sentries[start];

>>>>    		sit = sit_in_journal(journal, i);

>>>>    

>>>> -		old_valid_blocks = se->valid_blocks;

>>>> -

>>>>    		check_block_count(sbi, start, &sit);

>>>> -		seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -		if (f2fs_discard_en(sbi)) {

>>>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -				memset(se->discard_map, 0xff,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -			} else {

>>>> -				memcpy(se->discard_map, se->cur_valid_map,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -				sbi->discard_blks += old_valid_blocks -

>>>> -							se->valid_blocks;

>>>> -			}

>>>> -		}

>>>> -

>>>> -		if (sbi->segs_per_sec > 1)

>>>> -			get_sec_entry(sbi, start)->valid_blocks +=

>>>> -				se->valid_blocks - old_valid_blocks;

>>>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    	}

>>>>    	up_read(&curseg->journal_rwsem);

>>>> +

>>>> +	/* build discard map only one time */

>>>> +	if (f2fs_discard_en(sbi))

>>>> +		build_discard_entries(sbi);

>>>> +

>>>> +	if (sbi->segs_per_sec > 1)

>>>> +		build_sec_entries(sbi);

>>>>    }

>>>>    

>>>>    static void init_free_segmap(struct f2fs_sb_info *sbi)

>>>> -- 

>>>> 2.1.4
Gao Xiang Dec. 20, 2017, 5:46 a.m. UTC | #6
Hi Jaegeuk,

For compiler loop optimization,  refer to  "loop unswitching" section in 
https://zhuanlan.zhihu.com/p/23326776   (Chinese stack overflow-like website)
https://translate.google.com/translate?sl=zh-CN&tl=en&js=y&prev=_t&hl=zh-CN&ie=UTF-8&u=https%3A%2F%2Fzhuanlan.zhihu.com%2Fp%2F23326776&edit-text=&act=url

or

https://en.wikipedia.org/wiki/Loop_unswitching
http://www.compileroptimizations.com/category/unswitching.htm
https://gcc.gnu.org/onlinedocs/gcc-4.4.2/gcc/Optimize-Options.html
-O3
Optimize yet more. -O3 turns on all optimizations specified by -O2 and also turns on the -finline-functions, -funswitch-loops, -fpredictive-commoning, -fgcse-after-reload and -ftree-vectorize options.

For branch predictor, I think it is much easier to predict and analyze to parallelize
while (A){		// only one miss
	...
}

if (B) {
	while (A) {	// only one miss
		...
	}
}

Than

while (A) {
	...
	if (B) {		// A->B->A->B crossing
		...
	}
}

For most of processors.

On 2017/12/20 10:31, Jaegeuk Kim wrote:
> On 12/20, Gaoxiang (OS) wrote:

>> Hi Jaegeuk,

>>

>> On 2017/12/20 7:40, Jaegeuk Kim wrote:

>>> On 12/17, gaoxiang (P) wrote:

>>>> clean up some redundant repeat code in build_sit_entries

>>>> and seperate some logic into new functions:

>>>>    - build_discard_entries

>>>>    - build_sec_entries

>>>

>>> This patch makes another big loop redundantly?

>>>

>> "condition & jump" is the fundamental of "if" and loop.

>> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1

>> are true, and I think, we have:

>> - the original approach:

>> 	MAIN_SEGS   <-- loop condition

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)

>> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1

>>      it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.

>> - this patch:

>> 	MAIN_SEGS   <-- loop condition

>> 	sits_in_cursum(journal) <-- loop_condition

>>

>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>      it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.

>>

>> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec >

>> 1 are false, and we have:

>> - this patch

>> 	MAIN_SEGS   <-- loop condition

>>

>> 	sits_in_cursum(journal) <-- loop_condition

>> See

>> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

> 

> Interesting!

> 

> What I can think of the worst case looks like:

> 

> In current code,

> 	for (N) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> 	for (sits_in_cursum()) {

> 		do_something;

> 		if (f2fs_discard_en())

> 			do _something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> => O(N + sits_in_cursum())


In the worst case, it's 3 (1 -- exit condition < MAIN_SEG, 2 -- f2fs_discard_en(), 3 --
sbi->segs_per_sec > 1) * N + 3*sits_in_cursum() condition & jump
instructions at runtime.

If we use Big O notation, yes, I think O(N + sits_in_cursum())

> 

> Once compiler unrolled the loop, we can expect most of jumps could be hit by

> branch predictor, since the loop does not have many branches.


The current code, if the compiler decides to unroll the loop, it could unroll it into, I guess
  	for (N) {
  		do_something;
	}
	if (f2fs_discard_en()) {
		for(N) {
  			do _something;
		}
	}
	if (sbi->segs_per_sec > 1) {
		for(N) {
			do_something;
		}
	}
	...
	/* extra */
	for (sits_in_cursum()) {
		do_something;
	}
	if (f2fs_discard_en()) {
		for (sits_in_cursum()) {
			do _something;
		}
	}
	if (sbi->segs_per_sec > 1) {
		for (sits_in_cursum()) {
			do_something;
		}
        	}
Taking the branch predictor into consideration, I think the unrolled one is easier to predict
than the current rolled one. (As you say, the current one has more conditions in the loop
and an exit condition to predict.)

> 

> In the patch,

> 	for (N)

> 		do_something;

> 	for (sits_in_cursum())

> 		do_something;

> 

> 	if (f2fs_discard_en()) {

> 		for (N)

> 			do_something;

> 	}

> 	if (sbi->segs_per_sec > 1) {

> 		for (N)

> 			do_something;

> 	}

> => O(3*N + sits_in_cursum())


If we use Big O notation, I think O(N + sits_in_cursum()) too.


> 

> BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split

> the loops, as you describe.

> 

> In build_discard_entries(),

> 	for (N) {

> 		if (CP_TRIMMED_FLAG)

> 			do_something;

> 		else

> 			do_semething_else;

> 	}

> Thanks,

> 


I think so, yet CP_TRIMMED_FLAG comes from a function argument, so I think the compiler
cannot decide whether it is constant.

If we count the condition & jump instructions executed at runtime,
the unrolls above are beneficial, I think.

Thanks.

>>

>> In addtion, this approach is easier to read than the original

>> one, so splitting the loop would be beneficial.

>>

>> Thanks

>>

>>>>

>>>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"

>>>> are all taken out of the loops they are unchangable.

>>>>

>>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>

>>>> ---

>>>>    fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------

>>>>    1 file changed, 37 insertions(+), 43 deletions(-)

>>>>

>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c

>>>> index 40e1d20..bcd8a40 100644

>>>> --- a/fs/f2fs/segment.c

>>>> +++ b/fs/f2fs/segment.c

>>>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)

>>>>    	return restore_curseg_summaries(sbi);

>>>>    }

>>>>    

>>>> +static void build_discard_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {

>>>> +		struct seg_entry *se = get_seg_entry(sbi, segno);

>>>> +

>>>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))

>>>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);

>>>> +		else {

>>>> +			memcpy(se->discard_map, se->cur_valid_map,

>>>> +				SIT_VBLOCK_MAP_SIZE);

>>>> +

>>>> +			sbi->discard_blks += sbi->blocks_per_seg -

>>>> +				se->valid_blocks;

>>>> +		}

>>>> +	}

>>>> +}

>>>> +

>>>> +static void build_sec_entries(struct f2fs_sb_info *sbi)

>>>> +{

>>>> +	unsigned segno;

>>>> +

>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)

>>>> +		get_sec_entry(sbi, segno)->valid_blocks +=

>>>> +			get_seg_entry(sbi, segno)->valid_blocks;

>>>> +}

>>>> +

>>>>    static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    {

>>>>    	struct sit_info *sit_i = SIT_I(sbi);

>>>>    	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);

>>>>    	struct f2fs_journal *journal = curseg->journal;

>>>> -	struct seg_entry *se;

>>>>    	struct f2fs_sit_entry sit;

>>>>    	int sit_blk_cnt = SIT_BLK_CNT(sbi);

>>>>    	unsigned int i, start, end;

>>>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>    			struct f2fs_sit_block *sit_blk;

>>>>    			struct page *page;

>>>>    

>>>> -			se = &sit_i->sentries[start];

>>>>    			page = get_current_sit_page(sbi, start);

>>>>    			sit_blk = (struct f2fs_sit_block *)page_address(page);

>>>>    			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];

>>>>    			f2fs_put_page(page, 1);

>>>>    

>>>>    			check_block_count(sbi, start, &sit);

>>>> -			seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -			/* build discard map only one time */

>>>> -			if (f2fs_discard_en(sbi)) {

>>>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -					memset(se->discard_map, 0xff,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -				} else {

>>>> -					memcpy(se->discard_map,

>>>> -						se->cur_valid_map,

>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>> -					sbi->discard_blks +=

>>>> -						sbi->blocks_per_seg -

>>>> -						se->valid_blocks;

>>>> -				}

>>>> -			}

>>>>    

>>>> -			if (sbi->segs_per_sec > 1)

>>>> -				get_sec_entry(sbi, start)->valid_blocks +=

>>>> -							se->valid_blocks;

>>>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    		}

>>>>    		start_blk += readed;

>>>>    	} while (start_blk < sit_blk_cnt);

>>>>    

>>>>    	down_read(&curseg->journal_rwsem);

>>>>    	for (i = 0; i < sits_in_cursum(journal); i++) {

>>>> -		unsigned int old_valid_blocks;

>>>> -

>>>>    		start = le32_to_cpu(segno_in_journal(journal, i));

>>>> -		se = &sit_i->sentries[start];

>>>>    		sit = sit_in_journal(journal, i);

>>>>    

>>>> -		old_valid_blocks = se->valid_blocks;

>>>> -

>>>>    		check_block_count(sbi, start, &sit);

>>>> -		seg_info_from_raw_sit(se, &sit);

>>>> -

>>>> -		if (f2fs_discard_en(sbi)) {

>>>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>> -				memset(se->discard_map, 0xff,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -			} else {

>>>> -				memcpy(se->discard_map, se->cur_valid_map,

>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>> -				sbi->discard_blks += old_valid_blocks -

>>>> -							se->valid_blocks;

>>>> -			}

>>>> -		}

>>>> -

>>>> -		if (sbi->segs_per_sec > 1)

>>>> -			get_sec_entry(sbi, start)->valid_blocks +=

>>>> -				se->valid_blocks - old_valid_blocks;

>>>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>    	}

>>>>    	up_read(&curseg->journal_rwsem);

>>>> +

>>>> +	/* build discard map only one time */

>>>> +	if (f2fs_discard_en(sbi))

>>>> +		build_discard_entries(sbi);

>>>> +

>>>> +	if (sbi->segs_per_sec > 1)

>>>> +		build_sec_entries(sbi);

>>>>    }

>>>>    

>>>>    static void init_free_segmap(struct f2fs_sb_info *sbi)

>>>> -- 

>>>> 2.1.4
Jaegeuk Kim Dec. 20, 2017, 8:39 p.m. UTC | #7
On 12/20, Gaoxiang (OS) wrote:
> Hi Jaegeuk,
> 
> On 2017/12/20 10:31, Jaegeuk Kim wrote:
> > On 12/20, Gaoxiang (OS) wrote:
> >> Hi Jaegeuk,
> >>
> >> On 2017/12/20 7:40, Jaegeuk Kim wrote:
> >>> On 12/17, gaoxiang (P) wrote:
> >>>> clean up some redundant repeat code in build_sit_entries
> >>>> and seperate some logic into new functions:
> >>>>    - build_discard_entries
> >>>>    - build_sec_entries
> >>>
> >>> This patch makes another big loop redundantly?
> >>>
> >> "condition & jump" is the fundamental of "if" and loop.
> >> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1
> >> are true, and I think, we have:
> >> - the original approach:
> >> 	MAIN_SEGS   <-- loop condition
> >> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)
> >> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1
> >>
> >> 	sits_in_cursum(journal) <-- loop_condition
> >> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)
> >> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1
> >>      it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.
> >> - this patch:
> >> 	MAIN_SEGS   <-- loop condition
> >> 	sits_in_cursum(journal) <-- loop_condition
> >>
> >> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)
> >> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1
> >>      it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.
> >>
> >> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec >
> >> 1 are false, and we have:
> >> - this patch
> >> 	MAIN_SEGS   <-- loop condition
> >>
> >> 	sits_in_cursum(journal) <-- loop_condition
> >> See
> >> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops
> > 
> > Interesting!
> > 
> > What I can think of the worst case looks like:
> > 
> > In current code,
> > 	for (N) {
> > 		do_something;
> > 		if (f2fs_discard_en())
> > 			do _something;
> > 		if (sbi->segs_per_sec > 1)
> > 			do_something;
> > 	}
> > 	for (sits_in_cursum()) {
> > 		do_something;
> > 		if (f2fs_discard_en())
> > 			do _something;
> > 		if (sbi->segs_per_sec > 1)
> > 			do_something;
> > 	}
> > => O(N + sits_in_cursum())
> 
> In the worst case Its 3(1 -- exit condition < MAIN_SEG, 2 --  f2fs_discard_en(), 3 -- 
> sbi->segs_per_sec > 1) *N + 3*sits_in_cursum() if condition & jump 
> instuctions at runtime.
> 
> If we use Big O notion, yes I think O(N + sits_in_cursum())
> 
> > 
> > Once compiler unrolled the loop, we can expect most of jumps could be hit by
> > branch predictor, since the loop does not have many branches.
> 
> The current code, if the compiler decides to unroll the loop, it could unroll it into, I guess
>   	for (N) {
>   		do_something;
> 	}
> 	if (f2fs_discard_en()) {
> 		for(N) {
>   			do _something;
> 		}
> 	}
> 	if (sbi->segs_per_sec > 1) {
> 		for(N) {
> 			do_something;
> 		}
> 	}
> 	...
> 	for (sits_in_cursum()) {
> 		do_something;
> 	}
> 	if (f2fs_discard_en()) {
> 		for (N) {
> 			do _something;
> 		}
> 	}
> 	if (sbi->segs_per_sec > 1) {
> 		for (N) {
> 			do_something;
> 		}
>         	}

Unrolled loop would be something like:
	for (N/n) {
		do_something #1;
		if (f2fs_discard_en())
			do_something;
		if (sbi->segs_per_sec > 1)
			do_something;
		...
		do_something #n;
		if (f2fs_discard_en())
			do_something;
		if (sbi->segs_per_sec > 1)
			do_something;
	}

And, branch predictor can avoid three branch conditions by BTB.

do_something #1;
  if (f2fs_discard_en())
    do _something;
      if (sbi->segs_per_sec > 1)
        do_something;
          do_something #2;
            if (f2fs_discard_en())
              do _something;
                if (sbi->segs_per_sec > 1)
                  do_something;
                    ...
                      do_something #N;
                        if (f2fs_discard_en())
                          do _something;
                            if (sbi->segs_per_sec > 1)
                              do_something;

Anyway, I don't see a huge benefit regarding performance and code readability,
but I do worry about potential patch conflicts when backporting this.

> Taking branch predictor into consideration, I think the unrolled one is more easier to predict
> than the current rolled one. (as you say, the current has more conditions in the loop
> and a exit condition to predict)
> 
> > 
> > In the patch,
> > 	for (N)
> > 		do_something;
> > 	for (sits_in_cursum())
> > 		do_something;
> > 
> > 	if (f2fs_discard_en()) {
> > 		for (N)
> > 			do_something;
> > 	}
> > 	if (sbi->segs_per_sec > 1) {
> > 		for (N)
> > 			do_something;
> > 	}
> > => O(3*N + sits_in_cursum())
> 
> If we use Big O notion, I think O(N + sits_in_cursum()) too.
> > 
> > BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split
> > the loops, as you describe.
> > 
> > In build_discard_entries(),
> > 	for (N) {
> > 		if (CP_TRIMMED_FLAG)
> > 			do_something;
> > 		else
> > 			do_semething_else;
> > 	}
> > Thanks,
> > 
> 
> I think so, yet CP_TRIMMED_FLAG comes from a function argument, so I think compiler
> cannot decide whether it is constant.
> 
> I think if we count the runtime condition & jump instruction at runtime. 
> the unrolls above are beneficial, I think.
> 
> Thanks.
> 
> >>
> >> In addtion, this approach is easier to read than the original
> >> one, so splitting the loop would be beneficial.
> >>
> >> Thanks
> >>
> >>>>
> >>>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"
> >>>> are all taken out of the loops they are unchangable.
> >>>>
> >>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
> >>>> ---
> >>>>    fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------
> >>>>    1 file changed, 37 insertions(+), 43 deletions(-)
> >>>>
> >>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> >>>> index 40e1d20..bcd8a40 100644
> >>>> --- a/fs/f2fs/segment.c
> >>>> +++ b/fs/f2fs/segment.c
> >>>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)
> >>>>    	return restore_curseg_summaries(sbi);
> >>>>    }
> >>>>    
> >>>> +static void build_discard_entries(struct f2fs_sb_info *sbi)
> >>>> +{
> >>>> +	unsigned segno;
> >>>> +
> >>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {
> >>>> +		struct seg_entry *se = get_seg_entry(sbi, segno);
> >>>> +
> >>>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))
> >>>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
> >>>> +		else {
> >>>> +			memcpy(se->discard_map, se->cur_valid_map,
> >>>> +				SIT_VBLOCK_MAP_SIZE);
> >>>> +
> >>>> +			sbi->discard_blks += sbi->blocks_per_seg -
> >>>> +				se->valid_blocks;
> >>>> +		}
> >>>> +	}
> >>>> +}
> >>>> +
> >>>> +static void build_sec_entries(struct f2fs_sb_info *sbi)
> >>>> +{
> >>>> +	unsigned segno;
> >>>> +
> >>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)
> >>>> +		get_sec_entry(sbi, segno)->valid_blocks +=
> >>>> +			get_seg_entry(sbi, segno)->valid_blocks;
> >>>> +}
> >>>> +
> >>>>    static void build_sit_entries(struct f2fs_sb_info *sbi)
> >>>>    {
> >>>>    	struct sit_info *sit_i = SIT_I(sbi);
> >>>>    	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
> >>>>    	struct f2fs_journal *journal = curseg->journal;
> >>>> -	struct seg_entry *se;
> >>>>    	struct f2fs_sit_entry sit;
> >>>>    	int sit_blk_cnt = SIT_BLK_CNT(sbi);
> >>>>    	unsigned int i, start, end;
> >>>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
> >>>>    			struct f2fs_sit_block *sit_blk;
> >>>>    			struct page *page;
> >>>>    
> >>>> -			se = &sit_i->sentries[start];
> >>>>    			page = get_current_sit_page(sbi, start);
> >>>>    			sit_blk = (struct f2fs_sit_block *)page_address(page);
> >>>>    			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
> >>>>    			f2fs_put_page(page, 1);
> >>>>    
> >>>>    			check_block_count(sbi, start, &sit);
> >>>> -			seg_info_from_raw_sit(se, &sit);
> >>>> -
> >>>> -			/* build discard map only one time */
> >>>> -			if (f2fs_discard_en(sbi)) {
> >>>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> >>>> -					memset(se->discard_map, 0xff,
> >>>> -						SIT_VBLOCK_MAP_SIZE);
> >>>> -				} else {
> >>>> -					memcpy(se->discard_map,
> >>>> -						se->cur_valid_map,
> >>>> -						SIT_VBLOCK_MAP_SIZE);
> >>>> -					sbi->discard_blks +=
> >>>> -						sbi->blocks_per_seg -
> >>>> -						se->valid_blocks;
> >>>> -				}
> >>>> -			}
> >>>>    
> >>>> -			if (sbi->segs_per_sec > 1)
> >>>> -				get_sec_entry(sbi, start)->valid_blocks +=
> >>>> -							se->valid_blocks;
> >>>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
> >>>>    		}
> >>>>    		start_blk += readed;
> >>>>    	} while (start_blk < sit_blk_cnt);
> >>>>    
> >>>>    	down_read(&curseg->journal_rwsem);
> >>>>    	for (i = 0; i < sits_in_cursum(journal); i++) {
> >>>> -		unsigned int old_valid_blocks;
> >>>> -
> >>>>    		start = le32_to_cpu(segno_in_journal(journal, i));
> >>>> -		se = &sit_i->sentries[start];
> >>>>    		sit = sit_in_journal(journal, i);
> >>>>    
> >>>> -		old_valid_blocks = se->valid_blocks;
> >>>> -
> >>>>    		check_block_count(sbi, start, &sit);
> >>>> -		seg_info_from_raw_sit(se, &sit);
> >>>> -
> >>>> -		if (f2fs_discard_en(sbi)) {
> >>>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
> >>>> -				memset(se->discard_map, 0xff,
> >>>> -							SIT_VBLOCK_MAP_SIZE);
> >>>> -			} else {
> >>>> -				memcpy(se->discard_map, se->cur_valid_map,
> >>>> -							SIT_VBLOCK_MAP_SIZE);
> >>>> -				sbi->discard_blks += old_valid_blocks -
> >>>> -							se->valid_blocks;
> >>>> -			}
> >>>> -		}
> >>>> -
> >>>> -		if (sbi->segs_per_sec > 1)
> >>>> -			get_sec_entry(sbi, start)->valid_blocks +=
> >>>> -				se->valid_blocks - old_valid_blocks;
> >>>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
> >>>>    	}
> >>>>    	up_read(&curseg->journal_rwsem);
> >>>> +
> >>>> +	/* build discard map only one time */
> >>>> +	if (f2fs_discard_en(sbi))
> >>>> +		build_discard_entries(sbi);
> >>>> +
> >>>> +	if (sbi->segs_per_sec > 1)
> >>>> +		build_sec_entries(sbi);
> >>>>    }
> >>>>    
> >>>>    static void init_free_segmap(struct f2fs_sb_info *sbi)
> >>>> -- 
> >>>> 2.1.4
Gao Xiang Dec. 21, 2017, 1:31 a.m. UTC | #8
Hi Jaegeuk,

On 2017/12/21 4:39, Jaegeuk Kim wrote:
> On 12/20, Gaoxiang (OS) wrote:

>> Hi Jaegeuk,

>>

>> On 2017/12/20 10:31, Jaegeuk Kim wrote:

>>> On 12/20, Gaoxiang (OS) wrote:

>>>> Hi Jaegeuk,

>>>>

>>>> On 2017/12/20 7:40, Jaegeuk Kim wrote:

>>>>> On 12/17, gaoxiang (P) wrote:

>>>>>> clean up some redundant repeat code in build_sit_entries

>>>>>> and seperate some logic into new functions:

>>>>>>     - build_discard_entries

>>>>>>     - build_sec_entries

>>>>>

>>>>> This patch makes another big loop redundantly?

>>>>>

>>>> "condition & jump" is the fundamental of "if" and loop.

>>>> In the worst case, both f2fs_discard_en(sbi) and sbi->segs_per_sec > 1

>>>> are true, and I think, we have:

>>>> - the original approach:

>>>> 	MAIN_SEGS   <-- loop condition

>>>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>>>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>>>

>>>> 	sits_in_cursum(journal) <-- loop_condition

>>>> 	sits_in_cursum(journal) <-- f2fs_discard_en(sbi)

>>>> 	sits_in_cursum(journal) <-- sbi->segs_per_sec > 1

>>>>       it takes about 3*(MAIN_SEGS + sits_in_cursum(journal)) jumps effort.

>>>> - this patch:

>>>> 	MAIN_SEGS   <-- loop condition

>>>> 	sits_in_cursum(journal) <-- loop_condition

>>>>

>>>> 	MAIN_SEGS   <-- f2fs_discard_en(sbi)

>>>> 	MAIN_SEGS   <-- sbi->segs_per_sec > 1

>>>>       it takes abourt 3*MAIN_SEGS + sits_in_cursum(journal) jumps effort.

>>>>

>>>> and in the best case, both f2fs_discard_en(sbi) and sbi->segs_per_sec >

>>>> 1 are false, and we have:

>>>> - this patch

>>>> 	MAIN_SEGS   <-- loop condition

>>>>

>>>> 	sits_in_cursum(journal) <-- loop_condition

>>>> See

>>>> https://stackoverflow.com/questions/1462710/can-c-compilers-optimize-if-statements-inside-for-loops

>>>

>>> Interesting!

>>>

>>> What I can think of the worst case looks like:

>>>

>>> In current code,

>>> 	for (N) {

>>> 		do_something;

>>> 		if (f2fs_discard_en())

>>> 			do _something;

>>> 		if (sbi->segs_per_sec > 1)

>>> 			do_something;

>>> 	}

>>> 	for (sits_in_cursum()) {

>>> 		do_something;

>>> 		if (f2fs_discard_en())

>>> 			do _something;

>>> 		if (sbi->segs_per_sec > 1)

>>> 			do_something;

>>> 	}

>>> => O(N + sits_in_cursum())

>>

>> In the worst case Its 3(1 -- exit condition < MAIN_SEG, 2 --  f2fs_discard_en(), 3 --

>> sbi->segs_per_sec > 1) *N + 3*sits_in_cursum() if condition & jump

>> instuctions at runtime.

>>

>> If we use Big O notion, yes I think O(N + sits_in_cursum())

>>

>>>

>>> Once compiler unrolled the loop, we can expect most of jumps could be hit by

>>> branch predictor, since the loop does not have many branches.

>>

>> The current code, if the compiler decides to unroll the loop, it could unroll it into, I guess

>>    	for (N) {

>>    		do_something;

>> 	}

>> 	if (f2fs_discard_en()) {

>> 		for(N) {

>>    			do _something;

>> 		}

>> 	}

>> 	if (sbi->segs_per_sec > 1) {

>> 		for(N) {

>> 			do_something;

>> 		}

>> 	}

>> 	...

>> 	for (sits_in_cursum()) {

>> 		do_something;

>> 	}

>> 	if (f2fs_discard_en()) {

>> 		for (N) {

>> 			do _something;

>> 		}

>> 	}

>> 	if (sbi->segs_per_sec > 1) {

>> 		for (N) {

>> 			do_something;

>> 		}

>>          	}

> 

> Unrolled loop would be something like:

> 	for (N/n) {

> 		do_something #1;

> 		if (f2fs_discard_en())

> 			do_something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 		...

> 		do_something #n;

> 		if (f2fs_discard_en())

> 			do_something;

> 		if (sbi->segs_per_sec > 1)

> 			do_something;

> 	}

> 

> And, branch predictor can avoid three branch conditions by BTB.

> 

> do_something #1;

>    if (f2fs_discard_en())

>      do _something;

>        if (sbi->segs_per_sec > 1)

>          do_something;

>            do_something #2;

>              if (f2fs_discard_en())

>                do _something;

>                  if (sbi->segs_per_sec > 1)

>                    do_something;

>                      ...

>                        do_something #N;

>                          if (f2fs_discard_en())

>                            do _something;

>                              if (sbi->segs_per_sec > 1)

>                                do_something;

> 

> Anyway, I don't see huge benefit regarding to performance and code readability,

> but do worry about potential patch conflicts when backporting this.

> 

OK, I think it depends on the kind and complexity of the BTB the target processor
or architecture uses, and how the target architecture enhances parallelization.

I am new to f2fs and curious about its detailed implementation.
I'm more focused on code readability, since some code with jumps is
a little bit hacky. :)

Thanks,

>> Taking branch predictor into consideration, I think the unrolled one is more easier to predict

>> than the current rolled one. (as you say, the current has more conditions in the loop

>> and a exit condition to predict)

>>

>>>

>>> In the patch,

>>> 	for (N)

>>> 		do_something;

>>> 	for (sits_in_cursum())

>>> 		do_something;

>>>

>>> 	if (f2fs_discard_en()) {

>>> 		for (N)

>>> 			do_something;

>>> 	}

>>> 	if (sbi->segs_per_sec > 1) {

>>> 		for (N)

>>> 			do_something;

>>> 	}

>>> => O(3*N + sits_in_cursum())

>>

>> If we use Big O notion, I think O(N + sits_in_cursum()) too.

>>>

>>> BTW, build_discard_entries(struct f2fs_sb_info *sbi) is more likely to split

>>> the loops, as you describe.

>>>

>>> In build_discard_entries(),

>>> 	for (N) {

>>> 		if (CP_TRIMMED_FLAG)

>>> 			do_something;

>>> 		else

>>> 			do_semething_else;

>>> 	}

>>> Thanks,

>>>

>>

>> I think so, yet CP_TRIMMED_FLAG comes from a function argument, so I think compiler

>> cannot decide whether it is constant.

>>

>> I think if we count the runtime condition & jump instruction at runtime.

>> the unrolls above are beneficial, I think.

>>

>> Thanks.

>>

>>>>

>>>> In addtion, this approach is easier to read than the original

>>>> one, so splitting the loop would be beneficial.

>>>>

>>>> Thanks

>>>>

>>>>>>

>>>>>> Note that "if (f2fs_discard_en(sbi))" and "if (sbi->segs_per_sec > 1)"

>>>>>> are all taken out of the loops they are unchangable.

>>>>>>

>>>>>> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>

>>>>>> ---

>>>>>>     fs/f2fs/segment.c | 80 +++++++++++++++++++++++++------------------------------

>>>>>>     1 file changed, 37 insertions(+), 43 deletions(-)

>>>>>>

>>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c

>>>>>> index 40e1d20..bcd8a40 100644

>>>>>> --- a/fs/f2fs/segment.c

>>>>>> +++ b/fs/f2fs/segment.c

>>>>>> @@ -3476,12 +3476,39 @@ static int build_curseg(struct f2fs_sb_info *sbi)

>>>>>>     	return restore_curseg_summaries(sbi);

>>>>>>     }

>>>>>>     

>>>>>> +static void build_discard_entries(struct f2fs_sb_info *sbi)

>>>>>> +{

>>>>>> +	unsigned segno;

>>>>>> +

>>>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {

>>>>>> +		struct seg_entry *se = get_seg_entry(sbi, segno);

>>>>>> +

>>>>>> +		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))

>>>>>> +			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);

>>>>>> +		else {

>>>>>> +			memcpy(se->discard_map, se->cur_valid_map,

>>>>>> +				SIT_VBLOCK_MAP_SIZE);

>>>>>> +

>>>>>> +			sbi->discard_blks += sbi->blocks_per_seg -

>>>>>> +				se->valid_blocks;

>>>>>> +		}

>>>>>> +	}

>>>>>> +}

>>>>>> +

>>>>>> +static void build_sec_entries(struct f2fs_sb_info *sbi)

>>>>>> +{

>>>>>> +	unsigned segno;

>>>>>> +

>>>>>> +	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)

>>>>>> +		get_sec_entry(sbi, segno)->valid_blocks +=

>>>>>> +			get_seg_entry(sbi, segno)->valid_blocks;

>>>>>> +}

>>>>>> +

>>>>>>     static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>>>     {

>>>>>>     	struct sit_info *sit_i = SIT_I(sbi);

>>>>>>     	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);

>>>>>>     	struct f2fs_journal *journal = curseg->journal;

>>>>>> -	struct seg_entry *se;

>>>>>>     	struct f2fs_sit_entry sit;

>>>>>>     	int sit_blk_cnt = SIT_BLK_CNT(sbi);

>>>>>>     	unsigned int i, start, end;

>>>>>> @@ -3498,67 +3525,34 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)

>>>>>>     			struct f2fs_sit_block *sit_blk;

>>>>>>     			struct page *page;

>>>>>>     

>>>>>> -			se = &sit_i->sentries[start];

>>>>>>     			page = get_current_sit_page(sbi, start);

>>>>>>     			sit_blk = (struct f2fs_sit_block *)page_address(page);

>>>>>>     			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];

>>>>>>     			f2fs_put_page(page, 1);

>>>>>>     

>>>>>>     			check_block_count(sbi, start, &sit);

>>>>>> -			seg_info_from_raw_sit(se, &sit);

>>>>>> -

>>>>>> -			/* build discard map only one time */

>>>>>> -			if (f2fs_discard_en(sbi)) {

>>>>>> -				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>>>> -					memset(se->discard_map, 0xff,

>>>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>>>> -				} else {

>>>>>> -					memcpy(se->discard_map,

>>>>>> -						se->cur_valid_map,

>>>>>> -						SIT_VBLOCK_MAP_SIZE);

>>>>>> -					sbi->discard_blks +=

>>>>>> -						sbi->blocks_per_seg -

>>>>>> -						se->valid_blocks;

>>>>>> -				}

>>>>>> -			}

>>>>>>     

>>>>>> -			if (sbi->segs_per_sec > 1)

>>>>>> -				get_sec_entry(sbi, start)->valid_blocks +=

>>>>>> -							se->valid_blocks;

>>>>>> +			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>>>     		}

>>>>>>     		start_blk += readed;

>>>>>>     	} while (start_blk < sit_blk_cnt);

>>>>>>     

>>>>>>     	down_read(&curseg->journal_rwsem);

>>>>>>     	for (i = 0; i < sits_in_cursum(journal); i++) {

>>>>>> -		unsigned int old_valid_blocks;

>>>>>> -

>>>>>>     		start = le32_to_cpu(segno_in_journal(journal, i));

>>>>>> -		se = &sit_i->sentries[start];

>>>>>>     		sit = sit_in_journal(journal, i);

>>>>>>     

>>>>>> -		old_valid_blocks = se->valid_blocks;

>>>>>> -

>>>>>>     		check_block_count(sbi, start, &sit);

>>>>>> -		seg_info_from_raw_sit(se, &sit);

>>>>>> -

>>>>>> -		if (f2fs_discard_en(sbi)) {

>>>>>> -			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {

>>>>>> -				memset(se->discard_map, 0xff,

>>>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>>>> -			} else {

>>>>>> -				memcpy(se->discard_map, se->cur_valid_map,

>>>>>> -							SIT_VBLOCK_MAP_SIZE);

>>>>>> -				sbi->discard_blks += old_valid_blocks -

>>>>>> -							se->valid_blocks;

>>>>>> -			}

>>>>>> -		}

>>>>>> -

>>>>>> -		if (sbi->segs_per_sec > 1)

>>>>>> -			get_sec_entry(sbi, start)->valid_blocks +=

>>>>>> -				se->valid_blocks - old_valid_blocks;

>>>>>> +		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);

>>>>>>     	}

>>>>>>     	up_read(&curseg->journal_rwsem);

>>>>>> +

>>>>>> +	/* build discard map only one time */

>>>>>> +	if (f2fs_discard_en(sbi))

>>>>>> +		build_discard_entries(sbi);

>>>>>> +

>>>>>> +	if (sbi->segs_per_sec > 1)

>>>>>> +		build_sec_entries(sbi);

>>>>>>     }

>>>>>>     

>>>>>>     static void init_free_segmap(struct f2fs_sb_info *sbi)

>>>>>> -- 

>>>>>> 2.1.4
diff mbox

Patch

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 40e1d20..bcd8a40 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3476,12 +3476,39 @@  static int build_curseg(struct f2fs_sb_info *sbi)
 	return restore_curseg_summaries(sbi);
 }
 
+static void build_discard_entries(struct f2fs_sb_info *sbi)
+{
+	unsigned segno;
+
+	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno) {
+		struct seg_entry *se = get_seg_entry(sbi, segno);
+
+		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))
+			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
+		else {
+			memcpy(se->discard_map, se->cur_valid_map,
+				SIT_VBLOCK_MAP_SIZE);
+
+			sbi->discard_blks += sbi->blocks_per_seg -
+				se->valid_blocks;
+		}
+	}
+}
+
+static void build_sec_entries(struct f2fs_sb_info *sbi)
+{
+	unsigned segno;
+
+	for (segno = 0; segno < MAIN_SEGS(sbi); ++segno)
+		get_sec_entry(sbi, segno)->valid_blocks +=
+			get_seg_entry(sbi, segno)->valid_blocks;
+}
+
 static void build_sit_entries(struct f2fs_sb_info *sbi)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
 	struct f2fs_journal *journal = curseg->journal;
-	struct seg_entry *se;
 	struct f2fs_sit_entry sit;
 	int sit_blk_cnt = SIT_BLK_CNT(sbi);
 	unsigned int i, start, end;
@@ -3498,67 +3525,34 @@  static void build_sit_entries(struct f2fs_sb_info *sbi)
 			struct f2fs_sit_block *sit_blk;
 			struct page *page;
 
-			se = &sit_i->sentries[start];
 			page = get_current_sit_page(sbi, start);
 			sit_blk = (struct f2fs_sit_block *)page_address(page);
 			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
 			f2fs_put_page(page, 1);
 
 			check_block_count(sbi, start, &sit);
-			seg_info_from_raw_sit(se, &sit);
-
-			/* build discard map only one time */
-			if (f2fs_discard_en(sbi)) {
-				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-					memset(se->discard_map, 0xff,
-						SIT_VBLOCK_MAP_SIZE);
-				} else {
-					memcpy(se->discard_map,
-						se->cur_valid_map,
-						SIT_VBLOCK_MAP_SIZE);
-					sbi->discard_blks +=
-						sbi->blocks_per_seg -
-						se->valid_blocks;
-				}
-			}
 
-			if (sbi->segs_per_sec > 1)
-				get_sec_entry(sbi, start)->valid_blocks +=
-							se->valid_blocks;
+			seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
 		}
 		start_blk += readed;
 	} while (start_blk < sit_blk_cnt);
 
 	down_read(&curseg->journal_rwsem);
 	for (i = 0; i < sits_in_cursum(journal); i++) {
-		unsigned int old_valid_blocks;
-
 		start = le32_to_cpu(segno_in_journal(journal, i));
-		se = &sit_i->sentries[start];
 		sit = sit_in_journal(journal, i);
 
-		old_valid_blocks = se->valid_blocks;
-
 		check_block_count(sbi, start, &sit);
-		seg_info_from_raw_sit(se, &sit);
-
-		if (f2fs_discard_en(sbi)) {
-			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-				memset(se->discard_map, 0xff,
-							SIT_VBLOCK_MAP_SIZE);
-			} else {
-				memcpy(se->discard_map, se->cur_valid_map,
-							SIT_VBLOCK_MAP_SIZE);
-				sbi->discard_blks += old_valid_blocks -
-							se->valid_blocks;
-			}
-		}
-
-		if (sbi->segs_per_sec > 1)
-			get_sec_entry(sbi, start)->valid_blocks +=
-				se->valid_blocks - old_valid_blocks;
+		seg_info_from_raw_sit(&sit_i->sentries[start], &sit);
 	}
 	up_read(&curseg->journal_rwsem);
+
+	/* build discard map only one time */
+	if (f2fs_discard_en(sbi))
+		build_discard_entries(sbi);
+
+	if (sbi->segs_per_sec > 1)
+		build_sec_entries(sbi);
 }
 
 static void init_free_segmap(struct f2fs_sb_info *sbi)