diff mbox

[v2] ocfs2: improve recovery performance

Message ID 1466155682-24656-1-git-send-email-junxiao.bi@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Junxiao Bi June 17, 2016, 9:28 a.m. UTC
Journal replay is run when doing recovery for a dead node. To avoid
the impact of stale cache, all blocks of the dead node's journal
inode are reloaded from disk. This hurts performance; checking
whether a block is cached before reloading it improves performance
a lot. In my test environment, the recovery time improved from
120s to 1s.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
---
 fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

Comments

Joseph Qi June 17, 2016, 9:43 a.m. UTC | #1
On 2016/6/17 17:28, Junxiao Bi wrote:
> Journal replay will be run when do recovery for a dead node,
> to avoid the stale cache impact, all blocks of dead node's
> journal inode were reload from disk. This hurts the performance,
> check whether one block is cached before reload it can improve
> a lot performance. In my test env, the time doing recovery was
> improved from 120s to 1s.
> 
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Looks good to me. My testing confirms that it does indeed improve
performance.
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>

> ---
>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>  1 file changed, 22 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index e607419cdfa4..bc0e21e8a674 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
>  	int status = 0;
>  	int i;
>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
> -#define CONCURRENT_JOURNAL_FILL 32ULL
> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
> -
> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
> +	struct buffer_head *bh = NULL;
> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>  
>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>  	v_blkno = 0;
> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode *inode)
>  			goto bail;
>  		}
>  
> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
> -			p_blocks = CONCURRENT_JOURNAL_FILL;
> +		for (i = 0; i < p_blocks; i++) {
> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
> +					osb->sb->s_blocksize);
> +			/* block not cached. */
> +			if (!bh) {
> +				p_blkno++;
> +				continue;
> +			}
>  
> -		/* We are reading journal data which should not
> -		 * be put in the uptodate cache */
> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
> -						p_blkno, p_blocks, bhs);
> -		if (status < 0) {
> -			mlog_errno(status);
> -			goto bail;
> -		}
> +			brelse(bh);
> +			bh = NULL;
> +			/* We are reading journal data which should not
> +			 * be put in the uptodate cache.
> +			 */
> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
> +			if (status < 0) {
> +				mlog_errno(status);
> +				goto bail;
> +			}
>  
> -		for(i = 0; i < p_blocks; i++) {
> -			brelse(bhs[i]);
> -			bhs[i] = NULL;
> +			brelse(bh);
> +			bh = NULL;
>  		}
>  
>  		v_blkno += p_blocks;
>  	}
>  
>  bail:
> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
> -		brelse(bhs[i]);
>  	return status;
>  }
>  
>
Gang He June 20, 2016, 3:10 a.m. UTC | #2
Hello Junxiao,

I think this change will bring a performance improvement, but from the function comments
/*
 * JBD Might read a cached version of another nodes journal file. We
 * don't want this as this file changes often and we get no
 * notification on those changes. The only way to be sure that we've
 * got the most up to date version of those blocks then is to force
 * read them off disk. Just searching through the buffer cache won't
 * work as there may be pages backing this file which are still marked
 * up to date. We know things can't change on this file underneath us
 * as we have the lock by now :)
 */
static int ocfs2_force_read_journal(struct inode *inode)

Did we consider the potential risk behind this patch? I am not familiar with this part of the code,
so I would like to know whether there is any sync mechanism that ensures the block cache for another node's journal file really holds the latest data.



Thanks
Gang 


>>> 
> On 2016/6/17 17:28, Junxiao Bi wrote:
>> Journal replay will be run when do recovery for a dead node,
>> to avoid the stale cache impact, all blocks of dead node's
>> journal inode were reload from disk. This hurts the performance,
>> check whether one block is cached before reload it can improve
>> a lot performance. In my test env, the time doing recovery was
>> improved from 120s to 1s.
>> 
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> Looks good to me. And it indeed has performance improvement from my
> test.
> Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
> 
>> ---
>>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>>  1 file changed, 22 insertions(+), 19 deletions(-)
>> 
>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>> index e607419cdfa4..bc0e21e8a674 100644
>> --- a/fs/ocfs2/journal.c
>> +++ b/fs/ocfs2/journal.c
>> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode 
> *inode)
>>  	int status = 0;
>>  	int i;
>>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
>> -#define CONCURRENT_JOURNAL_FILL 32ULL
>> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
>> -
>> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
>> +	struct buffer_head *bh = NULL;
>> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>  
>>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>>  	v_blkno = 0;
>> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode 
> *inode)
>>  			goto bail;
>>  		}
>>  
>> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
>> -			p_blocks = CONCURRENT_JOURNAL_FILL;
>> +		for (i = 0; i < p_blocks; i++) {
>> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
>> +					osb->sb->s_blocksize);
>> +			/* block not cached. */
>> +			if (!bh) {
>> +				p_blkno++;
>> +				continue;
>> +			}
>>  
>> -		/* We are reading journal data which should not
>> -		 * be put in the uptodate cache */
>> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
>> -						p_blkno, p_blocks, bhs);
>> -		if (status < 0) {
>> -			mlog_errno(status);
>> -			goto bail;
>> -		}
>> +			brelse(bh);
>> +			bh = NULL;
>> +			/* We are reading journal data which should not
>> +			 * be put in the uptodate cache.
>> +			 */
>> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
>> +			if (status < 0) {
>> +				mlog_errno(status);
>> +				goto bail;
>> +			}
>>  
>> -		for(i = 0; i < p_blocks; i++) {
>> -			brelse(bhs[i]);
>> -			bhs[i] = NULL;
>> +			brelse(bh);
>> +			bh = NULL;
>>  		}
>>  
>>  		v_blkno += p_blocks;
>>  	}
>>  
>>  bail:
>> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
>> -		brelse(bhs[i]);
>>  	return status;
>>  }
>>  
>> 
> 
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com 
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Junxiao Bi June 21, 2016, 9:03 a.m. UTC | #3
Hi Gang,

On 06/20/2016 11:10 AM, Gang He wrote:
> Hello Junxiao,
> 
> I think this change will bring a performance improvement, but from the function comments
> /*
>  * JBD Might read a cached version of another nodes journal file. We
>  * don't want this as this file changes often and we get no
>  * notification on those changes. The only way to be sure that we've
>  * got the most up to date version of those blocks then is to force
>  * read them off disk. Just searching through the buffer cache won't
>  * work as there may be pages backing this file which are still marked
>  * up to date. We know things can't change on this file underneath us
>  * as we have the lock by now :)
>  */
> static int ocfs2_force_read_journal(struct inode *inode)
> 
> Did we consider this potential risk behind this patch? I am not familiar with this part code, 
> I want to know if there is any sync mechanism to make sure the block cache for another node journal file is really the latest data?  
I don't think that is needed, because the stale data will not be used
by anything except journal replay.

Thanks,
Junxiao.
> 
> 
> 
> Thanks
> Gang 
> 
> 
>>>>
>> On 2016/6/17 17:28, Junxiao Bi wrote:
>>> Journal replay will be run when do recovery for a dead node,
>>> to avoid the stale cache impact, all blocks of dead node's
>>> journal inode were reload from disk. This hurts the performance,
>>> check whether one block is cached before reload it can improve
>>> a lot performance. In my test env, the time doing recovery was
>>> improved from 120s to 1s.
>>>
>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>> Looks good to me. And it indeed has performance improvement from my
>> test.
>> Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
>>
>>> ---
>>>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>>>  1 file changed, 22 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>>> index e607419cdfa4..bc0e21e8a674 100644
>>> --- a/fs/ocfs2/journal.c
>>> +++ b/fs/ocfs2/journal.c
>>> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode 
>> *inode)
>>>  	int status = 0;
>>>  	int i;
>>>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
>>> -#define CONCURRENT_JOURNAL_FILL 32ULL
>>> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
>>> -
>>> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
>>> +	struct buffer_head *bh = NULL;
>>> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>>  
>>>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>>>  	v_blkno = 0;
>>> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode 
>> *inode)
>>>  			goto bail;
>>>  		}
>>>  
>>> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
>>> -			p_blocks = CONCURRENT_JOURNAL_FILL;
>>> +		for (i = 0; i < p_blocks; i++) {
>>> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
>>> +					osb->sb->s_blocksize);
>>> +			/* block not cached. */
>>> +			if (!bh) {
>>> +				p_blkno++;
>>> +				continue;
>>> +			}
>>>  
>>> -		/* We are reading journal data which should not
>>> -		 * be put in the uptodate cache */
>>> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
>>> -						p_blkno, p_blocks, bhs);
>>> -		if (status < 0) {
>>> -			mlog_errno(status);
>>> -			goto bail;
>>> -		}
>>> +			brelse(bh);
>>> +			bh = NULL;
>>> +			/* We are reading journal data which should not
>>> +			 * be put in the uptodate cache.
>>> +			 */
>>> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
>>> +			if (status < 0) {
>>> +				mlog_errno(status);
>>> +				goto bail;
>>> +			}
>>>  
>>> -		for(i = 0; i < p_blocks; i++) {
>>> -			brelse(bhs[i]);
>>> -			bhs[i] = NULL;
>>> +			brelse(bh);
>>> +			bh = NULL;
>>>  		}
>>>  
>>>  		v_blkno += p_blocks;
>>>  	}
>>>  
>>>  bail:
>>> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
>>> -		brelse(bhs[i]);
>>>  	return status;
>>>  }
>>>  
>>>
>>
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel@oss.oracle.com 
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Junxiao Bi June 23, 2016, 1:17 a.m. UTC | #4
Hi Andrew,

Did you miss this patch to your tree?

Thanks,
Junxiao.

On 06/17/2016 05:43 PM, Joseph Qi wrote:
> On 2016/6/17 17:28, Junxiao Bi wrote:
>> Journal replay will be run when do recovery for a dead node,
>> to avoid the stale cache impact, all blocks of dead node's
>> journal inode were reload from disk. This hurts the performance,
>> check whether one block is cached before reload it can improve
>> a lot performance. In my test env, the time doing recovery was
>> improved from 120s to 1s.
>>
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> Looks good to me. And it indeed has performance improvement from my
> test.
> Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
> 
>> ---
>>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>>  1 file changed, 22 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>> index e607419cdfa4..bc0e21e8a674 100644
>> --- a/fs/ocfs2/journal.c
>> +++ b/fs/ocfs2/journal.c
>> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
>>  	int status = 0;
>>  	int i;
>>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
>> -#define CONCURRENT_JOURNAL_FILL 32ULL
>> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
>> -
>> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
>> +	struct buffer_head *bh = NULL;
>> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>  
>>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>>  	v_blkno = 0;
>> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode *inode)
>>  			goto bail;
>>  		}
>>  
>> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
>> -			p_blocks = CONCURRENT_JOURNAL_FILL;
>> +		for (i = 0; i < p_blocks; i++) {
>> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
>> +					osb->sb->s_blocksize);
>> +			/* block not cached. */
>> +			if (!bh) {
>> +				p_blkno++;
>> +				continue;
>> +			}
>>  
>> -		/* We are reading journal data which should not
>> -		 * be put in the uptodate cache */
>> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
>> -						p_blkno, p_blocks, bhs);
>> -		if (status < 0) {
>> -			mlog_errno(status);
>> -			goto bail;
>> -		}
>> +			brelse(bh);
>> +			bh = NULL;
>> +			/* We are reading journal data which should not
>> +			 * be put in the uptodate cache.
>> +			 */
>> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
>> +			if (status < 0) {
>> +				mlog_errno(status);
>> +				goto bail;
>> +			}
>>  
>> -		for(i = 0; i < p_blocks; i++) {
>> -			brelse(bhs[i]);
>> -			bhs[i] = NULL;
>> +			brelse(bh);
>> +			bh = NULL;
>>  		}
>>  
>>  		v_blkno += p_blocks;
>>  	}
>>  
>>  bail:
>> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
>> -		brelse(bhs[i]);
>>  	return status;
>>  }
>>  
>>
> 
>
Andrew Morton June 23, 2016, 10:13 p.m. UTC | #5
On Thu, 23 Jun 2016 09:17:53 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote:

> Hi Andrew,
> 
> Did you miss this patch to your tree?

I would have seen it eventually.  Explicitly cc'ing me on patches
helps, please.
Junxiao Bi June 24, 2016, 12:46 a.m. UTC | #6
On 06/24/2016 06:13 AM, Andrew Morton wrote:
> On Thu, 23 Jun 2016 09:17:53 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote:
> 
>> Hi Andrew,
>>
>> Did you miss this patch to your tree?
> 
> I would have seen it eventually.  Explicitly cc'ing me on patches
> helps, please.
I see, will cc you next time.

Thanks,
Junxiao.
>
Xue jiufei July 7, 2016, 2:05 a.m. UTC | #7
Hi Junxiao,
p_blkno is not incremented after the forced read from disk, so
the same block is read from disk many times, while the other blocks
that remain cached are not reloaded.

Thanks,
Jiufei

On 2016/6/17 17:28, Junxiao Bi wrote:
> Journal replay will be run when do recovery for a dead node,
> to avoid the stale cache impact, all blocks of dead node's
> journal inode were reload from disk. This hurts the performance,
> check whether one block is cached before reload it can improve
> a lot performance. In my test env, the time doing recovery was
> improved from 120s to 1s.
> 
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>  1 file changed, 22 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index e607419cdfa4..bc0e21e8a674 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
>  	int status = 0;
>  	int i;
>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
> -#define CONCURRENT_JOURNAL_FILL 32ULL
> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
> -
> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
> +	struct buffer_head *bh = NULL;
> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>  
>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>  	v_blkno = 0;
> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode *inode)
>  			goto bail;
>  		}
>  
> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
> -			p_blocks = CONCURRENT_JOURNAL_FILL;
> +		for (i = 0; i < p_blocks; i++) {
> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
> +					osb->sb->s_blocksize);
> +			/* block not cached. */
> +			if (!bh) {
> +				p_blkno++;
> +				continue;
> +			}
>  
> -		/* We are reading journal data which should not
> -		 * be put in the uptodate cache */
> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
> -						p_blkno, p_blocks, bhs);
> -		if (status < 0) {
> -			mlog_errno(status);
> -			goto bail;
> -		}
> +			brelse(bh);
> +			bh = NULL;
> +			/* We are reading journal data which should not
> +			 * be put in the uptodate cache.
> +			 */
> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
> +			if (status < 0) {
> +				mlog_errno(status);
> +				goto bail;
> +			}
>  
> -		for(i = 0; i < p_blocks; i++) {
> -			brelse(bhs[i]);
> -			bhs[i] = NULL;
> +			brelse(bh);
> +			bh = NULL;
>  		}
>  
>  		v_blkno += p_blocks;
>  	}
>  
>  bail:
> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
> -		brelse(bhs[i]);
>  	return status;
>  }
>  
>
Junxiao Bi July 7, 2016, 2:16 a.m. UTC | #8
On 07/07/2016 10:05 AM, xuejiufei wrote:
> Hi Junxiao,
> p_blkno is not increased after force reading from disk, so
> this block is read many times from disk while other blocks
> remain in cached are not reloaded.
Good catch. Will send a v2 version.

Thanks,
Junxiao.
> 
> Thanks,
> Jiufei
> 
> On 2016/6/17 17:28, Junxiao Bi wrote:
>> Journal replay will be run when do recovery for a dead node,
>> to avoid the stale cache impact, all blocks of dead node's
>> journal inode were reload from disk. This hurts the performance,
>> check whether one block is cached before reload it can improve
>> a lot performance. In my test env, the time doing recovery was
>> improved from 120s to 1s.
>>
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>> ---
>>  fs/ocfs2/journal.c |   41 ++++++++++++++++++++++-------------------
>>  1 file changed, 22 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
>> index e607419cdfa4..bc0e21e8a674 100644
>> --- a/fs/ocfs2/journal.c
>> +++ b/fs/ocfs2/journal.c
>> @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
>>  	int status = 0;
>>  	int i;
>>  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
>> -#define CONCURRENT_JOURNAL_FILL 32ULL
>> -	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
>> -
>> -	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
>> +	struct buffer_head *bh = NULL;
>> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>  
>>  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>>  	v_blkno = 0;
>> @@ -1174,29 +1172,34 @@ static int ocfs2_force_read_journal(struct inode *inode)
>>  			goto bail;
>>  		}
>>  
>> -		if (p_blocks > CONCURRENT_JOURNAL_FILL)
>> -			p_blocks = CONCURRENT_JOURNAL_FILL;
>> +		for (i = 0; i < p_blocks; i++) {
>> +			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
>> +					osb->sb->s_blocksize);
>> +			/* block not cached. */
>> +			if (!bh) {
>> +				p_blkno++;
>> +				continue;
>> +			}
>>  
>> -		/* We are reading journal data which should not
>> -		 * be put in the uptodate cache */
>> -		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
>> -						p_blkno, p_blocks, bhs);
>> -		if (status < 0) {
>> -			mlog_errno(status);
>> -			goto bail;
>> -		}
>> +			brelse(bh);
>> +			bh = NULL;
>> +			/* We are reading journal data which should not
>> +			 * be put in the uptodate cache.
>> +			 */
>> +			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
>> +			if (status < 0) {
>> +				mlog_errno(status);
>> +				goto bail;
>> +			}
>>  
>> -		for(i = 0; i < p_blocks; i++) {
>> -			brelse(bhs[i]);
>> -			bhs[i] = NULL;
>> +			brelse(bh);
>> +			bh = NULL;
>>  		}
>>  
>>  		v_blkno += p_blocks;
>>  	}
>>  
>>  bail:
>> -	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
>> -		brelse(bhs[i]);
>>  	return status;
>>  }
>>  
>>
>
diff mbox

Patch

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e607419cdfa4..bc0e21e8a674 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1159,10 +1159,8 @@  static int ocfs2_force_read_journal(struct inode *inode)
 	int status = 0;
 	int i;
 	u64 v_blkno, p_blkno, p_blocks, num_blocks;
-#define CONCURRENT_JOURNAL_FILL 32ULL
-	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
-
-	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 	v_blkno = 0;
@@ -1174,29 +1172,34 @@  static int ocfs2_force_read_journal(struct inode *inode)
 			goto bail;
 		}
 
-		if (p_blocks > CONCURRENT_JOURNAL_FILL)
-			p_blocks = CONCURRENT_JOURNAL_FILL;
+		for (i = 0; i < p_blocks; i++) {
+			bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+					osb->sb->s_blocksize);
+			/* block not cached. */
+			if (!bh) {
+				p_blkno++;
+				continue;
+			}
 
-		/* We are reading journal data which should not
-		 * be put in the uptodate cache */
-		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
-						p_blkno, p_blocks, bhs);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+			brelse(bh);
+			bh = NULL;
+			/* We are reading journal data which should not
+			 * be put in the uptodate cache.
+			 */
+			status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
 
-		for(i = 0; i < p_blocks; i++) {
-			brelse(bhs[i]);
-			bhs[i] = NULL;
+			brelse(bh);
+			bh = NULL;
 		}
 
 		v_blkno += p_blocks;
 	}
 
 bail:
-	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
-		brelse(bhs[i]);
 	return status;
 }