ocfs2: submit another bio if current bio is full
diff mbox

Message ID 1523598667-29401-1-git-send-email-ge.changwei@h3c.com
State New
Headers show

Commit Message

Changwei Ge April 13, 2018, 5:51 a.m. UTC
If the cluster scale exceeds 16 nodes, the bio will be full and bio_add_page()
returns 0 when adding further pages to it. Returning -EIO to o2hb_read_slots()
from o2hb_setup_one_bio() then loses the chance to allocate more
bios to cover the whole heartbeat region.

So o2hb_read_slots() fails.

In my test, making the fs fails when starting the o2cb service.

Attach error log:
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
(mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
(mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
(mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
(mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5

Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio")

Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
---
 fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

Comments

piaojun April 16, 2018, 3:44 a.m. UTC | #1
Hi Changwei,

Do you mean that if the slotnum exceeds 16, like 'mkfs.ocfs2 -N 17', you
still let it go rather than return an error?

thanks,
Jun

On 2018/4/13 13:51, Changwei Ge wrote:
> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
> from o2hb_setup_one_bio() will lead to losing chance to allocate more
> bios to present all heartbeat region.
> 
> So o2hb_read_slots() fails.
> 
> In my test, making fs fails in starting o2cb service.
> 
> Attach error log:
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
> 
> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
> 
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> ---
>  fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
> index 91a8889abf9b..2809e29d612d 100644
> --- a/fs/ocfs2/cluster/heartbeat.c
> +++ b/fs/ocfs2/cluster/heartbeat.c
> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>  	struct bio *bio;
>  	struct page *page;
>  
> +#define O2HB_BIO_VECS 16
>  	/* Testing has shown this allocation to take long enough under
>  	 * GFP_KERNEL that the local node can get fenced. It would be
>  	 * nicest if we could pre-allocate these bios and avoid this
>  	 * all together. */
> -	bio = bio_alloc(GFP_ATOMIC, 16);
> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>  	if (!bio) {
>  		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>  		bio = ERR_PTR(-ENOMEM);
> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>  		     current_page, vec_len, vec_start);
>  
>  		len = bio_add_page(bio, page, vec_len, vec_start);
> -		if (len != vec_len) {
> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
> +			/* bio is full now. */
> +			goto bail;
> +		} else if (len != vec_len) {
>  			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>  			     "page %p, len %d, vec_len %u, vec_start %u, "
>  			     "bi_sector %llu\n", current_page, page, len,
>
Changwei Ge May 8, 2018, 3:57 p.m. UTC | #2
Hi Jun,

Sorry for the late reply; I have been very busy these days.


On 04/16/2018 11:44 AM, piaojun wrote:
> Hi Changwei,
>
> Do you mean that if the slotnum exceed 16 like 'mkfs.ocfs2 -N 17', you
> still let it go rather than reture error?

If your assumption is right, do you mean that ocfs2 slots can't exceed 16?

If you return error once slots exceed 16, mkfs will never succeed.

So if we can ensure that the bio is full in the current iteration, we should
move on to the next iteration, allocate a new bio, add the remaining pages
to it and continue.

And your patch makes my ocfs2-test fail.


Thanks,
Changwei

>
> thanks,
> Jun
>
> On 2018/4/13 13:51, Changwei Ge wrote:
>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>> bios to present all heartbeat region.
>>
>> So o2hb_read_slots() fails.
>>
>> In my test, making fs fails in starting o2cb service.
>>
>> Attach error log:
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>
>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>
>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>> ---
>>   fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>> index 91a8889abf9b..2809e29d612d 100644
>> --- a/fs/ocfs2/cluster/heartbeat.c
>> +++ b/fs/ocfs2/cluster/heartbeat.c
>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>   	struct bio *bio;
>>   	struct page *page;
>>   
>> +#define O2HB_BIO_VECS 16
>>   	/* Testing has shown this allocation to take long enough under
>>   	 * GFP_KERNEL that the local node can get fenced. It would be
>>   	 * nicest if we could pre-allocate these bios and avoid this
>>   	 * all together. */
>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>   	if (!bio) {
>>   		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>   		bio = ERR_PTR(-ENOMEM);
>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>   		     current_page, vec_len, vec_start);
>>   
>>   		len = bio_add_page(bio, page, vec_len, vec_start);
>> -		if (len != vec_len) {
>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>> +			/* bio is full now. */
>> +			goto bail;
>> +		} else if (len != vec_len) {
>>   			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>   			     "page %p, len %d, vec_len %u, vec_start %u, "
>>   			     "bi_sector %llu\n", current_page, page, len,
>>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
piaojun May 9, 2018, 8:50 a.m. UTC | #3
Hi Changwei,

On 2018/5/8 23:57, Changwei Ge wrote:
> Hi Jun,
> 
> Sorry for this so late reply since I was very busy those days.
> 
> 
> On 04/16/2018 11:44 AM, piaojun wrote:
>> Hi Changwei,
>>
>> Do you mean that if the slotnum exceed 16 like 'mkfs.ocfs2 -N 17', you
>> still let it go rather than reture error?
> 
> If your assumption is right, do you mean that ocfs2 slots can't exceed 16?
> 
> If you return error once slots exceed 16, mkfs will never succeed.
> 
> So if we can ensure that bio is full in current iteration, we should run 
> into next iteration and allocate a
> new bio adding pages and continue.
> 
> And your patch makes my ocfs2-test fail.
> 
> 
> Thanks,
> Changwei
> 
>>
>> thanks,
>> Jun
>>
>> On 2018/4/13 13:51, Changwei Ge wrote:
>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()

Sorry for misunderstanding your fix. Do you mean that the node count is so
large that it cannot be covered by 16 pages, such as 129?

"one page could cover 8 node's slots"

thanks,
Jun

>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>> bios to present all heartbeat region.
>>>
>>> So o2hb_read_slots() fails.
>>>
>>> In my test, making fs fails in starting o2cb service.
>>>
>>> Attach error log:
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>
>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>
>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>> ---
>>>   fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>> index 91a8889abf9b..2809e29d612d 100644
>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>   	struct bio *bio;
>>>   	struct page *page;
>>>   
>>> +#define O2HB_BIO_VECS 16
>>>   	/* Testing has shown this allocation to take long enough under
>>>   	 * GFP_KERNEL that the local node can get fenced. It would be
>>>   	 * nicest if we could pre-allocate these bios and avoid this
>>>   	 * all together. */
>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>   	if (!bio) {
>>>   		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>   		bio = ERR_PTR(-ENOMEM);
>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>   		     current_page, vec_len, vec_start);
>>>   
>>>   		len = bio_add_page(bio, page, vec_len, vec_start);
>>> -		if (len != vec_len) {
>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>> +			/* bio is full now. */
>>> +			goto bail;
>>> +		} else if (len != vec_len) {
>>>   			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>   			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>   			     "bi_sector %llu\n", current_page, page, len,
>>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel@oss.oracle.com
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Changwei Ge May 9, 2018, 9:06 a.m. UTC | #4
Hi Jun,


On 2018/5/9 16:50, piaojun wrote:
> Hi Changwei,
>
> On 2018/5/8 23:57, Changwei Ge wrote:
>> Hi Jun,
>>
>> Sorry for this so late reply since I was very busy those days.
>>
>>
>> On 04/16/2018 11:44 AM, piaojun wrote:
>>> Hi Changwei,
>>>
>>> Do you mean that if the slotnum exceed 16 like 'mkfs.ocfs2 -N 17', you
>>> still let it go rather than reture error?
>> If your assumption is right, do you mean that ocfs2 slots can't exceed 16?
>>
>> If you return error once slots exceed 16, mkfs will never succeed.
>>
>> So if we can ensure that bio is full in current iteration, we should run
>> into next iteration and allocate a
>> new bio adding pages and continue.
>>
>> And your patch makes my ocfs2-test fail.
>>
>>
>> Thanks,
>> Changwei
>>
>>> thanks,
>>> Jun
>>>
>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
> Sorry for misunderstanding your fix, and do you mean that the node num is
> a little big which could not be covered by 16 pages, such as 129?
>
> "one page could cover 8 node's slots"
It has nothing to do with the capacity of page holding slots.
It's about how many vecs a bio can have.

For your reference, bio_alloc() has set the maximum vec to 16 in 
o2hb_setup_one_bio() as precondition.

Thanks,
Changwei

>
> thanks,
> Jun
>
>>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>> bios to present all heartbeat region.
>>>>
>>>> So o2hb_read_slots() fails.
>>>>
>>>> In my test, making fs fails in starting o2cb service.
>>>>
>>>> Attach error log:
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>
>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>>
>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>> ---
>>>>    fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>    1 file changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>>> index 91a8889abf9b..2809e29d612d 100644
>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>    	struct bio *bio;
>>>>    	struct page *page;
>>>>    
>>>> +#define O2HB_BIO_VECS 16
>>>>    	/* Testing has shown this allocation to take long enough under
>>>>    	 * GFP_KERNEL that the local node can get fenced. It would be
>>>>    	 * nicest if we could pre-allocate these bios and avoid this
>>>>    	 * all together. */
>>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>    	if (!bio) {
>>>>    		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>    		bio = ERR_PTR(-ENOMEM);
>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>    		     current_page, vec_len, vec_start);
>>>>    
>>>>    		len = bio_add_page(bio, page, vec_len, vec_start);
>>>> -		if (len != vec_len) {
>>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>> +			/* bio is full now. */
>>>> +			goto bail;
>>>> +		} else if (len != vec_len) {
>>>>    			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>    			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>>    			     "bi_sector %llu\n", current_page, page, len,
>>>>
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel@oss.oracle.com
>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
piaojun May 9, 2018, 9:13 a.m. UTC | #5
Hi Changwei,

I understand your fix already, but I'm still confused by the comments
"If cluster scale exceeds 16 nodes, ...".
Do you mean that this problem will happen if the node count exceeds 16?

thanks,
Jun

On 2018/5/9 17:06, Changwei Ge wrote:
> Hi Jun,
> 
> 
> On 2018/5/9 16:50, piaojun wrote:
>> Hi Changwei,
>>
>> On 2018/5/8 23:57, Changwei Ge wrote:
>>> Hi Jun,
>>>
>>> Sorry for this so late reply since I was very busy those days.
>>>
>>>
>>> On 04/16/2018 11:44 AM, piaojun wrote:
>>>> Hi Changwei,
>>>>
>>>> Do you mean that if the slotnum exceed 16 like 'mkfs.ocfs2 -N 17', you
>>>> still let it go rather than reture error?
>>> If your assumption is right, do you mean that ocfs2 slots can't exceed 16?
>>>
>>> If you return error once slots exceed 16, mkfs will never succeed.
>>>
>>> So if we can ensure that bio is full in current iteration, we should run
>>> into next iteration and allocate a
>>> new bio adding pages and continue.
>>>
>>> And your patch makes my ocfs2-test fail.
>>>
>>>
>>> Thanks,
>>> Changwei
>>>
>>>> thanks,
>>>> Jun
>>>>
>>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
>> Sorry for misunderstanding your fix, and do you mean that the node num is
>> a little big which could not be covered by 16 pages, such as 129?
>>
>> "one page could cover 8 node's slots"
> It has nothing to do with the capacity of page holding slots.
> It's about how many vecs a bio can have.
> 
> For your reference, bio_alloc() has set the maximum vec to 16 in 
> o2hb_setup_one_bio() as precondition.
> 
> Thanks,
> Changwei
> 
>>
>> thanks,
>> Jun
>>
>>>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>>> bios to present all heartbeat region.
>>>>>
>>>>> So o2hb_read_slots() fails.
>>>>>
>>>>> In my test, making fs fails in starting o2cb service.
>>>>>
>>>>> Attach error log:
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>>
>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>>>
>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>> ---
>>>>>    fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>>    1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>>>> index 91a8889abf9b..2809e29d612d 100644
>>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>>    	struct bio *bio;
>>>>>    	struct page *page;
>>>>>    
>>>>> +#define O2HB_BIO_VECS 16
>>>>>    	/* Testing has shown this allocation to take long enough under
>>>>>    	 * GFP_KERNEL that the local node can get fenced. It would be
>>>>>    	 * nicest if we could pre-allocate these bios and avoid this
>>>>>    	 * all together. */
>>>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>>    	if (!bio) {
>>>>>    		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>>    		bio = ERR_PTR(-ENOMEM);
>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>>    		     current_page, vec_len, vec_start);
>>>>>    
>>>>>    		len = bio_add_page(bio, page, vec_len, vec_start);
>>>>> -		if (len != vec_len) {
>>>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>>> +			/* bio is full now. */
>>>>> +			goto bail;
>>>>> +		} else if (len != vec_len) {
>>>>>    			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>>    			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>>>    			     "bi_sector %llu\n", current_page, page, len,
>>>>>
>>>> _______________________________________________
>>>> Ocfs2-devel mailing list
>>>> Ocfs2-devel@oss.oracle.com
>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Changwei Ge May 9, 2018, 9:20 a.m. UTC | #6
Hi Jun,

On 2018/5/9 17:13, piaojun wrote:
> Hi Changwei,
>
> I understand your fix already, but I'm still confused by the comments
> "If cluster scale exceeds 16 nodes, ...".
> Do you mean that this problem will happen if nodes' count exceeds 16.

True. Or we can say that if slot number exceeds 16.

Thanks,
Changwei

>
> thanks,
> Jun
>
> On 2018/5/9 17:06, Changwei Ge wrote:
>> Hi Jun,
>>
>>
>> On 2018/5/9 16:50, piaojun wrote:
>>> Hi Changwei,
>>>
>>> On 2018/5/8 23:57, Changwei Ge wrote:
>>>> Hi Jun,
>>>>
>>>> Sorry for this so late reply since I was very busy those days.
>>>>
>>>>
>>>> On 04/16/2018 11:44 AM, piaojun wrote:
>>>>> Hi Changwei,
>>>>>
>>>>> Do you mean that if the slotnum exceed 16 like 'mkfs.ocfs2 -N 17', you
>>>>> still let it go rather than reture error?
>>>> If your assumption is right, do you mean that ocfs2 slots can't exceed 16?
>>>>
>>>> If you return error once slots exceed 16, mkfs will never succeed.
>>>>
>>>> So if we can ensure that bio is full in current iteration, we should run
>>>> into next iteration and allocate a
>>>> new bio adding pages and continue.
>>>>
>>>> And your patch makes my ocfs2-test fail.
>>>>
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>> thanks,
>>>>> Jun
>>>>>
>>>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
>>> Sorry for misunderstanding your fix, and do you mean that the node num is
>>> a little big which could not be covered by 16 pages, such as 129?
>>>
>>> "one page could cover 8 node's slots"
>> It has nothing to do with the capacity of page holding slots.
>> It's about how many vecs a bio can have.
>>
>> For your reference, bio_alloc() has set the maximum vec to 16 in
>> o2hb_setup_one_bio() as precondition.
>>
>> Thanks,
>> Changwei
>>
>>> thanks,
>>> Jun
>>>
>>>>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>>>> bios to present all heartbeat region.
>>>>>>
>>>>>> So o2hb_read_slots() fails.
>>>>>>
>>>>>> In my test, making fs fails in starting o2cb service.
>>>>>>
>>>>>> Attach error log:
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>>>
>>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>>>>
>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>> ---
>>>>>>     fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>>>     1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>>>>> index 91a8889abf9b..2809e29d612d 100644
>>>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>>>     	struct bio *bio;
>>>>>>     	struct page *page;
>>>>>>     
>>>>>> +#define O2HB_BIO_VECS 16
>>>>>>     	/* Testing has shown this allocation to take long enough under
>>>>>>     	 * GFP_KERNEL that the local node can get fenced. It would be
>>>>>>     	 * nicest if we could pre-allocate these bios and avoid this
>>>>>>     	 * all together. */
>>>>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>>>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>>>     	if (!bio) {
>>>>>>     		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>>>     		bio = ERR_PTR(-ENOMEM);
>>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>>>     		     current_page, vec_len, vec_start);
>>>>>>     
>>>>>>     		len = bio_add_page(bio, page, vec_len, vec_start);
>>>>>> -		if (len != vec_len) {
>>>>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>>>> +			/* bio is full now. */
>>>>>> +			goto bail;
>>>>>> +		} else if (len != vec_len) {
>>>>>>     			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>>>     			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>>>>     			     "bi_sector %llu\n", current_page, page, len,
>>>>>>
>>>>> _______________________________________________
>>>>> Ocfs2-devel mailing list
>>>>> Ocfs2-devel@oss.oracle.com
>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
piaojun May 9, 2018, 10:08 a.m. UTC | #7
Hi Changwei,

On 2018/4/13 13:51, Changwei Ge wrote:
> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
> from o2hb_setup_one_bio() will lead to losing chance to allocate more
> bios to present all heartbeat region.
> 
> So o2hb_read_slots() fails.
> 
> In my test, making fs fails in starting o2cb service.
> 
> Attach error log:
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
> 
> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
> 
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> ---
>  fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
> index 91a8889abf9b..2809e29d612d 100644
> --- a/fs/ocfs2/cluster/heartbeat.c
> +++ b/fs/ocfs2/cluster/heartbeat.c
> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>  	struct bio *bio;
>  	struct page *page;
>  
> +#define O2HB_BIO_VECS 16
>  	/* Testing has shown this allocation to take long enough under
>  	 * GFP_KERNEL that the local node can get fenced. It would be
>  	 * nicest if we could pre-allocate these bios and avoid this
>  	 * all together. */
> -	bio = bio_alloc(GFP_ATOMIC, 16);
> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>  	if (!bio) {
>  		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>  		bio = ERR_PTR(-ENOMEM);
> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>  		     current_page, vec_len, vec_start);
>  
Should we check the validity of 'current_page' before calling bio_add_page()?
That would prevent the error from happening. Everything else looks OK.

thanks,
Jun
>  		len = bio_add_page(bio, page, vec_len, vec_start);
> -		if (len != vec_len) {
> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
> +			/* bio is full now. */
> +			goto bail;
> +		} else if (len != vec_len) {
>  			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>  			     "page %p, len %d, vec_len %u, vec_start %u, "
>  			     "bi_sector %llu\n", current_page, page, len,
>
Changwei Ge May 9, 2018, 12:01 p.m. UTC | #8
Hi Jun,


On 2018/5/9 18:08, piaojun wrote:
> Hi Changwei,

>

> On 2018/4/13 13:51, Changwei Ge wrote:

>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()

>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()

>> from o2hb_setup_one_bio() will lead to losing chance to allocate more

>> bios to present all heartbeat region.

>>

>> So o2hb_read_slots() fails.

>>

>> In my test, making fs fails in starting o2cb service.

>>

>> Attach error log:

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0

>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192

>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5

>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5

>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5

>>

>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"

>>

>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>

>> ---

>>   fs/ocfs2/cluster/heartbeat.c | 8 ++++++--

>>   1 file changed, 6 insertions(+), 2 deletions(-)

>>

>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c

>> index 91a8889abf9b..2809e29d612d 100644

>> --- a/fs/ocfs2/cluster/heartbeat.c

>> +++ b/fs/ocfs2/cluster/heartbeat.c

>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,

>>   	struct bio *bio;

>>   	struct page *page;

>>   

>> +#define O2HB_BIO_VECS 16

>>   	/* Testing has shown this allocation to take long enough under

>>   	 * GFP_KERNEL that the local node can get fenced. It would be

>>   	 * nicest if we could pre-allocate these bios and avoid this

>>   	 * all together. */

>> -	bio = bio_alloc(GFP_ATOMIC, 16);

>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);

>>   	if (!bio) {

>>   		mlog(ML_ERROR, "Could not alloc slots BIO!\n");

>>   		bio = ERR_PTR(-ENOMEM);

>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,

>>   		     current_page, vec_len, vec_start);

>>   

> Should we check the validity of 'current_page' before bio_add_page()? And

> that will prevent error happen. Others looks OK.


If I understand correctly, you mean we should check whether the current page
is NULL?
If so, I think there is no need, since o2hb should guarantee that it has
already reserved enough pages for disk heartbeat reads and writes.

Thanks,
Changwei
>

> thanks,

> Jun

>>   		len = bio_add_page(bio, page, vec_len, vec_start);

>> -		if (len != vec_len) {

>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {

>> +			/* bio is full now. */

>> +			goto bail;

>> +		} else if (len != vec_len) {

>>   			mlog(ML_ERROR, "Adding page[%d] to bio failed, "

>>   			     "page %p, len %d, vec_len %u, vec_start %u, "

>>   			     "bi_sector %llu\n", current_page, page, len,

>>

> _______________________________________________

> Ocfs2-devel mailing list

> Ocfs2-devel@oss.oracle.com

> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
piaojun May 10, 2018, 12:24 a.m. UTC | #9
On 2018/5/9 20:01, Changwei Ge wrote:
> Hi Jun,
> 
> 
> On 2018/5/9 18:08, piaojun wrote:
>> Hi Changwei,
>>
>> On 2018/4/13 13:51, Changwei Ge wrote:
>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>> bios to present all heartbeat region.
>>>
>>> So o2hb_read_slots() fails.
>>>
>>> In my test, making fs fails in starting o2cb service.
>>>
>>> Attach error log:
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>
>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>
>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>> ---
>>>   fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>> index 91a8889abf9b..2809e29d612d 100644
>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>   	struct bio *bio;
>>>   	struct page *page;
>>>   
>>> +#define O2HB_BIO_VECS 16
>>>   	/* Testing has shown this allocation to take long enough under
>>>   	 * GFP_KERNEL that the local node can get fenced. It would be
>>>   	 * nicest if we could pre-allocate these bios and avoid this
>>>   	 * all together. */
>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>   	if (!bio) {
>>>   		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>   		bio = ERR_PTR(-ENOMEM);
>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>   		     current_page, vec_len, vec_start);
>>>   
>> Should we check the validity of 'current_page' before bio_add_page()? And
>> that will prevent error happen. Others looks OK.
> 
> If I understand correctly, you mean we should check current page  is 
> NULL or not?
> If so I think there is no need since o2hb should guarantee that it has 
> already reserved enough pages for disk heartbeat read/write behalf.

I mean we could check whether 'current_page' equals O2HB_BIO_VECS before
bio_add_page() to avoid a NULL pointer dereference.

thanks,
Jun

> 
> Thanks,
> Changwei
>>
>> thanks,
>> Jun
>>>   		len = bio_add_page(bio, page, vec_len, vec_start);
>>> -		if (len != vec_len) {
>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>> +			/* bio is full now. */
>>> +			goto bail;
>>> +		} else if (len != vec_len) {
>>>   			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>   			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>   			     "bi_sector %llu\n", current_page, page, len,
>>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel@oss.oracle.com
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Changwei Ge May 10, 2018, 1:02 a.m. UTC | #10
On 2018/5/10 8:24, piaojun wrote:
>
> On 2018/5/9 20:01, Changwei Ge wrote:
>> Hi Jun,
>>
>>
>> On 2018/5/9 18:08, piaojun wrote:
>>> Hi Changwei,
>>>
>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>> If cluster scale exceeds 16 nodes, bio will be full and bio_add_page()
>>>> returns 0 when adding pages to bio. Returning -EIO to o2hb_read_slots()
>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>> bios to present all heartbeat region.
>>>>
>>>> So o2hb_read_slots() fails.
>>>>
>>>> In my test, making fs fails in starting o2cb service.
>>>>
>>>> Attach error log:
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 4096, vec_start = 0
>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, vec_start 0, bi_sector 8192
>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>
>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio"
>>>>
>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>> ---
>>>>    fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>    1 file changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
>>>> index 91a8889abf9b..2809e29d612d 100644
>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>    	struct bio *bio;
>>>>    	struct page *page;
>>>>    
>>>> +#define O2HB_BIO_VECS 16
>>>>    	/* Testing has shown this allocation to take long enough under
>>>>    	 * GFP_KERNEL that the local node can get fenced. It would be
>>>>    	 * nicest if we could pre-allocate these bios and avoid this
>>>>    	 * all together. */
>>>> -	bio = bio_alloc(GFP_ATOMIC, 16);
>>>> +	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>    	if (!bio) {
>>>>    		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>    		bio = ERR_PTR(-ENOMEM);
>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
>>>>    		     current_page, vec_len, vec_start);
>>>>    
>>> Should we check the validity of 'current_page' before bio_add_page()? And
>>> that will prevent error happen. Others looks OK.
>> If I understand correctly, you mean we should check current page  is
>> NULL or not?
>> If so I think there is no need since o2hb should guarantee that it has
>> already reserved enough pages for disk heartbeat read/write behalf.
> I mean we could check if 'current_page' equals O2HB_BIO_VECS before
> bio_add_page() to avoid NULL pointer referrence.
Yes, that might work.
I have found another problem in this patch.
I will post a v2 patch later that fixes them all and takes your suggestion
into account.

Thanks,
Changwei

>
> thanks,
> Jun
>
>> Thanks,
>> Changwei
>>> thanks,
>>> Jun
>>>>    		len = bio_add_page(bio, page, vec_len, vec_start);
>>>> -		if (len != vec_len) {
>>>> +		if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>> +			/* bio is full now. */
>>>> +			goto bail;
>>>> +		} else if (len != vec_len) {
>>>>    			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>    			     "page %p, len %d, vec_len %u, vec_start %u, "
>>>>    			     "bi_sector %llu\n", current_page, page, len,
>>>>
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel@oss.oracle.com
>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Changwei Ge May 14, 2018, 3:21 a.m. UTC | #11
Hi Jun,

Right now, I am afraid that the easiest and fastest way to fix this issue
is to revert your patch.

From the comment above bio_add_page(), we can see that it only
fails if either ::bi_vcnt == ::bi_max_vecs or the bio is a cloned bio.


So we can tell whether the bio is full by checking whether the return value is zero.


Thanks,

Changwei


On 2018/5/10 9:13, Changwei Ge wrote:
>

>

> On 2018/5/10 8:24, piaojun wrote:

>>

>> On 2018/5/9 20:01, Changwei Ge wrote:

>>> Hi Jun,

>>>

>>>

>>> On 2018/5/9 18:08, piaojun wrote:

>>>> Hi Changwei,

>>>>

>>>> On 2018/4/13 13:51, Changwei Ge wrote:

>>>>> If cluster scale exceeds 16 nodes, bio will be full and 

>>>>> bio_add_page()

>>>>> returns 0 when adding pages to bio. Returning -EIO to 

>>>>> o2hb_read_slots()

>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more

>>>>> bios to present all heartbeat region.

>>>>>

>>>>> So o2hb_read_slots() fails.

>>>>>

>>>>> In my test, making fs fails in starting o2cb service.

>>>>>

>>>>> Attach error log:

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 

>>>>> 4096, vec_start = 0

>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] 

>>>>> to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, 

>>>>> vec_start 0, bi_sector 8192

>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5

>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5

>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5

>>>>>

>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to 

>>>>> avoid getting incorrect bio"

>>>>>

>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>

>>>>> ---

>>>>>    fs/ocfs2/cluster/heartbeat.c | 8 ++++++--

>>>>>    1 file changed, 6 insertions(+), 2 deletions(-)

>>>>>

>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c 

>>>>> b/fs/ocfs2/cluster/heartbeat.c

>>>>> index 91a8889abf9b..2809e29d612d 100644

>>>>> --- a/fs/ocfs2/cluster/heartbeat.c

>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c

>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct 

>>>>> o2hb_region *reg,

>>>>>        struct bio *bio;

>>>>>        struct page *page;

>>>>>    +#define O2HB_BIO_VECS 16

>>>>>        /* Testing has shown this allocation to take long enough under

>>>>>         * GFP_KERNEL that the local node can get fenced. It would be

>>>>>         * nicest if we could pre-allocate these bios and avoid this

>>>>>         * all together. */

>>>>> -    bio = bio_alloc(GFP_ATOMIC, 16);

>>>>> +    bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);

>>>>>        if (!bio) {

>>>>>            mlog(ML_ERROR, "Could not alloc slots BIO!\n");

>>>>>            bio = ERR_PTR(-ENOMEM);

>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct 

>>>>> o2hb_region *reg,

>>>>>                 current_page, vec_len, vec_start);

>>>> Should we check the validity of 'current_page' before 

>>>> bio_add_page()? And

>>>> that will prevent error happen. Others looks OK.

>>> If I understand correctly, you mean we should check current page  is

>>> NULL or not?

>>> If so I think there is no need since o2hb should guarantee that it has

>>> already reserved enough pages for disk heartbeat read/write behalf.

>> I mean we could check if 'current_page' equals O2HB_BIO_VECS before

>> bio_add_page() to avoid NULL pointer referrence.

> Yes, that might work.

> I find another problem within this patch.

> I will post v2 patch later to fix them all with consideration about 

> your suggestion.

>

> Thanks,

> Changwei

>

>>

>> thanks,

>> Jun

>>

>>> Thanks,

>>> Changwei

>>>> thanks,

>>>> Jun

>>>>>            len = bio_add_page(bio, page, vec_len, vec_start);

>>>>> -        if (len != vec_len) {

>>>>> +        if (len == 0 && current_page == O2HB_BIO_VECS) {

>>>>> +            /* bio is full now. */

>>>>> +            goto bail;

>>>>> +        } else if (len != vec_len) {

>>>>>                mlog(ML_ERROR, "Adding page[%d] to bio failed, "

>>>>>                     "page %p, len %d, vec_len %u, vec_start %u, "

>>>>>                     "bi_sector %llu\n", current_page, page, len,

>>>>>

>>>> _______________________________________________

>>>> Ocfs2-devel mailing list

>>>> Ocfs2-devel@oss.oracle.com

>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel

>
piaojun May 14, 2018, 6:26 a.m. UTC | #12
Hi Changwei,

I get your point: we should let the caller retry if one bio is not enough,
right? But some callers, like o2hb_issue_node_write(), won't retry once an
error happens, although there a single bio will always be enough. Could we
calculate the number of bios we need before calling bio_add_page()?

Thanks
Jun

On 2018/5/14 11:21, Changwei Ge wrote:
> Hi Jun,
> 
> Right now, I am afraid that the easiest and fasted way to fix this issue 
> is to revert your patch.
> 
>  From comments before function bio_add_page(), we can see that it only 
> fails if either ::bi_vcnt == ::bi_max_vecs or it's a cloned bio.
> 
> 
> So we can judge if bio is full from its return value is zero or not.
> 
> 
> Thanks,
> 
> Changwei
> 
> 
> On 2018/5/10 9:13, Changwei Ge wrote:
>>
>>
>> On 2018/5/10 8:24, piaojun wrote:
>>>
>>> On 2018/5/9 20:01, Changwei Ge wrote:
>>>> Hi Jun,
>>>>
>>>>
>>>> On 2018/5/9 18:08, piaojun wrote:
>>>>> Hi Changwei,
>>>>>
>>>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>>>> If cluster scale exceeds 16 nodes, bio will be full and 
>>>>>> bio_add_page()
>>>>>> returns 0 when adding pages to bio. Returning -EIO to 
>>>>>> o2hb_read_slots()
>>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>>>> bios to present all heartbeat region.
>>>>>>
>>>>>> So o2hb_read_slots() fails.
>>>>>>
>>>>>> In my test, making fs fails in starting o2cb service.
>>>>>>
>>>>>> Attach error log:
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len = 
>>>>>> 4096, vec_start = 0
>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16] 
>>>>>> to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096, 
>>>>>> vec_start 0, bi_sector 8192
>>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>>>
>>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to 
>>>>>> avoid getting incorrect bio"
>>>>>>
>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>> ---
>>>>>>    fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>>>    1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c 
>>>>>> b/fs/ocfs2/cluster/heartbeat.c
>>>>>> index 91a8889abf9b..2809e29d612d 100644
>>>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct 
>>>>>> o2hb_region *reg,
>>>>>>        struct bio *bio;
>>>>>>        struct page *page;
>>>>>>    +#define O2HB_BIO_VECS 16
>>>>>>        /* Testing has shown this allocation to take long enough under
>>>>>>         * GFP_KERNEL that the local node can get fenced. It would be
>>>>>>         * nicest if we could pre-allocate these bios and avoid this
>>>>>>         * all together. */
>>>>>> -    bio = bio_alloc(GFP_ATOMIC, 16);
>>>>>> +    bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>>>        if (!bio) {
>>>>>>            mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>>>            bio = ERR_PTR(-ENOMEM);
>>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct 
>>>>>> o2hb_region *reg,
>>>>>>                 current_page, vec_len, vec_start);
>>>>> Should we check the validity of 'current_page' before 
>>>>> bio_add_page()? And
>>>>> that will prevent error happen. Others looks OK.
>>>> If I understand correctly, you mean we should check current page  is
>>>> NULL or not?
>>>> If so I think there is no need since o2hb should guarantee that it has
>>>> already reserved enough pages for disk heartbeat read/write behalf.
>>> I mean we could check if 'current_page' equals O2HB_BIO_VECS before
>>> bio_add_page() to avoid NULL pointer referrence.
>> Yes, that might work.
>> I find another problem within this patch.
>> I will post v2 patch later to fix them all with consideration about 
>> your suggestion.
>>
>> Thanks,
>> Changwei
>>
>>>
>>> thanks,
>>> Jun
>>>
>>>> Thanks,
>>>> Changwei
>>>>> thanks,
>>>>> Jun
>>>>>>            len = bio_add_page(bio, page, vec_len, vec_start);
>>>>>> -        if (len != vec_len) {
>>>>>> +        if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>>>> +            /* bio is full now. */
>>>>>> +            goto bail;
>>>>>> +        } else if (len != vec_len) {
>>>>>>                mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>>>                     "page %p, len %d, vec_len %u, vec_start %u, "
>>>>>>                     "bi_sector %llu\n", current_page, page, len,
>>>>>>
>>>>> _______________________________________________
>>>>> Ocfs2-devel mailing list
>>>>> Ocfs2-devel@oss.oracle.com
>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>
>
Changwei Ge May 14, 2018, 7:05 a.m. UTC | #13
Hi Jun,

IMO, o2hb_issue_node_write() just needs _one_ vec so there is no problem 
in it.

Calculating the number of bios might make the related code more complicated,
which I think is not necessary.

It doesn't solve any existing problem, improve performance, or clean up the
code. :(

Thanks,
Changwei

On 2018/5/14 14:26, piaojun wrote:
> Hi Changwei,
>
> I get your point: we should let the caller retry if one bio is not enough,
> right? But some callers, like o2hb_issue_node_write(), won't retry once an
> error happens, though the bio will always be enough there. Could we
> calculate the number of bios we need before calling bio_add_page()?
>
> Thanks
> Jun
>
> On 2018/5/14 11:21, Changwei Ge wrote:
>> Hi Jun,
>>
>> Right now, I am afraid that the easiest and fastest way to fix this issue
>> is to revert your patch.
>>
>>   From comments before function bio_add_page(), we can see that it only
>> fails if either ::bi_vcnt == ::bi_max_vecs or it's a cloned bio.
>>
>>
>> So we can tell whether the bio is full by checking whether bio_add_page() returns zero.
>>
>>
>> Thanks,
>>
>> Changwei
>>
>>
>> On 2018/5/10 9:13, Changwei Ge wrote:
>>>
>>> On 2018/5/10 8:24, piaojun wrote:
>>>> On 2018/5/9 20:01, Changwei Ge wrote:
>>>>> Hi Jun,
>>>>>
>>>>>
>>>>> On 2018/5/9 18:08, piaojun wrote:
>>>>>> Hi Changwei,
>>>>>>
>>>>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>>>>> If cluster scale exceeds 16 nodes, bio will be full and
>>>>>>> bio_add_page()
>>>>>>> returns 0 when adding pages to bio. Returning -EIO to
>>>>>>> o2hb_read_slots()
>>>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>>>>> bios to present all heartbeat region.
>>>>>>>
>>>>>>> So o2hb_read_slots() fails.
>>>>>>>
>>>>>>> In my test, making fs fails in starting o2cb service.
>>>>>>>
>>>>>>> Attach error log:
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len =
>>>>>>> 4096, vec_start = 0
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16]
>>>>>>> to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096,
>>>>>>> vec_start 0, bi_sector 8192
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>>>>
>>>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to
>>>>>>> avoid getting incorrect bio")
>>>>>>>
>>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>>> ---
>>>>>>>     fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>>>>     1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c
>>>>>>> b/fs/ocfs2/cluster/heartbeat.c
>>>>>>> index 91a8889abf9b..2809e29d612d 100644
>>>>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct
>>>>>>> o2hb_region *reg,
>>>>>>>         struct bio *bio;
>>>>>>>         struct page *page;
>>>>>>>     +#define O2HB_BIO_VECS 16
>>>>>>>         /* Testing has shown this allocation to take long enough under
>>>>>>>          * GFP_KERNEL that the local node can get fenced. It would be
>>>>>>>          * nicest if we could pre-allocate these bios and avoid this
>>>>>>>          * all together. */
>>>>>>> -    bio = bio_alloc(GFP_ATOMIC, 16);
>>>>>>> +    bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>>>>         if (!bio) {
>>>>>>>             mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>>>>             bio = ERR_PTR(-ENOMEM);
>>>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct
>>>>>>> o2hb_region *reg,
>>>>>>>                  current_page, vec_len, vec_start);
>>>>>> Should we check the validity of 'current_page' before
>>>>>> bio_add_page()? That
>>>>>> would prevent the error from happening. The rest looks OK.
>>>>> If I understand correctly, you mean we should check current page  is
>>>>> NULL or not?
>>>>> If so I think there is no need since o2hb should guarantee that it has
>>>>> already reserved enough pages for disk heartbeat read/write behalf.
>>>> I mean we could check if 'current_page' equals O2HB_BIO_VECS before
>>>> bio_add_page() to avoid a NULL pointer dereference.
>>> Yes, that might work.
>>> I find another problem within this patch.
>>> I will post v2 patch later to fix them all with consideration about
>>> your suggestion.
>>>
>>> Thanks,
>>> Changwei
>>>
>>>> thanks,
>>>> Jun
>>>>
>>>>> Thanks,
>>>>> Changwei
>>>>>> thanks,
>>>>>> Jun
>>>>>>>             len = bio_add_page(bio, page, vec_len, vec_start);
>>>>>>> -        if (len != vec_len) {
>>>>>>> +        if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>>>>> +            /* bio is full now. */
>>>>>>> +            goto bail;
>>>>>>> +        } else if (len != vec_len) {
>>>>>>>                 mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>>>>                      "page %p, len %d, vec_len %u, vec_start %u, "
>>>>>>>                      "bi_sector %llu\n", current_page, page, len,
>>>>>>>
>>>>>> _______________________________________________
>>>>>> Ocfs2-devel mailing list
>>>>>> Ocfs2-devel@oss.oracle.com
>>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
piaojun May 15, 2018, 1:06 a.m. UTC | #14
On 2018/5/14 15:05, Changwei Ge wrote:
> Hi Jun,
> 
> IMO, o2hb_issue_node_write() just needs _one_ vec so there is no problem 
> in it.
> 
> Calculating the number of bios might make the related code more complicated,
> which I think is not necessary.
> 
> It doesn't solve any existing problem, improve performance, or clean up the
> code. :(

OK, it sounds reasonable, thanks for your correction.

Thanks,
Jun

> 
> Thanks,
> Changwei
> 
> On 2018/5/14 14:26, piaojun wrote:
>> Hi Changwei,
>>
>> I get your point: we should let the caller retry if one bio is not enough,
>> right? But some callers, like o2hb_issue_node_write(), won't retry once an
>> error happens, though the bio will always be enough there. Could we
>> calculate the number of bios we need before calling bio_add_page()?
>>
>> Thanks
>> Jun
>>
>> On 2018/5/14 11:21, Changwei Ge wrote:
>>> Hi Jun,
>>>
>>> Right now, I am afraid that the easiest and fastest way to fix this issue
>>> is to revert your patch.
>>>
>>>   From comments before function bio_add_page(), we can see that it only
>>> fails if either ::bi_vcnt == ::bi_max_vecs or it's a cloned bio.
>>>
>>>
>>> So we can tell whether the bio is full by checking whether bio_add_page() returns zero.
>>>
>>>
>>> Thanks,
>>>
>>> Changwei
>>>
>>>
>>> On 2018/5/10 9:13, Changwei Ge wrote:
>>>>
>>>> On 2018/5/10 8:24, piaojun wrote:
>>>>> On 2018/5/9 20:01, Changwei Ge wrote:
>>>>>> Hi Jun,
>>>>>>
>>>>>>
>>>>>> On 2018/5/9 18:08, piaojun wrote:
>>>>>>> Hi Changwei,
>>>>>>>
>>>>>>> On 2018/4/13 13:51, Changwei Ge wrote:
>>>>>>>> If cluster scale exceeds 16 nodes, bio will be full and
>>>>>>>> bio_add_page()
>>>>>>>> returns 0 when adding pages to bio. Returning -EIO to
>>>>>>>> o2hb_read_slots()
>>>>>>>> from o2hb_setup_one_bio() will lead to losing chance to allocate more
>>>>>>>> bios to present all heartbeat region.
>>>>>>>>
>>>>>>>> So o2hb_read_slots() fails.
>>>>>>>>
>>>>>>>> In my test, making fs fails in starting o2cb service.
>>>>>>>>
>>>>>>>> Attach error log:
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 0, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 1, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 2, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 3, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 4, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 5, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 6, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 7, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 8, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 9, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 10, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 11, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 12, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 13, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 14, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 15, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:463 page 16, vec_len =
>>>>>>>> 4096, vec_start = 0
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_setup_one_bio:471 ERROR: Adding page[16]
>>>>>>>> to bio failed, page ffffea0002d7ed40, len 0, vec_len 4096,
>>>>>>>> vec_start 0, bi_sector 8192
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_read_slots:500 ERROR: status = -5
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_populate_slot_data:1911 ERROR: status = -5
>>>>>>>> (mkfs.ocfs2,27479,2):o2hb_region_dev_write:2012 ERROR: status = -5
>>>>>>>>
>>>>>>>> Fixes: ba16ddfbeb9d ("ocfs2/o2hb: check len for bio_add_page() to
>>>>>>>> avoid getting incorrect bio")
>>>>>>>>
>>>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>>>> ---
>>>>>>>>     fs/ocfs2/cluster/heartbeat.c | 8 ++++++--
>>>>>>>>     1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/ocfs2/cluster/heartbeat.c
>>>>>>>> b/fs/ocfs2/cluster/heartbeat.c
>>>>>>>> index 91a8889abf9b..2809e29d612d 100644
>>>>>>>> --- a/fs/ocfs2/cluster/heartbeat.c
>>>>>>>> +++ b/fs/ocfs2/cluster/heartbeat.c
>>>>>>>> @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct
>>>>>>>> o2hb_region *reg,
>>>>>>>>         struct bio *bio;
>>>>>>>>         struct page *page;
>>>>>>>>     +#define O2HB_BIO_VECS 16
>>>>>>>>         /* Testing has shown this allocation to take long enough under
>>>>>>>>          * GFP_KERNEL that the local node can get fenced. It would be
>>>>>>>>          * nicest if we could pre-allocate these bios and avoid this
>>>>>>>>          * all together. */
>>>>>>>> -    bio = bio_alloc(GFP_ATOMIC, 16);
>>>>>>>> +    bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
>>>>>>>>         if (!bio) {
>>>>>>>>             mlog(ML_ERROR, "Could not alloc slots BIO!\n");
>>>>>>>>             bio = ERR_PTR(-ENOMEM);
>>>>>>>> @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct
>>>>>>>> o2hb_region *reg,
>>>>>>>>                  current_page, vec_len, vec_start);
>>>>>>> Should we check the validity of 'current_page' before
>>>>>>> bio_add_page()? That
>>>>>>> would prevent the error from happening. The rest looks OK.
>>>>>> If I understand correctly, you mean we should check current page  is
>>>>>> NULL or not?
>>>>>> If so I think there is no need since o2hb should guarantee that it has
>>>>>> already reserved enough pages for disk heartbeat read/write behalf.
>>>>> I mean we could check if 'current_page' equals O2HB_BIO_VECS before
>>>>> bio_add_page() to avoid a NULL pointer dereference.
>>>> Yes, that might work.
>>>> I find another problem within this patch.
>>>> I will post v2 patch later to fix them all with consideration about
>>>> your suggestion.
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>> thanks,
>>>>> Jun
>>>>>
>>>>>> Thanks,
>>>>>> Changwei
>>>>>>> thanks,
>>>>>>> Jun
>>>>>>>>             len = bio_add_page(bio, page, vec_len, vec_start);
>>>>>>>> -        if (len != vec_len) {
>>>>>>>> +        if (len == 0 && current_page == O2HB_BIO_VECS) {
>>>>>>>> +            /* bio is full now. */
>>>>>>>> +            goto bail;
>>>>>>>> +        } else if (len != vec_len) {
>>>>>>>>                 mlog(ML_ERROR, "Adding page[%d] to bio failed, "
>>>>>>>>                      "page %p, len %d, vec_len %u, vec_start %u, "
>>>>>>>>                      "bi_sector %llu\n", current_page, page, len,
>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> Ocfs2-devel mailing list
>>>>>>> Ocfs2-devel@oss.oracle.com
>>>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>

Patch
diff mbox

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 91a8889abf9b..2809e29d612d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -540,11 +540,12 @@  static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	struct bio *bio;
 	struct page *page;
 
+#define O2HB_BIO_VECS 16
 	/* Testing has shown this allocation to take long enough under
 	 * GFP_KERNEL that the local node can get fenced. It would be
 	 * nicest if we could pre-allocate these bios and avoid this
 	 * all together. */
-	bio = bio_alloc(GFP_ATOMIC, 16);
+	bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS);
 	if (!bio) {
 		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
 		bio = ERR_PTR(-ENOMEM);
@@ -570,7 +571,10 @@  static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 		     current_page, vec_len, vec_start);
 
 		len = bio_add_page(bio, page, vec_len, vec_start);
-		if (len != vec_len) {
+		if (len == 0 && current_page == O2HB_BIO_VECS) {
+			/* bio is full now. */
+			goto bail;
+		} else if (len != vec_len) {
 			mlog(ML_ERROR, "Adding page[%d] to bio failed, "
 			     "page %p, len %d, vec_len %u, vec_start %u, "
 			     "bi_sector %llu\n", current_page, page, len,