
block, nvme: Increase max segments parameter setting value

Message ID 20200323182324.3243-1-ikegami.t@gmail.com (mailing list archive)
State Rejected
Series: block, nvme: Increase max segments parameter setting value

Commit Message

Tokunori Ikegami March 23, 2020, 6:23 p.m. UTC
Currently a data length of up to UINT_MAX can be specified, but such requests fail.
This is caused by the max segments parameter being limited to USHRT_MAX.
To resolve this issue, widen the parameter's value range.

Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
Cc: linux-block@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
---
 block/blk-settings.c     | 2 +-
 drivers/nvme/host/core.c | 2 +-
 include/linux/blkdev.h   | 7 ++++---
 3 files changed, 6 insertions(+), 5 deletions(-)
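
For context, the failure mode under discussion is an NVMe admin passthrough
ioctl with a very large transfer buffer. Below is a minimal user-space sketch
of such a call; the device path /dev/nvme0 and the log page ID 0xc0 are
hypothetical, and on the affected kernels the ioctl fails once the buffer
exceeds the queue limits:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nvme_ioctl.h>

    int main(void)
    {
        size_t len = 0x10000000;          /* the failing 256MB length */
        void *buf = malloc(len);
        uint32_t numd = len / 4 - 1;      /* zero-based dword count */
        struct nvme_admin_cmd cmd = { 0 };
        int fd = open("/dev/nvme0", O_RDONLY);  /* hypothetical device */

        if (fd < 0 || !buf)
            return 1;
        cmd.opcode = 0x02;                /* Get Log Page */
        cmd.nsid = 0xffffffff;
        cmd.addr = (uint64_t)(uintptr_t)buf;
        cmd.data_len = len;
        cmd.cdw10 = 0xc0 | ((numd & 0xffff) << 16); /* hypothetical log ID, NUMDL */
        cmd.cdw11 = numd >> 16;           /* NUMDU */

        if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
            perror("NVME_IOCTL_ADMIN_CMD"); /* fails (e.g. EINVAL) when a limit is hit */

        free(buf);
        close(fd);
        return 0;
    }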

Comments

Chaitanya Kulkarni March 23, 2020, 7:14 p.m. UTC | #1
On 03/23/2020 11:24 AM, Tokunori Ikegami wrote:
> Currently a data length of up to UINT_MAX can be specified, but such requests fail.
> This is caused by the max segments parameter being limited to USHRT_MAX.
> To resolve this issue, widen the parameter's value range.
>
> Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
> Cc: linux-block@vger.kernel.org
> Cc: linux-nvme@lists.infradead.org

The change looks okay, but why do we need such a large data length?

Do you have a use-case or performance numbers?
Tokunori Ikegami March 23, 2020, 11:09 p.m. UTC | #2
Hi,
> The change looks okay, but why do we need such a large data length?
>
> Do you have a use-case or performance numbers?

We use the large data length to get a log page via the NVMe admin command.
In the past the same length worked, but now it fails.
So the behavior seems to depend on the kernel version and changed with a
kernel upgrade.
Also I have confirmed that it currently fails with the length 0x10000000
(256MB).

Regards,
Ikegami
Keith Busch March 24, 2020, 12:02 a.m. UTC | #3
On Tue, Mar 24, 2020 at 08:09:19AM +0900, Tokunori Ikegami wrote:
> Hi,
> > The change looks okay, but why do we need such a large data length?
> >
> > Do you have a use-case or performance numbers?
> 
> We use the large data length to get a log page via the NVMe admin command.
> In the past the same length worked, but now it fails.
>
> So the behavior seems to depend on the kernel version and changed with a
> kernel upgrade.

We didn't have 32-bit max segments before, though. Why was 16 bits
enough in older kernels? In which kernel did this stop working?

> Also I have confirmed that it currently fails with the length 0x10000000
> (256MB).

If you're hitting max segment limits before any other limit, you should be
able to do larger transfers with more physically contiguous memory. Huge
pages can get the same data length in fewer segments, if you want to
try that.
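
As an illustration, here is a minimal sketch of allocating such a buffer from
2MB huge pages (this assumes a hugetlb pool has already been reserved, e.g.
via vm.nr_hugepages):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 64UL << 20;  /* 64MB backed by 2MB huge pages */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");  /* fails if no huge pages are reserved */
            return 1;
        }
        /* Each 2MB huge page is physically contiguous, so this buffer can
         * map to ~32 DMA segments instead of up to 16384 4KB-page segments. */
        munmap(buf, len);
        return 0;
    }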

But wouldn't it be better if your application splits the transfer into
smaller chunks across multiple commands? NVMe log page command supports
offsets for this reason.
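
For reference, a minimal sketch of that offset-based splitting via the
passthrough interface (the log page ID, total length, and open device fd are
assumed to come from elsewhere; the helper name is mine, and the LPOL/LPOU
offset fields need a controller that supports log page offsets):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    /* Fetch 'total' bytes of log page 'lid' in 4MB chunks using the
     * log page offset fields (LPOL/LPOU) of Get Log Page. */
    static int get_log_chunked(int fd, unsigned char lid, void *buf,
                               unsigned long long total)
    {
        const unsigned int chunk = 4 * 1024 * 1024;  /* stays under the cap */
        unsigned long long off = 0;

        while (off < total) {
            unsigned int len = total - off < chunk ? total - off : chunk;
            unsigned int numd = len / 4 - 1;  /* zero-based dword count */
            struct nvme_admin_cmd cmd;

            memset(&cmd, 0, sizeof(cmd));
            cmd.opcode = 0x02;                      /* Get Log Page */
            cmd.nsid = 0xffffffff;
            cmd.addr = (uint64_t)(uintptr_t)((char *)buf + off);
            cmd.data_len = len;
            cmd.cdw10 = lid | ((numd & 0xffff) << 16);  /* LID + NUMDL */
            cmd.cdw11 = numd >> 16;                 /* NUMDU */
            cmd.cdw12 = (uint32_t)off;              /* LPOL: offset, low 32 bits */
            cmd.cdw13 = (uint32_t)(off >> 32);      /* LPOU: offset, high 32 bits */
            if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
                return -1;
            off += len;
        }
        return 0;
    }

Each command then stays under both the per-command length and segment limits
while still retrieving the whole log.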
Hannes Reinecke March 24, 2020, 7:16 a.m. UTC | #4
On 3/23/20 7:23 PM, Tokunori Ikegami wrote:
> Currently a data length of up to UINT_MAX can be specified, but such requests fail.
> This is caused by the max segments parameter being limited to USHRT_MAX.
> To resolve this issue, widen the parameter's value range.
> 
> Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
> Cc: linux-block@vger.kernel.org
> Cc: linux-nvme@lists.infradead.org
> ---
>   block/blk-settings.c     | 2 +-
>   drivers/nvme/host/core.c | 2 +-
>   include/linux/blkdev.h   | 7 ++++---
>   3 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index c8eda2e7b91e..ed40bda573c2 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -266,7 +266,7 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
>    *    Enables a low level driver to set an upper limit on the number of
>    *    hw data segments in a request.
>    **/
> -void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
> +void blk_queue_max_segments(struct request_queue *q, unsigned int max_segments)
>   {
>   	if (!max_segments) {
>   		max_segments = 1;
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index a4d8c90ee7cc..2b48aab0969e 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2193,7 +2193,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
>   
>   		max_segments = min_not_zero(max_segments, ctrl->max_segments);
>   		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
> -		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
> +		blk_queue_max_segments(q, min_t(u32, max_segments, UINT_MAX));
>   	}
>   	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
>   	    is_power_of_2(ctrl->max_hw_sectors))
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f629d40c645c..4f4224e20c28 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -338,8 +338,8 @@ struct queue_limits {
>   	unsigned int		max_write_zeroes_sectors;
>   	unsigned int		discard_granularity;
>   	unsigned int		discard_alignment;
> +	unsigned int		max_segments;
>   
> -	unsigned short		max_segments;
>   	unsigned short		max_integrity_segments;
>   	unsigned short		max_discard_segments;
>   
> @@ -1067,7 +1067,8 @@ extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
>   extern void blk_queue_bounce_limit(struct request_queue *, u64);
>   extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
>   extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
> -extern void blk_queue_max_segments(struct request_queue *, unsigned short);
> +extern void blk_queue_max_segments(struct request_queue *q,
> +				   unsigned int max_segments);
>   extern void blk_queue_max_discard_segments(struct request_queue *,
>   		unsigned short);
>   extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
> @@ -1276,7 +1277,7 @@ static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
>   	return q->limits.max_hw_sectors;
>   }
>   
> -static inline unsigned short queue_max_segments(const struct request_queue *q)
> +static inline unsigned int queue_max_segments(const struct request_queue *q)
>   {
>   	return q->limits.max_segments;
>   }
> 
One would assume that the same reasoning goes for max_integrity_segments, no?

Otherwise looks good.

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
Tokunori Ikegami March 24, 2020, 4:51 p.m. UTC | #5
On 2020/03/24 9:02, Keith Busch wrote:
> On Tue, Mar 24, 2020 at 08:09:19AM +0900, Tokunori Ikegami wrote:
>> Hi,
>>> The change looks okay, but why do we need such a large data length?
>>>
>>> Do you have a use-case or performance numbers?
>> We use the large data length to get a log page via the NVMe admin command.
>> In the past the same length worked, but now it fails.
>>
>> So the behavior seems to depend on the kernel version and changed with a
>> kernel upgrade.
> We didn't have 32-bit max segments before, though. Why was 16 bits
> enough in older kernels? In which kernel did this stop working?
Now I am asking the reporter for the details, so let me update later.
The same command script with the large data length worked in the past.
>
>> Also I have confirmed that it currently fails with the length 0x10000000
>> (256MB).
> If you're hitting max segment limits before any other limit, you should be
> able to do larger transfers with more physically contiguous memory. Huge
> pages can get the same data length in fewer segments, if you want to
> try that.
>
> But wouldn't it be better if your application splits the transfer into
> smaller chunks across multiple commands? NVMe log page command supports
> offsets for this reason.

Yes, actually we are now using the offset parameter to split the data.
But for future usage it seems better to also support the large transfer
size directly.
Tokunori Ikegami March 24, 2020, 5:17 p.m. UTC | #6
On 2020/03/24 16:16, Hannes Reinecke wrote:
> On 3/23/20 7:23 PM, Tokunori Ikegami wrote:
>> Currently a data length of up to UINT_MAX can be specified, but such requests fail.
>> This is caused by the max segments parameter being limited to USHRT_MAX.
>> To resolve this issue, widen the parameter's value range.
>>
>> Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
>> Cc: linux-block@vger.kernel.org
>> Cc: linux-nvme@lists.infradead.org
>> ---
>>   block/blk-settings.c     | 2 +-
>>   drivers/nvme/host/core.c | 2 +-
>>   include/linux/blkdev.h   | 7 ++++---
>>   3 files changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>> index c8eda2e7b91e..ed40bda573c2 100644
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -266,7 +266,7 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
>>    *    Enables a low level driver to set an upper limit on the number of
>>    *    hw data segments in a request.
>>    **/
>> -void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
>> +void blk_queue_max_segments(struct request_queue *q, unsigned int max_segments)
>>   {
>>   	if (!max_segments) {
>>   		max_segments = 1;
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index a4d8c90ee7cc..2b48aab0969e 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -2193,7 +2193,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
>>   
>>   		max_segments = min_not_zero(max_segments, ctrl->max_segments);
>>   		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
>> -		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
>> +		blk_queue_max_segments(q, min_t(u32, max_segments, UINT_MAX));
>>   	}
>>   	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
>>   	    is_power_of_2(ctrl->max_hw_sectors))
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index f629d40c645c..4f4224e20c28 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -338,8 +338,8 @@ struct queue_limits {
>>   	unsigned int		max_write_zeroes_sectors;
>>   	unsigned int		discard_granularity;
>>   	unsigned int		discard_alignment;
>> +	unsigned int		max_segments;
>>   
>> -	unsigned short		max_segments;
>>   	unsigned short		max_integrity_segments;
>>   	unsigned short		max_discard_segments;
>>   
>> @@ -1067,7 +1067,8 @@ extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
>>   extern void blk_queue_bounce_limit(struct request_queue *, u64);
>>   extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
>>   extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
>> -extern void blk_queue_max_segments(struct request_queue *, unsigned short);
>> +extern void blk_queue_max_segments(struct request_queue *q,
>> +				   unsigned int max_segments);
>>   extern void blk_queue_max_discard_segments(struct request_queue *,
>>   		unsigned short);
>>   extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
>> @@ -1276,7 +1277,7 @@ static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
>>   	return q->limits.max_hw_sectors;
>>   }
>>   
>> -static inline unsigned short queue_max_segments(const struct request_queue *q)
>> +static inline unsigned int queue_max_segments(const struct request_queue *q)
>>   {
>>   	return q->limits.max_segments;
>>   }
>>
> One would assume that the same reasoning goes for
> max_integrity_segments, no?

The error case itself can be resolved by this change without touching
max_integrity_segments.
Also, that value defaults to 0 and is set to 1 by the nvme driver, so it
seems unnecessary to change it for this case.

>
> Otherwise looks good.
>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
Thanks for your review.
>
> Cheers,
>
> Hannes
Tokunori Ikegami March 27, 2020, 5:50 p.m. UTC | #7
On 2020/03/25 1:51, Tokunori Ikegami wrote:
>
> On 2020/03/24 9:02, Keith Busch wrote:
>> On Tue, Mar 24, 2020 at 08:09:19AM +0900, Tokunori Ikegami wrote:
>>> Hi,
>>>> The change looks okay, but why do we need such a large data length?
>>>>
>>>> Do you have a use-case or performance numbers?
>>> We use the large data length to get a log page via the NVMe admin command.
>>> In the past the same length worked, but now it fails.
>>>
>>> So the behavior seems to depend on the kernel version and changed with a
>>> kernel upgrade.
>> We didn't have 32-bit max segments before, though. Why was 16 bits
>> enough in older kernels? In which kernel did this stop working?
> Now I am asking the reporter for the details, so let me update later.
> The same command script with the large data length worked in the past.

I have just confirmed the details, so let me update below.

The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
(CentOS 64bit).
It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
So the original 20,531,712 length failure issue seems to be already
resolved.

I also tested the data length 0x10000000 (268,435,456), and it fails on
all of the above kernel versions.
The patch fixes only this 0x10000000 length failure issue.

Sorry for the confusion.

Regards,
Ikegami
Keith Busch March 27, 2020, 6:18 p.m. UTC | #8
On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
> On 2020/03/25 1:51, Tokunori Ikegami wrote:
> > On 2020/03/24 9:02, Keith Busch wrote:
> > > We didn't have 32-bit max segments before, though. Why was 16 bits
> > > enough in older kernels? In which kernel did this stop working?
> > Now I am asking the reporter for the details, so let me update later.
> > The same command script with the large data length worked in the past.
> 
> I have just confirmed the details, so let me update below.
> 
> The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
> (CentOS 64bit).
> It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
> succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
> So the original 20,531,712 length failure issue seems to be already
> resolved.
> 
> I also tested the data length 0x10000000 (268,435,456), and it fails on
> all of the above kernel versions.
> The patch fixes only this 0x10000000 length failure issue.

This is actually even more confusing. We do not support 256MB transfers
within a single command in the pci nvme driver anymore. The max is 4MB,
so I don't see how increasing the max segments will help: you should be
hitting the 'max_sectors' limit if you don't hit the segment limit first.
Ming Lei March 28, 2020, 2:11 a.m. UTC | #9
On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
> > On 2020/03/25 1:51, Tokunori Ikegami wrote:
> > > On 2020/03/24 9:02, Keith Busch wrote:
> > > > We didn't have 32-bit max segments before, though. Why was 16 bits
> > > > enough in older kernels? In which kernel did this stop working?
> > > Now I am asking the reporter for the details, so let me update later.
> > > The same command script with the large data length worked in the past.
> >
> > I have just confirmed the details, so let me update below.
> >
> > The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
> > (CentOS 64bit).
> > It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
> > succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
> > So the original 20,531,712 length failure issue seems to be already
> > resolved.
> >
> > I also tested the data length 0x10000000 (268,435,456), and it fails on
> > all of the above kernel versions.
> > The patch fixes only this 0x10000000 length failure issue.
>
> This is actually even more confusing. We do not support 256MB transfers
> within a single command in the pci nvme driver anymore. The max is 4MB,
> so I don't see how increasing the max segments will help: you should be
> hitting the 'max_sectors' limit if you don't hit the segment limit first.

That looks like a bug for passthrough requests, because the 'max_sectors'
limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
something like the following seems required:

diff --git a/block/blk-map.c b/block/blk-map.c
index b0790268ed9d..e120d80b75a5 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -22,6 +22,10 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
        struct bio_vec bv;
        unsigned int nr_segs = 0;

+       if (((rq->__data_len + (*bio)->bi_iter.bi_size) >> 9) >
+                       queue_max_hw_sectors(rq->q))
+               return -EINVAL;
+


Thanks,
Ming Lei
Keith Busch March 28, 2020, 3:13 a.m. UTC | #10
On Sat, Mar 28, 2020 at 10:11:57AM +0800, Ming Lei wrote:
> On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
> >
> > This is actually even more confusing. We do not support 256MB transfers
> > within a single command in the pci nvme driver anymore. The max is 4MB,
> > so I don't see how increasing the max segments will help: you should be
> > hitting the 'max_sectors' limit if you don't hit the segment limit first.
> 
> That looks like a bug for passthrough requests, because the 'max_sectors'
> limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
> something like the following seems required:

Shouldn't bio_map_user_iov() or bio_copy_user_iov() have caught the
length limit before proceeding to blk_rq_append_bio()?
Ming Lei March 28, 2020, 8:28 a.m. UTC | #11
On Sat, Mar 28, 2020 at 11:13 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Sat, Mar 28, 2020 at 10:11:57AM +0800, Ming Lei wrote:
> > On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
> > >
> > > This is actually even more confusing. We do not support 256MB transfers
> > > within a single command in the pci nvme driver anymore. The max is 4MB,
> > > so I don't see how increasing the max segments will help: you should be
> > > hitting the 'max_sectors' limit if you don't hit the segment limit first.
> >
> > That looks like a bug for passthrough requests, because the 'max_sectors'
> > limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
> > something like the following seems required:
>
> Shouldn't bio_map_user_iov() or bio_copy_user_iov() have caught the
> length limit before proceeding to blk_rq_append_bio()?

Actually the limit should be caught earlier, in bio_add_pc_page(); that
requires passing the 'rq' in, so that a fully valid passthrough bio can
be built before it is appended to the request.

Not only the max sectors limit, but also max segments and the
virt_boundary should be checked request-wide.

So far, only lightnvm calls bio_add_pc_page() before allocating the
request, and that is the first bio, so a NULL rq can be passed.

thanks,
Ming Lei
Tokunori Ikegami March 28, 2020, 12:57 p.m. UTC | #12
Hi,

On 2020/03/28 11:11, Ming Lei wrote:
> On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
>> On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
>>> On 2020/03/25 1:51, Tokunori Ikegami wrote:
>>>> On 2020/03/24 9:02, Keith Busch wrote:
>>>>> We didn't have 32-bit max segments before, though. Why was 16 bits
>>>>> enough in older kernels? In which kernel did this stop working?
>>>> Now I am asking the reporter for the details, so let me update later.
>>>> The same command script with the large data length worked in the past.
>>> I have just confirmed the details, so let me update below.
>>>
>>> The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
>>> (CentOS 64bit).
>>> It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
>>> succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
>>> So the original 20,531,712 length failure issue seems to be already
>>> resolved.
>>>
>>> I also tested the data length 0x10000000 (268,435,456), and it fails on
>>> all of the above kernel versions.
>>> The patch fixes only this 0x10000000 length failure issue.
>> This is actually even more confusing. We do not support 256MB transfers
>> within a single command in the pci nvme driver anymore. The max is 4MB,
>> so I don't see how increasing the max segments will help: you should be
>> hitting the 'max_sectors' limit if you don't hit the segment limit first.
> That looks like a bug for passthrough requests, because the 'max_sectors'
> limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
> something like the following seems required:
>
> diff --git a/block/blk-map.c b/block/blk-map.c
> index b0790268ed9d..e120d80b75a5 100644
> --- a/block/blk-map.c
> +++ b/block/blk-map.c
> @@ -22,6 +22,10 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
>          struct bio_vec bv;
>          unsigned int nr_segs = 0;
>
> +       if (((rq->__data_len + (*bio)->bi_iter.bi_size) >> 9) >
> +                       queue_max_hw_sectors(rq->q))
> +               return -EINVAL;
> +

I have just confirmed the max_hw_sectors checking below.
The length is also checked by the function blk_rq_map_kern(), as below.

     if (len > (queue_max_hw_sectors(q) << 9))
         return -EINVAL;

That function calls blk_rq_append_bio(), so with the change above the
length would be checked against max_hw_sectors in both places, but the
two checks use slightly different conditions.

It seems better either to check the limit in blk_rq_map_user() instead
of blk_rq_append_bio(), or to check it in blk_rq_append_bio() only and
drop the check from blk_rq_map_kern().

Regards,
Ikegami
Ming Lei March 29, 2020, 3:01 a.m. UTC | #13
On Sat, Mar 28, 2020 at 8:57 PM Tokunori Ikegami <ikegami.t@gmail.com> wrote:
>
> Hi,
>
> On 2020/03/28 11:11, Ming Lei wrote:
> > On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
> >> On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
> >>> On 2020/03/25 1:51, Tokunori Ikegami wrote:
> >>>> On 2020/03/24 9:02, Keith Busch wrote:
> >>>>> We didn't have 32-bit max segments before, though. Why was 16 bits
> >>>>> enough in older kernels? In which kernel did this stop working?
> >>>> Now I am asking the reporter for the details, so let me update later.
> >>>> The same command script with the large data length worked in the past.
> >>> I have just confirmed the details, so let me update below.
> >>>
> >>> The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
> >>> (CentOS 64bit).
> >>> It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
> >>> succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
> >>> So the original 20,531,712 length failure issue seems to be already
> >>> resolved.
> >>>
> >>> I also tested the data length 0x10000000 (268,435,456), and it fails on
> >>> all of the above kernel versions.
> >>> The patch fixes only this 0x10000000 length failure issue.
> >> This is actually even more confusing. We do not support 256MB transfers
> >> within a single command in the pci nvme driver anymore. The max is 4MB,
> >> so I don't see how increasing the max segments will help: you should be
> >> hitting the 'max_sectors' limit if you don't hit the segment limit first.
> > That looks like a bug for passthrough requests, because the 'max_sectors'
> > limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
> > something like the following seems required:
> >
> > diff --git a/block/blk-map.c b/block/blk-map.c
> > index b0790268ed9d..e120d80b75a5 100644
> > --- a/block/blk-map.c
> > +++ b/block/blk-map.c
> > @@ -22,6 +22,10 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
> >          struct bio_vec bv;
> >          unsigned int nr_segs = 0;
> >
> > +       if (((rq->__data_len + (*bio)->bi_iter.bi_size) >> 9) >
> > +                       queue_max_hw_sectors(rq->q))
> > +               return -EINVAL;
> > +
>
> I have just confirmed the max_hw_sectors checking below.
> The length is also checked by the function blk_rq_map_kern(), as below.
>
>      if (len > (queue_max_hw_sectors(q) << 9))
>          return -EINVAL;

The above check doesn't take rq->__data_len into account.

Thanks,
Ming Lei
Tokunori Ikegami March 30, 2020, 9:15 a.m. UTC | #14
On 2020/03/29 12:01, Ming Lei wrote:
> On Sat, Mar 28, 2020 at 8:57 PM Tokunori Ikegami <ikegami.t@gmail.com> wrote:
>> Hi,
>>
>> On 2020/03/28 11:11, Ming Lei wrote:
>>> On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
>>>> On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
>>>>> On 2020/03/25 1:51, Tokunori Ikegami wrote:
>>>>>> On 2020/03/24 9:02, Keith Busch wrote:
>>>>>>> We didn't have 32-bit max segments before, though. Why was 16 bits
>>>>>>> enough in older kernels? In which kernel did this stop working?
>>>>>> Now I am asking the reporter for the details, so let me update later.
>>>>>> The same command script with the large data length worked in the past.
>>>>> I have just confirmed the details, so let me update below.
>>>>>
>>>>> The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
>>>>> (CentOS 64bit).
>>>>> It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
>>>>> succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
>>>>> So the original 20,531,712 length failure issue seems to be already
>>>>> resolved.
>>>>>
>>>>> I also tested the data length 0x10000000 (268,435,456), and it fails on
>>>>> all of the above kernel versions.
>>>>> The patch fixes only this 0x10000000 length failure issue.
>>>> This is actually even more confusing. We do not support 256MB transfers
>>>> within a single command in the pci nvme driver anymore. The max is 4MB,
>>>> so I don't see how increasing the max segments will help: you should be
>>>> hitting the 'max_sectors' limit if you don't hit the segment limit first.
>>> That looks like a bug for passthrough requests, because the 'max_sectors'
>>> limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
>>> something like the following seems required:
>>>
>>> diff --git a/block/blk-map.c b/block/blk-map.c
>>> index b0790268ed9d..e120d80b75a5 100644
>>> --- a/block/blk-map.c
>>> +++ b/block/blk-map.c
>>> @@ -22,6 +22,10 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
>>>           struct bio_vec bv;
>>>           unsigned int nr_segs = 0;
>>>
>>> +       if (((rq->__data_len + (*bio)->bi_iter.bi_size) >> 9) >
>>> +                       queue_max_hw_sectors(rq->q))
>>> +               return -EINVAL;
>>> +
>> I have just confirmed the max_hw_sectors checking below.
>> The length is also checked by the function blk_rq_map_kern(), as below.
>>
>>       if (len > (queue_max_hw_sectors(q) << 9))
>>           return -EINVAL;
> The above check doesn't take rq->__data_len into account.

Thanks for the comment, noted.
I have just confirmed the behavior on 5.6.0-rc7 as below.
It basically limits the data length to 4MB as expected.
The length can also be limited by the check below in ll_back_merge_fn().

     if (blk_rq_sectors(req) + bio_sectors(bio) >
         blk_rq_get_max_sectors(req, blk_rq_pos(req))) {

But there is a case where the command data length is limited to 512KB.
I am not sure about this condition, so more investigation is needed.

Regards,
Ikegami
Keith Busch March 30, 2020, 1:53 p.m. UTC | #15
On Mon, Mar 30, 2020 at 06:15:41PM +0900, Tokunori Ikegami wrote:
> But there is a case where the command data length is limited to 512KB.
> I am not sure about this condition, so more investigation is needed.

Your memory is too fragmented. You need more physically contiguous
memory or you'll hit the segment limit before you hit the length
limit. Try allocating huge pages.
Kanchan Joshi March 31, 2020, 2:13 p.m. UTC | #16
Hi Ikegami,

Are you actually able to bump up the max segments with the original patch?
I did not notice the patch doing anything about NVME_MAX_SEGS, which
is set to 127 currently.

From pci.c -
/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ  4096
#define NVME_MAX_SEGS   127
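
For reference, in pci.c of this era those constants feed the controller-wide
limits roughly as follows (paraphrased, not a verbatim quote):

    dev->ctrl.max_hw_sectors = min_t(u32, NVME_MAX_KB_SZ << 1,
                                     dma_max_mapping_size(dev->dev) >> 9);
    dev->ctrl.max_segments = NVME_MAX_SEGS;

NVME_MAX_KB_SZ << 1 is 8192 sectors, i.e. the 4MB transfer cap mentioned
earlier, and max_segments is pinned at 127 before the block layer ever
sees it.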

> @@ -2193,7 +2193,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
>
>  		max_segments = min_not_zero(max_segments, ctrl->max_segments);
>  		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
> -		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
> +		blk_queue_max_segments(q, min_t(u32, max_segments, UINT_MAX));
>  	}

Since ctrl->max_segments is set to 127, max_segments will not go beyond 127.

Thanks,

On Mon, Mar 30, 2020 at 2:46 PM Tokunori Ikegami <ikegami.t@gmail.com> wrote:
>
>
> On 2020/03/29 12:01, Ming Lei wrote:
> > On Sat, Mar 28, 2020 at 8:57 PM Tokunori Ikegami <ikegami.t@gmail.com> wrote:
> >> Hi,
> >>
> >> On 2020/03/28 11:11, Ming Lei wrote:
> >>> On Sat, Mar 28, 2020 at 2:18 AM Keith Busch <kbusch@kernel.org> wrote:
> >>>> On Sat, Mar 28, 2020 at 02:50:43AM +0900, Tokunori Ikegami wrote:
> >>>>> On 2020/03/25 1:51, Tokunori Ikegami wrote:
> >>>>>> On 2020/03/24 9:02, Keith Busch wrote:
> >>>>>>> We didn't have 32-bit max segments before, though. Why was 16 bits
> >>>>>>> enough in older kernels? In which kernel did this stop working?
> >>>>>> Now I am asking the reporter for the details, so let me update later.
> >>>>>> The same command script with the large data length worked in the past.
> >>>>> I have just confirmed the details, so let me update below.
> >>>>>
> >>>>> The data length 20,531,712 (0x1394A00) is the one used on kernel 3.10.0
> >>>>> (CentOS 64bit).
> >>>>> It fails on kernel 4.10.0 (Ubuntu 32bit), but I have confirmed that it
> >>>>> succeeds on both 4.15.0 (Ubuntu 32bit) and 4.15.1 (Ubuntu 64bit).
> >>>>> So the original 20,531,712 length failure issue seems to be already
> >>>>> resolved.
> >>>>>
> >>>>> I also tested the data length 0x10000000 (268,435,456), and it fails on
> >>>>> all of the above kernel versions.
> >>>>> The patch fixes only this 0x10000000 length failure issue.
> >>>> This is actually even more confusing. We do not support 256MB transfers
> >>>> within a single command in the pci nvme driver anymore. The max is 4MB,
> >>>> so I don't see how increasing the max segments will help: you should be
> >>>> hitting the 'max_sectors' limit if you don't hit the segment limit first.
> >>> That looks like a bug for passthrough requests, because the 'max_sectors'
> >>> limit is only checked in bio_add_pc_page(), not in blk_rq_append_bio();
> >>> something like the following seems required:
> >>>
> >>> diff --git a/block/blk-map.c b/block/blk-map.c
> >>> index b0790268ed9d..e120d80b75a5 100644
> >>> --- a/block/blk-map.c
> >>> +++ b/block/blk-map.c
> >>> @@ -22,6 +22,10 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
> >>>           struct bio_vec bv;
> >>>           unsigned int nr_segs = 0;
> >>>
> >>> +       if (((rq->__data_len + (*bio)->bi_iter.bi_size) >> 9) >
> >>> +                       queue_max_hw_sectors(rq->q))
> >>> +               return -EINVAL;
> >>> +
> >> I have just confirmed the max_hw_sectors checking below.
> >> The length is also checked by the function blk_rq_map_kern(), as below.
> >>
> >>       if (len > (queue_max_hw_sectors(q) << 9))
> >>           return -EINVAL;
> > The above check doesn't take rq->__data_len into account.
>
> Thanks for the comment, noted.
> I have just confirmed the behavior on 5.6.0-rc7 as below.
> It basically limits the data length to 4MB as expected.
> The length can also be limited by the check below in ll_back_merge_fn().
>
>      if (blk_rq_sectors(req) + bio_sectors(bio) >
>          blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
>
> But there is a case where the command data length is limited to 512KB.
> I am not sure about this condition, so more investigation is needed.
>
> Regards,
> Ikegami
>
Tokunori Ikegami March 31, 2020, 3:24 p.m. UTC | #17
Hi,

On 2020/03/30 22:53, Keith Busch wrote:
> On Mon, Mar 30, 2020 at 06:15:41PM +0900, Tokunori Ikegami wrote:
>> But there is a case where the command data length is limited to 512KB.
>> I am not sure about this condition, so more investigation is needed.
> Your memory is too fragmented. You need more physically contiguous
> memory or you'll hit the segment limit before you hit the length
> limit. Try allocating huge pages.

Thank you so much for your advice.
I have confirmed that the 512KB case is limited by the 127-segment check
below in ll_new_hw_segment().

     if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) {

(With 4KB pages, 127 segments cap a fully fragmented transfer at about
508KB, which matches the 512KB observation.)
Also confirmed that the 4MB limit works correctly when the memory is not
fragmented too much.
I will abandon the patch to increase max segments, since the NVMe PCIe
driver limits transfers to 4MB now anyway.

Regards,
Ikegami
Tokunori Ikegami March 31, 2020, 3:37 p.m. UTC | #18
Hi,

On 2020/03/31 23:13, Joshi wrote:
> Hi Ikegami,
>
> Are you actually able to bump up the max segments with the original patch?
No; as you mentioned, it is currently limited by NVME_MAX_SEGS.
On older kernel versions it was not limited by that value but by
USHRT_MAX, so the patch was able to raise the limit there.

As Keith-san also mentioned, the NVMe PCIe driver currently limits
transfers to 4MB and 127 segments.
So I will abandon the patch to increase max segments.

Regards,
Ikegami

Patch

diff --git a/block/blk-settings.c b/block/blk-settings.c
index c8eda2e7b91e..ed40bda573c2 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -266,7 +266,7 @@  EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
  *    Enables a low level driver to set an upper limit on the number of
  *    hw data segments in a request.
  **/
-void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
+void blk_queue_max_segments(struct request_queue *q, unsigned int max_segments)
 {
 	if (!max_segments) {
 		max_segments = 1;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a4d8c90ee7cc..2b48aab0969e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2193,7 +2193,7 @@  static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 
 		max_segments = min_not_zero(max_segments, ctrl->max_segments);
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
-		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+		blk_queue_max_segments(q, min_t(u32, max_segments, UINT_MAX));
 	}
 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 	    is_power_of_2(ctrl->max_hw_sectors))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f629d40c645c..4f4224e20c28 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -338,8 +338,8 @@  struct queue_limits {
 	unsigned int		max_write_zeroes_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
+	unsigned int		max_segments;
 
-	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
 
@@ -1067,7 +1067,8 @@  extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
-extern void blk_queue_max_segments(struct request_queue *, unsigned short);
+extern void blk_queue_max_segments(struct request_queue *q,
+				   unsigned int max_segments);
 extern void blk_queue_max_discard_segments(struct request_queue *,
 		unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
@@ -1276,7 +1277,7 @@  static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
 	return q->limits.max_hw_sectors;
 }
 
-static inline unsigned short queue_max_segments(const struct request_queue *q)
+static inline unsigned int queue_max_segments(const struct request_queue *q)
 {
 	return q->limits.max_segments;
 }