[RFC,3/5] blk-mq: Facilitate shared tags per tagset

Message ID 1573652209-163505-4-git-send-email-john.garry@huawei.com (mailing list archive)
State New, archived
Series blk-mq/scsi: Provide hostwide shared tags for SCSI HBAs

Commit Message

John Garry Nov. 13, 2019, 1:36 p.m. UTC
Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3, ...) support
multiple reply queues with a single, hostwide tag set.

In addition, these drivers want to use interrupt assignment in
pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0],
CPU hotplug may cause in-flight IO completions to go unserviced when an
interrupt is shut down.

To solve that problem, Ming's patchset to drain hctxs should ensure that
no in-flight IOs are missed [1].

However, to take advantage of that patchset, we need to map the HBA HW
queues to blk mq hctx's; to do that, we need to expose the HBA HW queues.

In making that transition, the per-SCSI command request tags are no
longer unique per Scsi host - they are just unique per hctx. As such, the
HBA LLDD would have to generate this tag internally, which has a certain
performance overhead.

However, another problem is that blk-mq assumes the host may accept
(Scsi_host.can_queue * #hw queues) commands; a host with .can_queue = 4096
and 16 HW queues, say, could then see up to 65536 outstanding commands. In
[2], we removed the Scsi host busy counter, which would have stopped the
LLDD from being sent more than .can_queue commands; however, we should
still ensure that the block layer does not issue more than .can_queue
commands to the Scsi host.

To solve this problem, introduce shared tags per blk_mq_tag_set, which
may be requested when allocating the tagset.

A new flag, BLK_MQ_F_TAG_HCTX_SHARED, should be set when requesting the
tagset.
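
As a rough usage sketch (example_hba, its fields, and example_mq_ops are
hypothetical, invented here for illustration):

	/* Hedged sketch: an LLDD requesting hostwide shared tags. */
	static int example_init_tag_set(struct example_hba *hba)
	{
		struct blk_mq_tag_set *set = &hba->tag_set;

		set->ops = &example_mq_ops;		  /* hypothetical ops */
		set->nr_hw_queues = hba->nr_reply_queues; /* expose HW queues */
		set->queue_depth = hba->can_queue;	  /* hostwide depth */
		set->numa_node = NUMA_NO_NODE;
		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_TAG_HCTX_SHARED;

		return blk_mq_alloc_tag_set(set);	  /* tags shared per set */
	}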

This is based on work originally from Ming Lei in [3].

[0] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
[1] https://lore.kernel.org/linux-block/20191014015043.25029-1-ming.lei@redhat.com/
[2] https://lore.kernel.org/linux-scsi/20191025065855.6309-1-ming.lei@redhat.com/
[3] https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/

Signed-off-by: John Garry <john.garry@huawei.com>
---
 block/blk-core.c       |  1 +
 block/blk-flush.c      |  2 +
 block/blk-mq-debugfs.c |  2 +-
 block/blk-mq-tag.c     | 85 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 61 +++++++++++++++++++++++++-----
 block/blk-mq.h         |  9 +++++
 include/linux/blk-mq.h |  3 ++
 include/linux/blkdev.h |  1 +
 9 files changed, 155 insertions(+), 10 deletions(-)

Comments

Hannes Reinecke Nov. 13, 2019, 2:06 p.m. UTC | #1
On 11/13/19 2:36 PM, John Garry wrote:
> Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3 ..) support
> multiple reply queues with single hostwide tags.
> 
> In addition, these drivers want to use interrupt assignment in
> pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0],
> CPU hotplug may cause in-flight IO completion to not be serviced when an
> interrupt is shutdown.
> 
> To solve that problem, Ming's patchset to drain hctx's should ensure no
> IOs are missed in-flight [1].
> 
> However, to take advantage of that patchset, we need to map the HBA HW
> queues to blk mq hctx's; to do that, we need to expose the HBA HW queues.
> 
> In making that transition, the per-SCSI command request tags are no
> longer unique per Scsi host - they are just unique per hctx. As such, the
> HBA LLDD would have to generate this tag internally, which has a certain
> performance overhead.
> 
> However another problem is that blk mq assumes the host may accept
> (Scsi_host.can_queue * #hw queue) commands. In [2], we removed the Scsi
> host busy counter, which would stop the LLDD being sent more than
> .can_queue commands; however, we should still ensure that the block layer
> does not issue more than .can_queue commands to the Scsi host.
> 
> To solve this problem, introduce a shared tags per blk_mq_tag_set, which
> may be requested when allocating the tagset.
> 
> New flag BLK_MQ_F_TAG_HCTX_SHARED should be set when requesting the
> tagset.
> 
> This is based on work originally from Ming Lei in [3].
> 
> [0] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
> [1] https://lore.kernel.org/linux-block/20191014015043.25029-1-ming.lei@redhat.com/
> [2] https://lore.kernel.org/linux-scsi/20191025065855.6309-1-ming.lei@redhat.com/
> [3] https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/
> 
> Signed-off-by: John Garry <john.garry@huawei.com>
> ---
>  block/blk-core.c       |  1 +
>  block/blk-flush.c      |  2 +
>  block/blk-mq-debugfs.c |  2 +-
>  block/blk-mq-tag.c     | 85 ++++++++++++++++++++++++++++++++++++++++++
>  block/blk-mq-tag.h     |  1 +
>  block/blk-mq.c         | 61 +++++++++++++++++++++++++-----
>  block/blk-mq.h         |  9 +++++
>  include/linux/blk-mq.h |  3 ++
>  include/linux/blkdev.h |  1 +
>  9 files changed, 155 insertions(+), 10 deletions(-)
> 
[ .. ]
> @@ -396,15 +398,17 @@ static struct request *blk_mq_get_request(struct request_queue *q,
>  		blk_mq_tag_busy(data->hctx);
>  	}
>  
> -	tag = blk_mq_get_tag(data);
> -	if (tag == BLK_MQ_TAG_FAIL) {
> -		if (clear_ctx_on_error)
> -			data->ctx = NULL;
> -		blk_queue_exit(q);
> -		return NULL;
> +	if (data->hctx->shared_tags) {
> +		shared_tag = blk_mq_get_shared_tag(data);
> +		if (shared_tag == BLK_MQ_TAG_FAIL)
> +			goto err_shared_tag;
>  	}
>  
> -	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
> +	tag = blk_mq_get_tag(data);
> +	if (tag == BLK_MQ_TAG_FAIL)
> +		goto err_tag;
> +
> +	rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags, alloc_time_ns);
>  	if (!op_is_flush(data->cmd_flags)) {
>  		rq->elv.icq = NULL;
>  		if (e && e->type->ops.prepare_request) {
Why do you need to keep a parallel tag accounting between 'normal' and
'shared' tags here?
Isn't it sufficient to get a shared tag only, and use that in lieu of the
'real' one?

I would love to combine both, as then we can easily do a reverse mapping
by using the 'tag' value to look up the command itself, and can possibly
do the 'scsi_cmd_priv' trick of embedding the LLDD-specific parts within
the command. With this split we'll be wasting quite some memory there,
as the possible 'tag' values are actually nr_hw_queues * shared_tags.

Cheers,

Hannes
John Garry Nov. 13, 2019, 2:57 p.m. UTC | #2
On 13/11/2019 14:06, Hannes Reinecke wrote:
> On 11/13/19 2:36 PM, John Garry wrote:
>> Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3 ..) support
>> multiple reply queues with single hostwide tags.
>>
>> In addition, these drivers want to use interrupt assignment in
>> pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0],
>> CPU hotplug may cause in-flight IO completion to not be serviced when an
>> interrupt is shutdown.
>>
>> To solve that problem, Ming's patchset to drain hctx's should ensure no
>> IOs are missed in-flight [1].
>>
>> However, to take advantage of that patchset, we need to map the HBA HW
>> queues to blk mq hctx's; to do that, we need to expose the HBA HW queues.
>>
>> In making that transition, the per-SCSI command request tags are no
>> longer unique per Scsi host - they are just unique per hctx. As such, the
>> HBA LLDD would have to generate this tag internally, which has a certain
>> performance overhead.
>>
>> However another problem is that blk mq assumes the host may accept
>> (Scsi_host.can_queue * #hw queue) commands. In [2], we removed the Scsi
>> host busy counter, which would stop the LLDD being sent more than
>> .can_queue commands; however, we should still ensure that the block layer
>> does not issue more than .can_queue commands to the Scsi host.
>>
>> To solve this problem, introduce a shared tags per blk_mq_tag_set, which
>> may be requested when allocating the tagset.
>>
>> New flag BLK_MQ_F_TAG_HCTX_SHARED should be set when requesting the
>> tagset.
>>
>> This is based on work originally from Ming Lei in [3].
>>
>> [0] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
>> [1] https://lore.kernel.org/linux-block/20191014015043.25029-1-ming.lei@redhat.com/
>> [2] https://lore.kernel.org/linux-scsi/20191025065855.6309-1-ming.lei@redhat.com/
>> [3] https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/
>>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>> ---
>>   block/blk-core.c       |  1 +
>>   block/blk-flush.c      |  2 +
>>   block/blk-mq-debugfs.c |  2 +-
>>   block/blk-mq-tag.c     | 85 ++++++++++++++++++++++++++++++++++++++++++
>>   block/blk-mq-tag.h     |  1 +
>>   block/blk-mq.c         | 61 +++++++++++++++++++++++++-----
>>   block/blk-mq.h         |  9 +++++
>>   include/linux/blk-mq.h |  3 ++
>>   include/linux/blkdev.h |  1 +
>>   9 files changed, 155 insertions(+), 10 deletions(-)
>>
> [ .. ]
>> @@ -396,15 +398,17 @@ static struct request *blk_mq_get_request(struct request_queue *q,
>>   		blk_mq_tag_busy(data->hctx);
>>   	}
>>   
>> -	tag = blk_mq_get_tag(data);
>> -	if (tag == BLK_MQ_TAG_FAIL) {
>> -		if (clear_ctx_on_error)
>> -			data->ctx = NULL;
>> -		blk_queue_exit(q);
>> -		return NULL;
>> +	if (data->hctx->shared_tags) {
>> +		shared_tag = blk_mq_get_shared_tag(data);
>> +		if (shared_tag == BLK_MQ_TAG_FAIL)
>> +			goto err_shared_tag;
>>   	}
>>   
>> -	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
>> +	tag = blk_mq_get_tag(data);
>> +	if (tag == BLK_MQ_TAG_FAIL)
>> +		goto err_tag;
>> +
>> +	rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags, alloc_time_ns);
>>   	if (!op_is_flush(data->cmd_flags)) {
>>   		rq->elv.icq = NULL;
>>   		if (e && e->type->ops.prepare_request) {

Hi Hannes,

> Why do you need to keep a parallel tag accounting between 'normal' and
> 'shared' tags here?
> Isn't it sufficient to get a shared tag only, and use that in lieu of the
> 'real' one?

In theory, yes. Just the 'shared' tag should be adequate.

A problem I see with this approach is that we lose the identity of which 
tags are allocated for each hctx. As an example for this, consider 
blk_mq_queue_tag_busy_iter(), which iterates the bits for each hctx. 
Now, if you're just using shared tags only, that wouldn't work.

Consider blk_mq_can_queue() as another example - this tells us whether a 
hctx has any bits unset, while with only shared tags it would tell us 
whether any bits are unset across all queues, and this change in semantics 
could break things. At a glance, __blk_mq_tag_idle() looks problematic also.
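
For reference, blk_mq_can_queue() at the time was roughly the following (a
sketch from memory, not an exact quote); the point is that it answers the
question for one hctx's bitmap only:

	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
	{
		/*
		 * "does *this* hctx still have a free tag?" - with a single
		 * shared bitmap, the same check would answer for all queues
		 * at once, which is the semantic change discussed above.
		 */
		return sbitmap_any_bit_clear(&hctx->tags->bitmap_tags.sb);
	}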

And this is where it becomes messy to implement.

> 
> I would love to combine both,

Same here...

> as then we can easily do a reverse mapping
> by using the 'tag' value to lookup the command itself, and can possibly
> do the 'scsi_cmd_priv' trick of embedding the LLDD-specific parts within
> the command. With this split we'll be wasting quite some memory there,
> as the possible 'tag' values are actually nr_hw_queues * shared_tags.

Yeah, understood. That's just a trade-off I saw.

Thanks,
John
Hannes Reinecke Nov. 13, 2019, 3:38 p.m. UTC | #3
On 11/13/19 3:57 PM, John Garry wrote:
> On 13/11/2019 14:06, Hannes Reinecke wrote:
>> On 11/13/19 2:36 PM, John Garry wrote:
>>> Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3 ..) support
>>> multiple reply queues with single hostwide tags.
>>>
>>> In addition, these drivers want to use interrupt assignment in
>>> pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0],
>>> CPU hotplug may cause in-flight IO completion to not be serviced when an
>>> interrupt is shutdown.
>>>
>>> To solve that problem, Ming's patchset to drain hctx's should ensure no
>>> IOs are missed in-flight [1].
>>>
>>> However, to take advantage of that patchset, we need to map the HBA HW
>>> queues to blk mq hctx's; to do that, we need to expose the HBA HW
>>> queues.
>>>
>>> In making that transition, the per-SCSI command request tags are no
>>> longer unique per Scsi host - they are just unique per hctx. As such,
>>> the
>>> HBA LLDD would have to generate this tag internally, which has a certain
>>> performance overhead.
>>>
>>> However another problem is that blk mq assumes the host may accept
>>> (Scsi_host.can_queue * #hw queue) commands. In [2], we removed the Scsi
>>> host busy counter, which would stop the LLDD being sent more than
>>> .can_queue commands; however, we should still ensure that the block
>>> layer
>>> does not issue more than .can_queue commands to the Scsi host.
>>>
>>> To solve this problem, introduce a shared tags per blk_mq_tag_set, which
>>> may be requested when allocating the tagset.
>>>
>>> New flag BLK_MQ_F_TAG_HCTX_SHARED should be set when requesting the
>>> tagset.
>>>
>>> This is based on work originally from Ming Lei in [3].
>>>
>>> [0]
>>> https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
>>>
>>> [1]
>>> https://lore.kernel.org/linux-block/20191014015043.25029-1-ming.lei@redhat.com/
>>>
>>> [2]
>>> https://lore.kernel.org/linux-scsi/20191025065855.6309-1-ming.lei@redhat.com/
>>>
>>> [3]
>>> https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/
>>>
>>>
>>> Signed-off-by: John Garry <john.garry@huawei.com>
>>> ---
>>>   block/blk-core.c       |  1 +
>>>   block/blk-flush.c      |  2 +
>>>   block/blk-mq-debugfs.c |  2 +-
>>>   block/blk-mq-tag.c     | 85 ++++++++++++++++++++++++++++++++++++++++++
>>>   block/blk-mq-tag.h     |  1 +
>>>   block/blk-mq.c         | 61 +++++++++++++++++++++++++-----
>>>   block/blk-mq.h         |  9 +++++
>>>   include/linux/blk-mq.h |  3 ++
>>>   include/linux/blkdev.h |  1 +
>>>   9 files changed, 155 insertions(+), 10 deletions(-)
>>>
>> [ .. ]
>>> @@ -396,15 +398,17 @@ static struct request
>>> *blk_mq_get_request(struct request_queue *q,
>>>           blk_mq_tag_busy(data->hctx);
>>>       }
>>>   -    tag = blk_mq_get_tag(data);
>>> -    if (tag == BLK_MQ_TAG_FAIL) {
>>> -        if (clear_ctx_on_error)
>>> -            data->ctx = NULL;
>>> -        blk_queue_exit(q);
>>> -        return NULL;
>>> +    if (data->hctx->shared_tags) {
>>> +        shared_tag = blk_mq_get_shared_tag(data);
>>> +        if (shared_tag == BLK_MQ_TAG_FAIL)
>>> +            goto err_shared_tag;
>>>       }
>>>   -    rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags,
>>> alloc_time_ns);
>>> +    tag = blk_mq_get_tag(data);
>>> +    if (tag == BLK_MQ_TAG_FAIL)
>>> +        goto err_tag;
>>> +
>>> +    rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags,
>>> alloc_time_ns);
>>>       if (!op_is_flush(data->cmd_flags)) {
>>>           rq->elv.icq = NULL;
>>>           if (e && e->type->ops.prepare_request) {
> 
> Hi Hannes,
> 
>> Why do you need to keep a parallel tag accounting between 'normal' and
>> 'shared' tags here?
>> Isn't it sufficient to get a shared tag only, and use that in lieu of the
>> 'real' one?
> 
> In theory, yes. Just the 'shared' tag should be adequate.
> 
> A problem I see with this approach is that we lose the identity of which
> tags are allocated for each hctx. As an example for this, consider
> blk_mq_queue_tag_busy_iter(), which iterates the bits for each hctx.
> Now, if you're just using shared tags only, that wouldn't work.
> 
> Consider blk_mq_can_queue() as another example - this tells us if a hctx
> has any bits unset, while with only using shared tags it would tell if
> any bits unset over all queues, and this change in semantics could break
> things. At a glance, function __blk_mq_tag_idle() looks problematic also.
> 
> And this is where it becomes messy to implement.
> 
Oh, my. Indeed, that's correct.

But then we don't really care _which_ shared tag is assigned; so
wouldn't we be better off by just having an atomic counter here?
Cache locality will be blown anyway ...

Cheers,

Hannes
John Garry Nov. 13, 2019, 4:21 p.m. UTC | #4
On 13/11/2019 15:38, Hannes Reinecke wrote:
>>>> -        if (clear_ctx_on_error)
>>>> -            data->ctx = NULL;
>>>> -        blk_queue_exit(q);
>>>> -        return NULL;
>>>> +    if (data->hctx->shared_tags) {
>>>> +        shared_tag = blk_mq_get_shared_tag(data);
>>>> +        if (shared_tag == BLK_MQ_TAG_FAIL)
>>>> +            goto err_shared_tag;
>>>>        }
>>>>    -    rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags,
>>>> alloc_time_ns);
>>>> +    tag = blk_mq_get_tag(data);
>>>> +    if (tag == BLK_MQ_TAG_FAIL)
>>>> +        goto err_tag;
>>>> +
>>>> +    rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags,
>>>> alloc_time_ns);
>>>>        if (!op_is_flush(data->cmd_flags)) {
>>>>            rq->elv.icq = NULL;
>>>>            if (e && e->type->ops.prepare_request) {
>> Hi Hannes,
>>
>>> Why do you need to keep a parallel tag accounting between 'normal' and
>>> 'shared' tags here?
>>> Isn't it sufficient to get a shared tag only, and use that in lieu of the
>>> 'real' one?
>> In theory, yes. Just the 'shared' tag should be adequate.
>>
>> A problem I see with this approach is that we lose the identity of which
>> tags are allocated for each hctx. As an example for this, consider
>> blk_mq_queue_tag_busy_iter(), which iterates the bits for each hctx.
>> Now, if you're just using shared tags only, that wouldn't work.
>>
>> Consider blk_mq_can_queue() as another example - this tells us if a hctx
>> has any bits unset, while with only using shared tags it would tell if
>> any bits unset over all queues, and this change in semantics could break
>> things. At a glance, function __blk_mq_tag_idle() looks problematic also.
>>
>> And this is where it becomes messy to implement.
>>

Hi Hannes,

> Oh, my. Indeed, that's correct.

The tags could be kept in sync like this:

shared_tag = blk_mq_get_tag(shared_tagset);
if (shared_tag != -1)
	sbitmap_set(hctx->tags, shared_tag);

But that's obviously not ideal.

> 
> But then we don't really care _which_ shared tag is assigned; so
> wouldn't we be better off by just having an atomic counter here?
> Cache locality will be blown anyway ...
The atomic counter would solve issuing more than Scsi_host.can_queue 
commands to the LLDD, but we still need a unique tag, which is what the shared 
tag is.

Thanks,
John
Hannes Reinecke Nov. 13, 2019, 6:38 p.m. UTC | #5
On 11/13/19 5:21 PM, John Garry wrote:
> On 13/11/2019 15:38, Hannes Reinecke wrote:
>>>>> -        if (clear_ctx_on_error)
>>>>> -            data->ctx = NULL;
>>>>> -        blk_queue_exit(q);
>>>>> -        return NULL;
>>>>> +    if (data->hctx->shared_tags) {
>>>>> +        shared_tag = blk_mq_get_shared_tag(data);
>>>>> +        if (shared_tag == BLK_MQ_TAG_FAIL)
>>>>> +            goto err_shared_tag;
>>>>>        }
>>>>>    -    rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags,
>>>>> alloc_time_ns);
>>>>> +    tag = blk_mq_get_tag(data);
>>>>> +    if (tag == BLK_MQ_TAG_FAIL)
>>>>> +        goto err_tag;
>>>>> +
>>>>> +    rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags,
>>>>> alloc_time_ns);
>>>>>        if (!op_is_flush(data->cmd_flags)) {
>>>>>            rq->elv.icq = NULL;
>>>>>            if (e && e->type->ops.prepare_request) {
>>> Hi Hannes,
>>>
>>>> Why do you need to keep a parallel tag accounting between 'normal' and
>>>> 'shared' tags here?
>>>> Isn't it sufficient to get a shared tag only, and use that in lieu of 
>>>> the
>>>> 'real' one?
>>> In theory, yes. Just the 'shared' tag should be adequate.
>>>
>>> A problem I see with this approach is that we lose the identity of which
>>> tags are allocated for each hctx. As an example for this, consider
>>> blk_mq_queue_tag_busy_iter(), which iterates the bits for each hctx.
>>> Now, if you're just using shared tags only, that wouldn't work.
>>>
>>> Consider blk_mq_can_queue() as another example - this tells us if a hctx
>>> has any bits unset, while with only using shared tags it would tell if
>>> any bits unset over all queues, and this change in semantics could break
>>> things. At a glance, function __blk_mq_tag_idle() looks problematic 
>>> also.
>>>
>>> And this is where it becomes messy to implement.
>>>
> 
> Hi Hannes,
> 
>> Oh, my. Indeed, that's correct.
> 
> The tags could be kept in sync like this:
> 
> shared_tag = blk_mq_get_tag(shared_tagset);
> if (shared_tag != -1)
>      sbitmap_set(hctx->tags, shared_tag);
> 
> But that's obviously not ideal.
> 
Actually, I _do_ prefer keeping both in sync.
We might want to check if the 'normal' tag is set (typically it would 
not, but then, who knows ...)
The beauty here is that both the 'shared' and 'normal' tags are in sync, so 
if a driver wants to use the tag as an index into a command array it can 
do so without any surprises.

Why do you think it's not ideal?

>>
>> But then we don't really care _which_ shared tag is assigned; so
>> wouldn't we be better off by just having an atomic counter here?
>> Cache locality will be blown anyway ...
> The atomic counter would solve issuing more than Scsi_host.can_queue 
> commands to the LLDD, but we still need a unique tag, which is what the shared 
> tag is.
> 
Yeah, true. Daft idea :-)

Cheers,

Hannes
John Garry Nov. 14, 2019, 9:41 a.m. UTC | #6
On 13/11/2019 18:38, Hannes Reinecke wrote:
>> Hi Hannes,
>>
>>> Oh, my. Indeed, that's correct.
>>
>> The tags could be kept in sync like this:
>>
>> shared_tag = blk_mq_get_tag(shared_tagset);
>> if (shared_tag != -1)
>>      sbitmap_set(hctx->tags, shared_tag);
>>
>> But that's obviously not ideal.
>>
> Actually, I _do_ prefer keeping both in sync.
> We might want to check if the 'normal' tag is set (typically it would 
> not, but then, who knows ...)
> The beauty here is that both 'shared' and 'normal' tag are in sync, so 
> if a driver would be wanting to use the tag as index into a command 
> array it can do so without any surprises.
> 
> Why do you think it's not ideal?

A few points:
- Getting a bit from one tagset and then setting it in another tagset is 
a bit clunky.
- Getting the shared tag bit and setting the hctx tag bit may need to be 
atomic - I don't think that it currently is (see the sketch after this 
list).
- Consider that sometimes we may want to check if there is space on a hw 
queue - checking the hctx tags is no longer proper, as typically there 
would always be space on the hctx, but not always in the shared tags. We 
did delete blk_mq_can_queue() yesterday, which would be an example of 
that. Need to check if there are others.
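
To illustrate the second point, a sketch of the window (reusing the get/set 
sequence from earlier, with sbitmap_set_bit() from linux/sbitmap.h):

	shared_tag = blk_mq_get_tag(shared_tagset);	/* shared bit now set */
	/*
	 * <-- window: another context can see the shared tag taken while
	 * the hctx bitmap still shows it free, so the two bitmaps can be
	 * observed out of sync here.
	 */
	if (shared_tag != -1)
		sbitmap_set_bit(&hctx->tags->bitmap_tags.sb, shared_tag);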

Having said all that, the obvious advantages are the performance gain, 
that we can still use request.tag, and maybe less intrusive changes.

I'll have a look at the implementation. The devil is mostly in the detail...

Thanks,
John
Bart Van Assche Nov. 15, 2019, 5:30 a.m. UTC | #7
On 11/14/19 1:41 AM, John Garry wrote:
> On 13/11/2019 18:38, Hannes Reinecke wrote:
>>> Hi Hannes,
>>>
>>>> Oh, my. Indeed, that's correct.
>>>
>>> The tags could be kept in sync like this:
>>>
>>> shared_tag = blk_mq_get_tag(shared_tagset);
>>> if (shared_tag != -1)
>>>      sbitmap_set(hctx->tags, shared_tag);
>>>
>>> But that's obviously not ideal.
>>>
>> Actually, I _do_ prefer keeping both in sync.
>> We might want to check if the 'normal' tag is set (typically it would not, but then, who knows ...)
>> The beauty here is that both 'shared' and 'normal' tag are in sync, so if a driver would be wanting to use the tag as index into a command array it can do so without any surprises.
>>
>> Why do you think it's not ideal?
> 
> A few points:
> - Getting a bit from one tagset and then setting it in another tagset is a bit clunky.
> - Getting the shared tag bit and setting the hctx tag bit may need to be atomic - I don't think that it currently is.
> - Consider that sometimes we may want to check if there is space on a hw queue - checking the hctx tags is not really proper any longer, as typically there would always be space on hctx, but not always the shared tags. We did delete blk_mq_can_queue() yesterday, which
> would be an example of that. Need to check if there are others.
> 
> Having said all that, the obvious advantage is performance gain, can still use request.tag and so maybe less intrusive changes.
> 
> I'll have a look at the implementation. The devil is mostly in the detail...

Wouldn't that approach trigger a deadlock if an attempt is made to allocate the last
tag from two different hardware queues? How about sharing tag sets across hardware
queues, e.g. like in the (totally untested) patch below?

Thanks,

Bart.

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..3678e95ec947 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -211,8 +211,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
 #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name
 static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
-	HCTX_STATE_NAME(TAG_ACTIVE),
-	HCTX_STATE_NAME(SCHED_RESTART),
 };
 #undef HCTX_STATE_NAME

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..6262584dca09 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -64,18 +64,18 @@ void blk_mq_sched_assign_ioc(struct request *rq)
  */
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+	if (test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state))
 		return;

-	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	set_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+	if (!test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state))
 		return;
-	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	clear_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);

 	blk_mq_run_hw_queue(hctx, true);
 }
@@ -479,12 +479,15 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
 /* called in queue's release handler, tagset has gone away */
 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 {
+	struct blk_mq_tags *sched_tags = NULL;
 	struct blk_mq_hw_ctx *hctx;
 	int i;

 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags) {
+		if (hctx->sched_tags != sched_tags) {
 			blk_mq_free_rq_map(hctx->sched_tags);
+			if (!sched_tags)
+				sched_tags = hctx->sched_tags;
 			hctx->sched_tags = NULL;
 		}
 	}
@@ -512,6 +515,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 				   BLKDEV_MAX_RQ);

 	queue_for_each_hw_ctx(q, hctx, i) {
+		if (i > 0 && q->tag_set->share_tags) {
+			hctx->sched_tags = q->queue_hw_ctx[0]->sched_tags;
+			continue;
+		}
 		ret = blk_mq_sched_alloc_tags(q, hctx, i);
 		if (ret)
 			goto err;
@@ -556,8 +563,11 @@ void blk_mq_sched_free_requests(struct request_queue *q)
 	int i;

 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags)
+		if (hctx->sched_tags) {
 			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+			if (q->tag_set->share_tags)
+				break;
+		}
 	}
 }

diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 126021fc3a11..15174a646468 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -82,7 +82,7 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)

 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
 {
-	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	return test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);
 }

 #endif
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 586c9d6e904a..770fe2324230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,8 +23,8 @@
  */
 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
-	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+	if (!test_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state) &&
+	    !test_and_set_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
 		atomic_inc(&hctx->tags->active_queues);

 	return true;
@@ -48,7 +48,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
 	struct blk_mq_tags *tags = hctx->tags;

-	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+	if (!test_and_clear_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
 		return;

 	atomic_dec(&tags->active_queues);
@@ -67,7 +67,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,

 	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
 		return true;
-	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+	if (!test_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
 		return true;

 	/*
@@ -220,7 +220,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -341,8 +341,11 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 	int i;

 	for (i = 0; i < tagset->nr_hw_queues; i++) {
-		if (tagset->tags && tagset->tags[i])
+		if (tagset->tags && tagset->tags[i]) {
 			blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+			if (tagset->share_tags)
+				break;
+		}
 	}
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d0c10d043891..f75fa936b090 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@

 #include "blk-mq.h"

+enum {
+	BLK_MQ_T_ACTIVE		= 1,
+	BLK_MQ_T_SCHED_RESTART	= 2,
+};
+
 /*
  * Tag address space map.
  */
@@ -11,6 +16,11 @@ struct blk_mq_tags {
 	unsigned int nr_tags;
 	unsigned int nr_reserved_tags;

+	/**
+	 * @state: BLK_MQ_T_* flags. Defines the state of the hw
+	 * queue (active, scheduled to restart).
+	 */
+	unsigned long	state;
 	atomic_t active_queues;

 	struct sbitmap_queue bitmap_tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fec4b82ff91c..81d4d6a96098 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2404,6 +2404,12 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 {
 	int ret = 0;

+	if (hctx_idx > 0 && set->share_tags) {
+		WARN_ON_ONCE(!set->tags[0]);
+		set->tags[hctx_idx] = set->tags[0];
+		return 0;
+	}
+
 	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
 					set->queue_depth, set->reserved_tags);
 	if (!set->tags[hctx_idx])
@@ -2423,8 +2429,10 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 					 unsigned int hctx_idx)
 {
 	if (set->tags && set->tags[hctx_idx]) {
-		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-		blk_mq_free_rq_map(set->tags[hctx_idx]);
+		if (hctx_idx == 0 || !set->share_tags) {
+			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+			blk_mq_free_rq_map(set->tags[hctx_idx]);
+		}
 		set->tags[hctx_idx] = NULL;
 	}
 }
@@ -2568,7 +2576,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)

 	mutex_lock(&set->tag_list_lock);
 	list_del_rcu(&q->tag_set_list);
-	if (list_is_singular(&set->tag_list)) {
+	if (list_is_singular(&set->tag_list) && !set->share_tags) {
 		/* just transitioned to unshared */
 		set->flags &= ~BLK_MQ_F_TAG_SHARED;
 		/* update existing queue */
@@ -2586,7 +2594,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	/*
 	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
 	 */
-	if (!list_empty(&set->tag_list) &&
+	if ((!list_empty(&set->tag_list) || set->share_tags) &&
 	    !(set->flags & BLK_MQ_F_TAG_SHARED)) {
 		set->flags |= BLK_MQ_F_TAG_SHARED;
 		/* update existing queue */
@@ -2911,15 +2919,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
 	int i;

-	for (i = 0; i < set->nr_hw_queues; i++)
-		if (!__blk_mq_alloc_rq_map(set, i))
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		if (i > 0 && set->share_tags) {
+			set->tags[i] = set->tags[0];
+		} else if (!__blk_mq_alloc_rq_map(set, i))
 			goto out_unwind;
+	}

 	return 0;

 out_unwind:
-	while (--i >= 0)
+	while (--i >= 0) {
+		if (i > 0 && set->share_tags)
+			continue;
 		blk_mq_free_rq_map(set->tags[i]);
+	}

 	return -ENOMEM;
 }
@@ -3016,6 +3030,10 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
  * May fail with EINVAL for various error conditions. May adjust the
  * requested depth down, if it's too large. In that case, the set
  * value will be stored in set->queue_depth.
+ *
+ * @set: tag set for which to allocate tags.
+ * @share_tags: If true, allocate a single set of tags and share it across
+ *	hardware queues.
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
@@ -3137,6 +3155,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->tags)
 			continue;
+		if (i > 0 && set->share_tags) {
+			hctx->tags[i] = hctx->tags[0];
+			if (hctx->sched_tags)
+				hctx->sched_tags[i] = hctx->sched_tags[0];
+			continue;
+		}
 		/*
 		 * If we're using an MQ scheduler, just update the scheduler
 		 * queue depth. This is similar to what the old code would do.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a..dd5517476314 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -224,10 +224,13 @@ enum hctx_type {
  * @numa_node:	   NUMA node the storage adapter has been connected to.
  * @timeout:	   Request processing timeout in jiffies.
  * @flags:	   Zero or more BLK_MQ_F_* flags.
+ * @share_tags:	   Whether or not to share one tag set across hardware queues.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *		   tag set.
- * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
- *		   elements.
+ * @tags:	   Array of tag set pointers. Has @nr_hw_queues elements. If
+ *		   share_tags has not been set, all tag set pointers are
+ *		   different. If share_tags has been set, all tag_set pointers
+ *		   are identical.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
@@ -243,6 +246,7 @@ struct blk_mq_tag_set {
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;
+	bool			share_tags;
 	void			*driver_data;

 	struct blk_mq_tags	**tags;
@@ -394,8 +398,6 @@ enum {
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

 	BLK_MQ_S_STOPPED	= 0,
-	BLK_MQ_S_TAG_ACTIVE	= 1,
-	BLK_MQ_S_SCHED_RESTART	= 2,

 	BLK_MQ_MAX_DEPTH	= 10240,
Hannes Reinecke Nov. 15, 2019, 7:26 a.m. UTC | #8
On 11/14/19 10:41 AM, John Garry wrote:
> On 13/11/2019 18:38, Hannes Reinecke wrote:
>>> Hi Hannes,
>>>
>>>> Oh, my. Indeed, that's correct.
>>>
>>> The tags could be kept in sync like this:
>>>
>>> shared_tag = blk_mq_get_tag(shared_tagset);
>>> if (shared_tag != -1)
>>>      sbitmap_set(hctx->tags, shared_tag);
>>>
>>> But that's obviously not ideal.
>>>
>> Actually, I _do_ prefer keeping both in sync.
>> We might want to check if the 'normal' tag is set (typically it would
>> not, but then, who knows ...)
>> The beauty here is that both 'shared' and 'normal' tag are in sync, so
>> if a driver would be wanting to use the tag as index into a command
>> array it can do so without any surprises.
>>
>> Why do you think it's not ideal?
> 
> A few points:
> - Getting a bit from one tagset and then setting it in another tagset is
> a bit clunky.
Yes, that's true.
But painstakingly trying to find a free bit in a bitmask when we already
know which to pick is also a bit daft.

> - Getting the shared tag bit and setting the hctx tag bit may need to be
> atomic - I don't think that it currently is.

That was precisely what I alluded to in 'We might want to check if
the normal tag is set'.
Typically the 'normal' tag would be free (as the shared tag set out of
necessity needs to be the combination of all hctx tag sets).
Any difference here _is_ a programming error, and should be flagged as
such (sbitmap_test_and_set() anyone?)
We might have ordering issues on release, as we really should drop the
hctx tag before the shared tag; but as long as we observe that ordering,
we should be fine.
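
A sketch of that release ordering (example_put_tag() is hypothetical; the
sbitmap helpers are from linux/sbitmap.h):

	static void example_put_tag(struct blk_mq_hw_ctx *hctx,
				    struct sbitmap_queue *shared_tags,
				    unsigned int tag, unsigned int cpu)
	{
		/* drop the hctx bit first ... */
		sbitmap_clear_bit(&hctx->tags->bitmap_tags.sb, tag);
		/*
		 * ... then the shared bit, so the shared map never shows a
		 * tag free while an hctx still holds it.
		 */
		sbitmap_queue_clear(shared_tags, tag, cpu);
	}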

> - Consider that sometimes we may want to check if there is space on a hw
> queue - checking the hctx tags is not really proper any longer, as
> typically there would always be space on hctx, but not always the shared
> tags. We did delete blk_mq_can_queue() yesterday, which would be an
> example of that. Need to check if there are others.
> 
Clearly, this needs an audit of all functions accessing the hctx tag
space; maybe it's worth having a pre-requisite patchset differentiating
between hctx tags and global, shared tags. Hmm.

> Having said all that, the obvious advantage is performance gain, can
> still use request.tag and so maybe less intrusive changes.
> 
> I'll have a look at the implementation. The devil is mostly in the
> detail...
> 
True.
And, incidentally, if we run with shared tags we can skip the scheduling
section in blk_mq_get_tag(); if we're out of tags, we're out of tags,
and no rescheduling will help as we don't _have_ other tagsets to look at.

But overall I like this approach.

Cheers,

Hannes
Hannes Reinecke Nov. 15, 2019, 7:29 a.m. UTC | #9
On 11/15/19 6:30 AM, Bart Van Assche wrote:
> On 11/14/19 1:41 AM, John Garry wrote:
>> On 13/11/2019 18:38, Hannes Reinecke wrote:
>>>> Hi Hannes,
>>>>
>>>>> Oh, my. Indeed, that's correct.
>>>>
>>>> The tags could be kept in sync like this:
>>>>
>>>> shared_tag = blk_mq_get_tag(shared_tagset);
>>>> if (shared_tag != -1)
>>>>      sbitmap_set(hctx->tags, shared_tag);
>>>>
>>>> But that's obviously not ideal.
>>>>
>>> Actually, I _do_ prefer keeping both in sync.
>>> We might want to check if the 'normal' tag is set (typically it would not, but then, who knows ...)
>>> The beauty here is that both 'shared' and 'normal' tag are in sync, so if a driver would be wanting to use the tag as index into a command array it can do so without any surprises.
>>>
>>> Why do you think it's not ideal?
>>
>> A few points:
>> - Getting a bit from one tagset and then setting it in another tagset is a bit clunky.
>> - Getting the shared tag bit and setting the hctx tag bit may need to be atomic - I don't think that it currently is.
>> - Consider that sometimes we may want to check if there is space on a hw queue - checking the hctx tags is not really proper any longer, as typically there would always be space on hctx, but not always the shared tags. We did delete blk_mq_can_queue() yesterday, which
>> would be an example of that. Need to check if there are others.
>>
>> Having said all that, the obvious advantage is performance gain, can still use request.tag and so maybe less intrusive changes.
>>
>> I'll have a look at the implementation. The devil is mostly in the detail...
> 
> Wouldn't that approach trigger a deadlock if it is attempted to allocate the last
> tag from two different hardware queues? How about sharing tag sets across hardware
> queues, e.g. like in the (totally untested) patch below?
> 
Why should it?
The shared tag map determines which tag should be allocated in the
per-hctx map, and as the former is a strict superset of all hctx maps
the bit _has_ to be free in the hctx map.

Cheers,

Hannes
John Garry Nov. 15, 2019, 10:24 a.m. UTC | #10
>>> Actually, I _do_ prefer keeping both in sync.
>>> We might want to check if the 'normal' tag is set (typically it would not, but then, who knows ...)
>>> The beauty here is that both 'shared' and 'normal' tag are in sync, so if a driver would be wanting to use the tag as index into a command array it can do so without any surprises.
>>>
>>> Why do you think it's not ideal?
>>
>> A few points:
>> - Getting a bit from one tagset and then setting it in another tagset is a bit clunky.
>> - Getting the shared tag bit and setting the hctx tag bit may need to be atomic - I don't think that it currently is.
>> - Consider that sometimes we may want to check if there is space on a hw queue - checking the hctx tags is not really proper any longer, as typically there would always be space on hctx, but not always the shared tags. We did delete blk_mq_can_queue() yesterday, which
>> would be an example of that. Need to check if there are others.
>>
>> Having said all that, the obvious advantage is performance gain, can still use request.tag and so maybe less intrusive changes.
>>
>> I'll have a look at the implementation. The devil is mostly in the detail...
> 

Hi Bart,

> Wouldn't that approach trigger a deadlock if an attempt is made to allocate the last
> tag from two different hardware queues?

I see Hannes answered this one.

> How about sharing tag sets across hardware
> queues, e.g. like in the (totally untested) patch below?

So this is similar in principle to what Ming Lei came up with here:
https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/

However your implementation looks neater, which is good.

My concern with this approach is that we can't differentiate which tags 
are allocated for which hctx, and sometimes we need to know that.

An example here was blk_mq_queue_tag_busy_iter(), which iterates the 
bits for each hctx. This would just be broken by that change, unless we 
record which bits are associated with each hctx.

Another example was __blk_mq_tag_idle(), which looks problematic.

> 
> Thanks,
> 
> Bart. >
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c

For debugfs, when we examine 
/sys/kernel/debug/block/.../hctxX/tags_bitmap, wouldn't that be the tags 
for all hctxs (i.e. hctx0's)?

For debugging reasons, I would say we want to know which tags are 
allocated for a specific hctx, as this is tightly related to the 
requests for that hctx.

> index b3f2ba483992..3678e95ec947 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -211,8 +211,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
>   #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name
>   static const char *const hctx_state_name[] = {
>   	HCTX_STATE_NAME(STOPPED),
> -	HCTX_STATE_NAME(TAG_ACTIVE),
> -	HCTX_STATE_NAME(SCHED_RESTART),
>   };
>   #undef HCTX_STATE_NAME
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index ca22afd47b3d..6262584dca09 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -64,18 +64,18 @@ void blk_mq_sched_assign_ioc(struct request *rq)
>    */
>   void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
>   {
> -	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> +	if (test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state))
>   		return;
> 
> -	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
> +	set_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
> 
>   void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>   {
> -	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> +	if (!test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state))
>   		return;
> -	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
> +	clear_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);
> 
>   	blk_mq_run_hw_queue(hctx, true);
>   }
> @@ -479,12 +479,15 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
>   /* called in queue's release handler, tagset has gone away */
>   static void blk_mq_sched_tags_teardown(struct request_queue *q)
>   {
> +	struct blk_mq_tags *sched_tags = NULL;
>   	struct blk_mq_hw_ctx *hctx;
>   	int i;
> 
>   	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (hctx->sched_tags) {
> +		if (hctx->sched_tags != sched_tags) {
>   			blk_mq_free_rq_map(hctx->sched_tags);
> +			if (!sched_tags)
> +				sched_tags = hctx->sched_tags;
>   			hctx->sched_tags = NULL;
>   		}
>   	}
> @@ -512,6 +515,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>   				   BLKDEV_MAX_RQ);
> 
>   	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (i > 0 && q->tag_set->share_tags) {
> +			hctx->sched_tags = q->queue_hw_ctx[0]->sched_tags;
> +			continue;
> +		}
>   		ret = blk_mq_sched_alloc_tags(q, hctx, i);
>   		if (ret)
>   			goto err;
> @@ -556,8 +563,11 @@ void blk_mq_sched_free_requests(struct request_queue *q)
>   	int i;
> 
>   	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (hctx->sched_tags)
> +		if (hctx->sched_tags) {
>   			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> +			if (q->tag_set->share_tags)
> +				break;
> +		}
>   	}
>   }
> 
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 126021fc3a11..15174a646468 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -82,7 +82,7 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
> 
>   static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
>   {
> -	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
> +	return test_bit(BLK_MQ_T_SCHED_RESTART, &hctx->tags->state);
>   }
> 
>   #endif
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 586c9d6e904a..770fe2324230 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -23,8 +23,8 @@
>    */
>   bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>   {
> -	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> -	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +	if (!test_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state) &&
> +	    !test_and_set_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
>   		atomic_inc(&hctx->tags->active_queues);
> 
>   	return true;
> @@ -48,7 +48,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>   {
>   	struct blk_mq_tags *tags = hctx->tags;
> 
> -	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +	if (!test_and_clear_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
>   		return;
> 
>   	atomic_dec(&tags->active_queues);
> @@ -67,7 +67,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
> 
>   	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
>   		return true;
> -	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +	if (!test_bit(BLK_MQ_T_ACTIVE, &hctx->tags->state))
>   		return true;
> 
>   	/*
> @@ -220,7 +220,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   	 * We can hit rq == NULL here, because the tagging functions
>   	 * test and set the bit before assigning ->rqs[].
>   	 */
> -	if (rq && rq->q == hctx->queue)
> +	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
>   		return iter_data->fn(hctx, rq, iter_data->data, reserved);
>   	return true;
>   }
> @@ -341,8 +341,11 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
>   	int i;
> 
>   	for (i = 0; i < tagset->nr_hw_queues; i++) {
> -		if (tagset->tags && tagset->tags[i])
> +		if (tagset->tags && tagset->tags[i]) {
>   			blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);

As I mentioned earlier, wouldn't this iterate over all tags for all 
hctx's, when we just want the tags for hctx[i]?

Thanks,
John

[Not trimming reply for future reference]

> +			if (tagset->share_tags)
> +				break;
> +		}
>   	}
>   }
>   EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index d0c10d043891..f75fa936b090 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -4,6 +4,11 @@
> 
>   #include "blk-mq.h"
> 
> +enum {
> +	BLK_MQ_T_ACTIVE		= 1,
> +	BLK_MQ_T_SCHED_RESTART	= 2,
> +};
> +
>   /*
>    * Tag address space map.
>    */
> @@ -11,6 +16,11 @@ struct blk_mq_tags {
>   	unsigned int nr_tags;
>   	unsigned int nr_reserved_tags;
> 
> +	/**
> +	 * @state: BLK_MQ_T_* flags. Defines the state of the hw
> +	 * queue (active, scheduled to restart).
> +	 */
> +	unsigned long	state;
>   	atomic_t active_queues;
> 
>   	struct sbitmap_queue bitmap_tags;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index fec4b82ff91c..81d4d6a96098 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2404,6 +2404,12 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
>   {
>   	int ret = 0;
> 
> +	if (hctx_idx > 0 && set->share_tags) {
> +		WARN_ON_ONCE(!set->tags[0]);
> +		set->tags[hctx_idx] = set->tags[0];
> +		return 0;
> +	}
> +
>   	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
>   					set->queue_depth, set->reserved_tags);
>   	if (!set->tags[hctx_idx])
> @@ -2423,8 +2429,10 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
>   					 unsigned int hctx_idx)
>   {
>   	if (set->tags && set->tags[hctx_idx]) {
> -		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> -		blk_mq_free_rq_map(set->tags[hctx_idx]);
> +		if (hctx_idx == 0 || !set->share_tags) {
> +			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> +			blk_mq_free_rq_map(set->tags[hctx_idx]);
> +		}
>   		set->tags[hctx_idx] = NULL;
>   	}
>   }
> @@ -2568,7 +2576,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
> 
>   	mutex_lock(&set->tag_list_lock);
>   	list_del_rcu(&q->tag_set_list);
> -	if (list_is_singular(&set->tag_list)) {
> +	if (list_is_singular(&set->tag_list) && !set->share_tags) {
>   		/* just transitioned to unshared */
>   		set->flags &= ~BLK_MQ_F_TAG_SHARED;
>   		/* update existing queue */
> @@ -2586,7 +2594,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
>   	/*
>   	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
>   	 */
> -	if (!list_empty(&set->tag_list) &&
> +	if ((!list_empty(&set->tag_list) || set->share_tags) &&
>   	    !(set->flags & BLK_MQ_F_TAG_SHARED)) {
>   		set->flags |= BLK_MQ_F_TAG_SHARED;
>   		/* update existing queue */
> @@ -2911,15 +2919,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
>   {
>   	int i;
> 
> -	for (i = 0; i < set->nr_hw_queues; i++)
> -		if (!__blk_mq_alloc_rq_map(set, i))
> +	for (i = 0; i < set->nr_hw_queues; i++) {
> +		if (i > 0 && set->share_tags) {
> +			set->tags[i] = set->tags[0];
> +		} else if (!__blk_mq_alloc_rq_map(set, i))
>   			goto out_unwind;
> +	}
> 
>   	return 0;
> 
>   out_unwind:
> -	while (--i >= 0)
> +	while (--i >= 0) {
> +		if (i > 0 && set->share_tags)
> +			continue;
>   		blk_mq_free_rq_map(set->tags[i]);
> +	}
> 
>   	return -ENOMEM;
>   }
> @@ -3016,6 +3030,10 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
>    * May fail with EINVAL for various error conditions. May adjust the
>    * requested depth down, if it's too large. In that case, the set
>    * value will be stored in set->queue_depth.
> + *
> + * @set: tag set for which to allocate tags.
> + * @share_tags: If true, allocate a single set of tags and share it across
> + *	hardware queues.
>    */
>   int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>   {
> @@ -3137,6 +3155,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
>   	queue_for_each_hw_ctx(q, hctx, i) {
>   		if (!hctx->tags)
>   			continue;
> +		if (i > 0 && set->share_tags) {
> +			hctx->tags[i] = hctx->tags[0];
> +			if (hctx->sched_tags)
> +				hctx->sched_tags[i] = hctx->sched_tags[0];
> +			continue;
> +		}
>   		/*
>   		 * If we're using an MQ scheduler, just update the scheduler
>   		 * queue depth. This is similar to what the old code would do.
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 11cfd6470b1a..dd5517476314 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -224,10 +224,13 @@ enum hctx_type {
>    * @numa_node:	   NUMA node the storage adapter has been connected to.
>    * @timeout:	   Request processing timeout in jiffies.
>    * @flags:	   Zero or more BLK_MQ_F_* flags.
> + * @share_tags:	   Whether or not to share one tag set across hardware queues.
>    * @driver_data:   Pointer to data owned by the block driver that created this
>    *		   tag set.
> - * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
> - *		   elements.
> + * @tags:	   Array of tag set pointers. Has @nr_hw_queues elements. If
> + *		   share_tags has not been set, all tag set pointers are
> + *		   different. If share_tags has been set, all tag_set pointers
> + *		   are identical.
>    * @tag_list_lock: Serializes tag_list accesses.
>    * @tag_list:	   List of the request queues that use this tag set. See also
>    *		   request_queue.tag_set_list.
> @@ -243,6 +246,7 @@ struct blk_mq_tag_set {
>   	int			numa_node;
>   	unsigned int		timeout;
>   	unsigned int		flags;
> +	bool			share_tags;
>   	void			*driver_data;
> 
>   	struct blk_mq_tags	**tags;
> @@ -394,8 +398,6 @@ enum {
>   	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
> 
>   	BLK_MQ_S_STOPPED	= 0,
> -	BLK_MQ_S_TAG_ACTIVE	= 1,
> -	BLK_MQ_S_SCHED_RESTART	= 2,
> 
>   	BLK_MQ_MAX_DEPTH	= 10240,
> 
John Garry Nov. 15, 2019, 10:46 a.m. UTC | #11
On 15/11/2019 07:26, Hannes Reinecke wrote:
> On 11/14/19 10:41 AM, John Garry wrote:
>> On 13/11/2019 18:38, Hannes Reinecke wrote:
>>>> Hi Hannes,
>>>>
>>>>> Oh, my. Indeed, that's correct.
>>>>
>>>> The tags could be kept in sync like this:
>>>>
>>>> shared_tag = blk_mq_get_tag(shared_tagset);
>>>> if (shared_tag != -1)
>>>>       sbitmap_set(hctx->tags, shared_tag);
>>>>
>>>> But that's obviously not ideal.
>>>>
>>> Actually, I _do_ prefer keeping both in sync.
>>> We might want to check if the 'normal' tag is set (typically it would
>>> not, but then, who knows ...)
>>> The beauty here is that both 'shared' and 'normal' tag are in sync, so
>>> if a driver would be wanting to use the tag as index into a command
>>> array it can do so without any surprises.
>>>
>>> Why do you think it's not ideal?
>>
>> A few points:
>> - Getting a bit from one tagset and then setting it in another tagset is
>> a bit clunky.
> Yes, that's true.
> But painstakingly trying to find a free bit in a bitmask when we already
> know which to pick is also a bit daft.

Yeah, but it's not all good - there would still be a certain overhead in 
atomically setting and clearing the bit in the hctx sbitmap. However, it 
should still be faster as it avoids the search.

> 
>> - Getting the shared tag bit and setting the hctx tag bit may need to be
>> atomic - I don't think that it currently is.
> 
> That was precisely what I've alluded to in 'We might want to check if
> the normal tag is set'.
> Typically the 'normal' tag would be free (as the shared tag set out of
> necessity needs to be the combination of all hctx tag sets).

Right

> Any difference here _is_ a programming error, and should be flagged as
> such (sbitmap_test_and_set() anyone?)

Eh, I hope that we wouldn't need this.

> We might have ordering issues on release, as we really should drop the
> hctx tag before the shared tag; but when we observe that we should be fine.
> 
>> - Consider that sometimes we may want to check if there is space on a hw
>> queue - checking the hctx tags is not really proper any longer, as
>> typically there would always be space on hctx, but not always the shared
>> tags. We did delete blk_mq_can_queue() yesterday, which would be an
>> example of that. Need to check if there are others.
>>
> Clearly, this needs an audit of all functions accessing the hctx tag
> space; maybe it's worth having a pre-requisite patchset differentiating
> between hctx tags and global, shared tags. Hmm.
> 
>> Having said all that, the obvious advantage is performance gain, can
>> still use request.tag and so maybe less intrusive changes.
>>
>> I'll have a look at the implementation. The devil is mostly in the
>> detail...
>>
> True.
> And, incidentally, if we run with shared tags we can skip the scheduling
> section in blk_mq_get_tag(); if we're out of tags, we're out of tags,

Right, but when free tags are exhausted, don't we then need to "kick" all 
hw queues in blk_mq_get_tag(), instead of just the current hctx?
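
Something like this, perhaps (just a sketch; blk_mq_run_hw_queues() is the
existing helper that restarts every hctx of a queue):

	/*
	 * On tag exhaustion with shared tags, a freed tag can let *any* hw
	 * queue make progress, so restart them all rather than only the
	 * hctx that failed the allocation.
	 */
	blk_mq_run_hw_queues(hctx->queue, true);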

> and no rescheduling will help as we don't _have_ other tagsets to look at.
> 
> But overall I like this approach.
> 

Yeah, to me it seems sensible. Again, a neat implementation is the 
challenge.

I'll post an RFC v2 for this one.

I am also requesting some performance figures internally.

Thanks,
John
Bart Van Assche Nov. 15, 2019, 5:57 p.m. UTC | #12
On 11/15/19 2:24 AM, John Garry wrote:
> Bart Van Assche wrote:
> > How about sharing tag sets across hardware
> > queues, e.g. like in the (totally untested) patch below?
> 
> So this is similar in principle to what Ming Lei came up with here:
> https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/ 
> 
> However your implementation looks neater, which is good.
> 
> My concern with this approach is that we can't differentiate which tags 
> are allocated for which hctx, and sometimes we need to know that.
> 
> An example here was blk_mq_queue_tag_busy_iter(), which iterates the 
> bits for each hctx. This would just be broken by that change, unless we 
> record which bits are associated with each hctx.

I disagree. In bt_iter() I added " && rq->mq_hctx == hctx" such that 
blk_mq_queue_tag_busy_iter() only calls the callback function for 
matching (hctx, rq) pairs.
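
For reference, bt_iter() with that check looks roughly like below; 
everything apart from the rq->mq_hctx == hctx test is the existing 
code:

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_iter_data *iter_data = data;
	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
	struct blk_mq_tags *tags = hctx->tags;
	bool reserved = iter_data->reserved;
	struct request *rq;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;
	rq = tags->rqs[bitnr];

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
		return iter_data->fn(hctx, rq, iter_data->data, reserved);
	return true;
}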

> Another example was __blk_mq_tag_idle(), which looks problematic.

Please elaborate.

> For debugfs, when we examine 
> /sys/kernel/debug/block/.../hctxX/tags_bitmap, wouldn't that be the tags 
> for all hctx (hctx0)?
> 
> For debugging reasons, I would say we want to know which tags are 
> allocated for a specific hctx, as this is tightly related to the 
> requests for that hctx.

That is an open issue in the patch I posted and something that needs to 
be addressed. One way to address this is to change the 
sbitmap_bitmap_show() calls into calls to a function that only shows 
those bits for which rq->mq_hctx == hctx.
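
A sketch of such a function (the name and the output format are made 
up; the point is only the rq->mq_hctx filter):

static int hctx_tags_bitmap_show_filtered(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct blk_mq_tags *tags = hctx->tags;
	unsigned int bitnr;

	/* print only the tags whose request belongs to this hctx */
	for (bitnr = 0; bitnr < tags->nr_tags; bitnr++) {
		struct request *rq = tags->rqs[bitnr];

		if (rq && rq->mq_hctx == hctx)
			seq_printf(m, "%u ", bitnr);
	}
	seq_puts(m, "\n");
	return 0;
}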

>> @@ -341,8 +341,11 @@ void blk_mq_tagset_busy_iter(struct 
>> blk_mq_tag_set *tagset,
>>       int i;
>>
>>       for (i = 0; i < tagset->nr_hw_queues; i++) {
>> -        if (tagset->tags && tagset->tags[i])
>> +        if (tagset->tags && tagset->tags[i]) {
>>               blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
> 
> As I mentioned earlier, wouldn't this iterate over all tags for all 
> hctx's, when we just want the tags for hctx[i]?
> 
> Thanks,
> John
> 
> [Not trimming reply for future reference]
> 
>> +            if (tagset->share_tags)
>> +                break;
>> +        }
>>       }
>>   }
>>   EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

Since blk_mq_tagset_busy_iter() loops over all hardware queues, all 
that changes is the order in which requests are examined. I am not 
aware of any block driver that calls blk_mq_tagset_busy_iter() and 
depends on the order of the requests passed to the callback function.

Thanks,

Bart.
John Garry Nov. 18, 2019, 10:31 a.m. UTC | #13
On 15/11/2019 17:57, Bart Van Assche wrote:
> On 11/15/19 2:24 AM, John Garry wrote:
>> Bart Van Assche wrote:
>> > How about sharing tag sets across hardware
>> > queues, e.g. like in the (totally untested) patch below?
>>
>> So this is similar in principle to what Ming Lei came up with here:
>> https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/ 
>>
>> However your implementation looks neater, which is good.
>>
>> My concern with this approach is that we can't differentiate which 
>> tags are allocated for which hctx, and sometimes we need to know that.
>>

Hi Bart,

>> An example here was blk_mq_queue_tag_busy_iter(), which iterates the 
>> bits for each hctx. This would just be broken by that change, unless 
>> we record which bits are associated with each hctx.
> 
> I disagree. In bt_iter() I added " && rq->mq_hctx == hctx" such that 
> blk_mq_queue_tag_busy_iter() only calls the callback function for 
> matching (hctx, rq) pairs.

OK, I see. I assumed that rq->mq_hctx was set statically when we 
initially allocate the static requests per hctx, but that doesn't appear 
to be the case - it's set in blk_mq_get_request()->blk_mq_rq_ctx_init().
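
I.e. roughly the following runs on every allocation:

	/* blk_mq_rq_ctx_init(): rq->mq_hctx follows the allocation */
	rq->q = data->q;
	rq->mq_ctx = data->ctx;
	rq->mq_hctx = data->hctx;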

> 
>> Another example was __blk_mq_tag_idle(), which looks problematic.
> 
> Please elaborate.

Again, this was for the same reason: I thought that we could not 
differentiate which rqs were associated with which hctx.

> 
>> For debugfs, when we examine 
>> /sys/kernel/debug/block/.../hctxX/tags_bitmap, wouldn't that be the 
>> tags for all hctx (hctx0)?
>>
>> For debugging reasons, I would say we want to know which tags are 
>> allocated for a specific hctx, as this is tightly related to the 
>> requests for that hctx.
> 
> That is an open issue in the patch I posted and something that needs to 
> be addressed. One way to address this is to change the 
> sbitmap_bitmap_show() calls into calls to a function that only shows 
> those bits for which rq->mq_hctx == hctx.

Yeah, understood.

> 
>>> @@ -341,8 +341,11 @@ void blk_mq_tagset_busy_iter(struct 
>>> blk_mq_tag_set *tagset,
>>>       int i;
>>>
>>>       for (i = 0; i < tagset->nr_hw_queues; i++) {
>>> -        if (tagset->tags && tagset->tags[i])
>>> +        if (tagset->tags && tagset->tags[i]) {
>>>               blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
>>
>> As I mentioned earlier, wouldn't this iterate over all tags for all 
>> hctx's, when we just want the tags for hctx[i]?
>>
>> Thanks,
>> John
>>
>> [Not trimming reply for future reference]
>>
>>> +            if (tagset->share_tags)
>>> +                break;
>>> +        }
>>>       }
>>>   }
>>>   EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
> 
> Since blk_mq_tagset_busy_iter() loops over all hardware queues, all 
> that changes is the order in which requests are examined. I am not 
> aware of any block driver that calls blk_mq_tagset_busy_iter() and 
> depends on the order of the requests passed to the callback function.
> 

OK, fine.

So, to me, this approach also seems viable then.

I am however not so happy with how we use blk_mq_tag_set.tags[0] for the 
shared tags; I would prefer to add a blk_mq_tag_set.shared_tags and make 
blk_mq_tag_set.tags[] point at blk_mq_tag_set.shared_tags, or maybe not 
use blk_mq_tag_set.tags[] at all. However, that change may be more 
intrusive.
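
Something along these lines in blk_mq_alloc_tag_set(), say (a sketch of 
the aliasing idea only; set->shared_tags is the field from the patch 
below, and the teardown side is not shown):

	if (set->flags & BLK_MQ_F_TAG_HCTX_SHARED) {
		set->shared_tags = blk_mq_init_tags(set->queue_depth,
				set->reserved_tags, set->numa_node,
				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
		if (!set->shared_tags)
			return -ENOMEM;
		/* alias every per-hctx pointer at the hostwide tags */
		for (i = 0; i < set->nr_hw_queues; i++)
			set->tags[i] = set->shared_tags;
	}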

And another, more concrete concern is that we might miss a check somewhere 
for rq->mq_hctx == hctx when examining the bits on the shared tags.

Thanks,
John
John Garry Nov. 19, 2019, 9:26 a.m. UTC | #14
>>
>>>> @@ -341,8 +341,11 @@ void blk_mq_tagset_busy_iter(struct 
>>>> blk_mq_tag_set *tagset,
>>>>       int i;
>>>>
>>>>       for (i = 0; i < tagset->nr_hw_queues; i++) {
>>>> -        if (tagset->tags && tagset->tags[i])
>>>> +        if (tagset->tags && tagset->tags[i]) {
>>>>               blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
>>>
>>> As I mentioned earlier, wouldn't this iterate over all tags for all 
>>> hctx's, when we just want the tags for hctx[i]?
>>>
>>> Thanks,
>>> John
>>>
>>> [Not trimming reply for future reference]
>>>
>>>> +            if (tagset->share_tags)
>>>> +                break;
>>>> +        }
>>>>       }
>>>>   }
>>>>   EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
>>
>> Since blk_mq_tagset_busy_iter() loops over all hardware queues, all 
>> that changes is the order in which requests are examined. I am not 
>> aware of any block driver that calls blk_mq_tagset_busy_iter() and 
>> depends on the order of the requests passed to the callback 
>> function.
>>
> 
> OK, fine.
> 
> So, to me, this approach also seems viable then.
> 
> I am however not so happy with how we use blk_mq_tag_set.tags[0] for the 
> shared tags; I would prefer to add a blk_mq_tag_set.shared_tags and make 
> blk_mq_tag_set.tags[] point at blk_mq_tag_set.shared_tags, or maybe not 
> use blk_mq_tag_set.tags[] at all. However, that change may be more 
> intrusive.
> 
> And another, more concrete concern is that we might miss a check somewhere 
> for rq->mq_hctx == hctx when examining the bits on the shared tags.

Another issue I notice is that using tags from just hctx0 may cause 
breakage when the request is associated with a different hw queue in 
the driver.

Specifically, we may have blk_mq_alloc_rqs(hctx_idx = 0)->
blk_mq_init_request()->nvme_init_request(), which would set 
iod->nvmeq = nvmeq0 for all requests; since we may actually use this 
iod on another hw queue, we're breaking that interface.
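
For reference, nvme_init_request() currently looks roughly like this, 
so with hctx_idx always 0 every iod gets pinned to the same nvme_queue:

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;	/* always the hctx0 queue if hctx_idx == 0 */

	nvme_req(req)->ctrl = &dev->ctrl;
	return 0;
}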

Thanks,
John
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index d5e668ec751b..79eb8983ed45 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,6 +116,7 @@  void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = -1;
+	rq->shared_tag = -1;
 	rq->internal_tag = -1;
 	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1eec9cbe5a0a..b9ad9a5978f5 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -228,6 +228,7 @@  static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	if (!q->elevator) {
 		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
 		flush_rq->tag = -1;
+		flush_rq->shared_tag = -1;
 	} else {
 		blk_mq_put_driver_tag(flush_rq);
 		flush_rq->internal_tag = -1;
@@ -309,6 +310,7 @@  static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	if (!q->elevator) {
 		fq->orig_rq = first_rq;
 		flush_rq->tag = first_rq->tag;
+		flush_rq->shared_tag = first_rq->shared_tag;
 		blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
 	} else {
 		flush_rq->internal_tag = first_rq->internal_tag;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 33a40ae1d60f..dc90c42aeb9a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -339,7 +339,7 @@  int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
 	seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
-	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
+	seq_printf(m, ", .tag=%d, .shared_tag=%d, .internal_tag=%d", rq->tag, rq->shared_tag,
 		   rq->internal_tag);
 	if (mq_ops->show_rq)
 		mq_ops->show_rq(m, rq);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d7aa23c82dbf..0a6c8a6b05dd 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -191,6 +191,91 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return tag + tag_offset;
 }

+/* We could factor this out */
+unsigned int blk_mq_get_shared_tag(struct blk_mq_alloc_data *data)
+{
+	struct blk_mq_tags *tags = blk_mq_shared_tags_from_data(data);
+	struct sbitmap_queue *bt;
+	struct sbq_wait_state *ws;
+	DEFINE_SBQ_WAIT(wait);
+	unsigned int tag_offset;
+	int tag;
+
+	if (data->flags & BLK_MQ_REQ_RESERVED) {
+		if (unlikely(!tags->nr_reserved_tags)) {
+			WARN_ON_ONCE(1);
+			return BLK_MQ_TAG_FAIL;
+		}
+		bt = &tags->breserved_tags;
+		tag_offset = 0;
+	} else {
+		bt = &tags->bitmap_tags;
+		tag_offset = tags->nr_reserved_tags;
+	}
+
+	tag = __blk_mq_get_tag(data, bt);
+	if (tag != -1)
+		goto found_tag;
+
+	if (data->flags & BLK_MQ_REQ_NOWAIT)
+		return BLK_MQ_TAG_FAIL;
+
+	ws = bt_wait_ptr(bt, data->hctx);
+	do {
+		struct sbitmap_queue *bt_prev;
+
+		/*
+		 * We're out of shared (hostwide) tags, so kick any
+		 * pending IO submits on all hw queues before going to
+		 * sleep waiting for some to complete.
+		 */
+		blk_mq_run_hw_queues(data->q, false);
+
+		/*
+		 * Retry tag allocation after running the hardware queue,
+		 * as running the queue may also have found completions.
+		 */
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != -1)
+			break;
+
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
+
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != -1)
+			break;
+
+
+		bt_prev = bt;
+		io_schedule();
+
+		sbitmap_finish_wait(bt, ws, &wait);
+
+		data->ctx = blk_mq_get_ctx(data->q);
+		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
+						data->ctx);
+		tags = blk_mq_shared_tags_from_data(data);
+		if (data->flags & BLK_MQ_REQ_RESERVED)
+			bt = &tags->breserved_tags;
+		else
+			bt = &tags->bitmap_tags;
+
+		/*
+		 * If destination hw queue is changed, fake wake up on
+		 * previous queue for compensating the wake up miss, so
+		 * other allocations on previous queue won't be starved.
+		 */
+		if (bt != bt_prev)
+			sbitmap_queue_wake_up(bt_prev);
+
+		ws = bt_wait_ptr(bt, data->hctx);
+	} while (1);
+
+	sbitmap_finish_wait(bt, ws, &wait);
+
+found_tag:
+	return tag + tag_offset;
+}
+
 void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 		    unsigned int tag)
 {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 88b85daa4976..82ff8faa70f7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -26,6 +26,7 @@  extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+extern unsigned int blk_mq_get_shared_tag(struct blk_mq_alloc_data *data);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6b39cf0efdcd..792eae37dc44 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -292,7 +292,7 @@  static inline bool blk_mq_need_time_stamp(struct request *rq)
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op, u64 alloc_time_ns)
+		unsigned int tag, unsigned int shared_tag, unsigned int op, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
@@ -300,6 +300,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
 		rq->tag = -1;
+		rq->shared_tag = -1;
 		rq->internal_tag = tag;
 	} else {
 		if (data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) {
@@ -307,6 +308,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 			atomic_inc(&data->hctx->nr_active);
 		}
 		rq->tag = tag;
+		rq->shared_tag = shared_tag;
 		rq->internal_tag = -1;
 		data->hctx->tags->rqs[rq->tag] = rq;
 	}
@@ -359,7 +361,7 @@  static struct request *blk_mq_get_request(struct request_queue *q,
 {
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
-	unsigned int tag;
+	unsigned int tag, shared_tag = BLK_MQ_TAG_FAIL;
 	bool clear_ctx_on_error = false;
 	u64 alloc_time_ns = 0;
 
@@ -396,15 +398,17 @@  static struct request *blk_mq_get_request(struct request_queue *q,
 		blk_mq_tag_busy(data->hctx);
 	}
 
-	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_TAG_FAIL) {
-		if (clear_ctx_on_error)
-			data->ctx = NULL;
-		blk_queue_exit(q);
-		return NULL;
+	if (data->hctx->shared_tags && !(data->flags & BLK_MQ_REQ_INTERNAL)) {
+		shared_tag = blk_mq_get_shared_tag(data);
+		if (shared_tag == BLK_MQ_TAG_FAIL)
+			goto err_shared_tag;
 	}
 
-	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
+	tag = blk_mq_get_tag(data);
+	if (tag == BLK_MQ_TAG_FAIL)
+		goto err_tag;
+
+	rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags, alloc_time_ns);
 	if (!op_is_flush(data->cmd_flags)) {
 		rq->elv.icq = NULL;
 		if (e && e->type->ops.prepare_request) {
@@ -417,6 +421,15 @@  static struct request *blk_mq_get_request(struct request_queue *q,
 	}
 	data->hctx->queued++;
 	return rq;
+
+err_tag:
+	if (shared_tag != BLK_MQ_TAG_FAIL)
+		blk_mq_put_tag(data->hctx->shared_tags, data->ctx, shared_tag);
+err_shared_tag:
+	if (clear_ctx_on_error)
+		data->ctx = NULL;
+	blk_queue_exit(q);
+	return NULL;
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -498,6 +511,8 @@  static void __blk_mq_free_request(struct request *rq)
 
 	blk_pm_mark_last_busy(rq);
 	rq->mq_hctx = NULL;
+	if (rq->shared_tag != -1)
+		blk_mq_put_tag(hctx->shared_tags, ctx, rq->shared_tag);
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -1070,6 +1085,14 @@  bool blk_mq_get_driver_tag(struct request *rq)
 		data.flags |= BLK_MQ_REQ_RESERVED;
 
 	shared = blk_mq_tag_busy(data.hctx);
+	if (rq && rq->mq_hctx && rq->mq_hctx->shared_tags) {
+		rq->shared_tag = blk_mq_get_shared_tag(&data);
+		if (rq->shared_tag == BLK_MQ_TAG_FAIL) {
+			/* no driver tag is held at this point, nothing to put */
+			rq->shared_tag = -1;
+			goto done;
+		}
+	}
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
 		if (shared) {
@@ -1077,6 +1100,9 @@  bool blk_mq_get_driver_tag(struct request *rq)
 			atomic_inc(&data.hctx->nr_active);
 		}
 		data.hctx->tags->rqs[rq->tag] = rq;
+	} else if (rq->shared_tag >= 0) {
+		blk_mq_put_tag(rq->mq_hctx->shared_tags, rq->mq_ctx, rq->shared_tag);
+		rq->shared_tag = -1;
 	}
 
 done:
@@ -2317,6 +2343,7 @@  static int blk_mq_init_hctx(struct request_queue *q,
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
+	hctx->shared_tags = set->shared_tags;
 
 	if (set->ops->init_hctx &&
 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
@@ -3100,6 +3127,22 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (ret)
 		goto out_free_mq_map;
 
+	if (set->flags & BLK_MQ_F_TAG_HCTX_SHARED) {
+		int node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], 0);
+		int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+
+		if (node == NUMA_NO_NODE)
+			node = set->numa_node;
+
+		set->shared_tags = blk_mq_init_tags(set->queue_depth,
+						    set->reserved_tags,
+						    node, alloc_policy);
+		if (!set->shared_tags) {
+			ret = -ENOMEM;
+			goto out_free_mq_map;
+		}
+	}
+
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 78d38b5f2793..c328d335de7d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -174,6 +174,14 @@  static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
 	return data->hctx->tags;
 }
 
+static inline struct blk_mq_tags *blk_mq_shared_tags_from_data(struct blk_mq_alloc_data *data)
+{
+	if (data->flags & BLK_MQ_REQ_INTERNAL)
+		return data->hctx->sched_tags;
+
+	return data->hctx->shared_tags;
+}
+
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -210,6 +218,7 @@  static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 {
 	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
 	rq->tag = -1;
+	rq->shared_tag = -1;
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
 		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 147185394a25..d3b402bd01a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -46,6 +46,7 @@  struct blk_mq_hw_ctx {
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
+	struct blk_mq_tags	*shared_tags;
 	struct blk_mq_tags	*sched_tags;
 
 	unsigned long		queued;
@@ -109,6 +110,7 @@  struct blk_mq_tag_set {
 	unsigned int		flags;		/* BLK_MQ_F_* */
 	void			*driver_data;
 
+	struct blk_mq_tags	*shared_tags;
 	struct blk_mq_tags	**tags;
 
 	struct mutex		tag_list_lock;
@@ -226,6 +228,7 @@  struct blk_mq_ops {
 enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_QUEUE_SHARED	= 1 << 1,
+	BLK_MQ_F_TAG_HCTX_SHARED	= 1 << 2,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3ea78b0c91c..a4caa6407b3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -138,6 +138,7 @@  struct request {
 	req_flags_t rq_flags;
 
 	int tag;
+	int shared_tag;
 	int internal_tag;
 
 	/* the following two fields are internal, NEVER access directly */