[3/8] blk-mq: Use a pointer for sbitmap

Message ID 20191126091416.20052-4-hare@suse.de (mailing list archive)
State New, archived
Series blk-mq/scsi: Provide hostwide shared tags for SCSI HBAs

Commit Message

Hannes Reinecke Nov. 26, 2019, 9:14 a.m. UTC
Instead of allocating the tag bitmap in place we should be using a
pointer. This is in preparation for shared host-wide bitmaps.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 block/bfq-iosched.c    |  4 +--
 block/blk-mq-debugfs.c |  8 +++---
 block/blk-mq-tag.c     | 68 +++++++++++++++++++++++++++++++-------------------
 block/blk-mq-tag.h     |  4 +--
 block/blk-mq.c         |  5 ++--
 block/kyber-iosched.c  |  4 +--
 6 files changed, 54 insertions(+), 39 deletions(-)

Comments

Jens Axboe Nov. 26, 2019, 3:14 p.m. UTC | #1
On 11/26/19 2:14 AM, Hannes Reinecke wrote:
> Instead of allocating the tag bitmap in place we should be using a
> pointer. This is in preparation for shared host-wide bitmaps.

Not a huge fan of this, it's an extra indirection in the hot path
of both submission and completion.
John Garry Nov. 26, 2019, 4:54 p.m. UTC | #2
On 26/11/2019 15:14, Jens Axboe wrote:
> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>> Instead of allocating the tag bitmap in place we should be using a
>> pointer. This is in preparation for shared host-wide bitmaps.
> 
> Not a huge fan of this, it's an extra indirection in the hot path
> of both submission and completion.

Hi Jens,

Thanks for having a look.

I checked the disassembly for blk_mq_get_tag() as a sample - which I 
assume is one hot path function which you care about - and the cost of 
the indirection is a load instruction instead of an add, denoted by ***, 
below:

Before:
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct 
blk_mq_alloc_data *data)
{
	if (data->flags & BLK_MQ_REQ_INTERNAL)
		return data->hctx->sched_tags;
      6ac:	a9554c64 	ldp	x4, x19, [x3, #336]

	return data->hctx->tags;
      6b0:	f27e003f 	tst	x1, #0x4
      6b4:	f9003ba0 	str	x0, [x29, #112]
      6b8:	a9078ba2 	stp	x2, x2, [x29, #120]
      6bc:	9a841273 	csel	x19, x19, x4, ne  // ne = any
	if (data->flags & BLK_MQ_REQ_RESERVED) {
      6c0:	36081021 	tbz	w1, #1, 8c4 <blk_mq_get_tag+0x264>
		if (unlikely(!tags->nr_reserved_tags)) {
      6c4:	b9400660 	ldr	w0, [x19, #4]

      6d4:	f90027ba 	str	x26, [x29, #72]
		tag_offset = 0;
      6c8:	52800018 	mov	w24, #0x0                   	// #0
		bt = &tags->breserved_tags;
      6cc:	91014273 	add	x19, x19, #0x50 ***
		if (unlikely(!tags->nr_reserved_tags)) {
      6d0:	340012e0 	cbz	w0, 92c <blk_mq_get_tag+0x2cc>
	tag = __blk_mq_get_tag(data, bt);
      6d8:	aa1303e1 	mov	x1, x19
      6dc:	aa1403e0 	mov	x0, x20
      6e0:	97fffe92 	bl	128 <__blk_mq_get_tag>

After:
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct 
blk_mq_alloc_data *data)
{
	if (data->flags & BLK_MQ_REQ_INTERNAL)
		return data->hctx->sched_tags;

      6b4:	a9550004 	ldp	x4, x0, [x0, #336]

	return data->hctx->tags;
      6ac:	f27e005f 	tst	x2, #0x4
      6b0:	f9003ba1 	str	x1, [x29, #112]
		return data->hctx->sched_tags;
      6b8:	a9078fa3 	stp	x3, x3, [x29, #120]
	return data->hctx->tags;
      6bc:	9a841000 	csel	x0, x0, x4, ne  // ne = any
	if (data->flags & BLK_MQ_REQ_RESERVED) {
      6c0:	36080fa2 	tbz	w2, #1, 8b4 <blk_mq_get_tag+0x254>
		if (unlikely(!tags->nr_reserved_tags)) {
      6c4:	b9400401 	ldr	w1, [x0, #4]
      6cc:	f90027ba 	str	x26, [x29, #72]
		tag_offset = 0;
      6d0:	52800017 	mov	w23, #0x0                   	// #0
		bt = tags->breserved_tags;

      6c8:	340012a1 	cbz	w1, 91c <blk_mq_get_tag+0x2bc>
      6d4:	f9400c14 	ldr	x20, [x0, #24] ***
	tag = __blk_mq_get_tag(data, bt);
      6d8:	aa1303e0 	mov	x0, x19
      6dc:	aa1403e1 	mov	x1, x20
      6e0:	97fffe92 	bl	128 <__blk_mq_get_tag>

This is arm64 disassembly.

I'm just saying this to provide some illustration of the potential 
performance impact of this change.

Thanks,
John
Jens Axboe Nov. 26, 2019, 5:11 p.m. UTC | #3
On 11/26/19 9:54 AM, John Garry wrote:
> On 26/11/2019 15:14, Jens Axboe wrote:
>> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>>> Instead of allocating the tag bitmap in place we should be using a
>>> pointer. This is in preparation for shared host-wide bitmaps.
>>
>> Not a huge fan of this, it's an extra indirection in the hot path
>> of both submission and completion.
> 
> Hi Jens,
> 
> Thanks for having a look.
> 
> I checked the disassembly for blk_mq_get_tag() as a sample - which I
> assume is one hot path function which you care about - and the cost of
> the indirection is a load instruction instead of an add, denoted by ***,
> below:

I'm not that worried about an extra instruction, my worry is the extra
load is from different memory. When it's embedded in the struct, we're
on the same cache line or adjacent.
John Garry Nov. 26, 2019, 5:23 p.m. UTC | #4
On 26/11/2019 17:11, Jens Axboe wrote:
> On 11/26/19 9:54 AM, John Garry wrote:
>> On 26/11/2019 15:14, Jens Axboe wrote:
>>> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>>>> Instead of allocating the tag bitmap in place we should be using a
>>>> pointer. This is in preparation for shared host-wide bitmaps.
>>>
>>> Not a huge fan of this, it's an extra indirection in the hot path
>>> of both submission and completion.
>>
>> Hi Jens,
>>
>> Thanks for having a look.
>>
>> I checked the disassembly for blk_mq_get_tag() as a sample - which I
>> assume is one hot path function which you care about - and the cost of
>> the indirection is a load instruction instead of an add, denoted by ***,
>> below:
> 

Hi Jens,

> I'm not that worried about an extra instruction, my worry is the extra
> load is from different memory. When it's embedded in the struct, we're
> on the same cache line or adjacent.

Right, so the earlier iteration of this series kept the embedded struct 
and we simply pointed at that, so I wouldn't expect a caching issue of 
different memory in that case.

Cheers,
John
Jens Axboe Nov. 26, 2019, 5:25 p.m. UTC | #5
On 11/26/19 10:23 AM, John Garry wrote:
> On 26/11/2019 17:11, Jens Axboe wrote:
>> On 11/26/19 9:54 AM, John Garry wrote:
>>> On 26/11/2019 15:14, Jens Axboe wrote:
>>>> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>>>>> Instead of allocating the tag bitmap in place we should be using a
>>>>> pointer. This is in preparation for shared host-wide bitmaps.
>>>>
>>>> Not a huge fan of this, it's an extra indirection in the hot path
>>>> of both submission and completion.
>>>
>>> Hi Jens,
>>>
>>> Thanks for having a look.
>>>
>>> I checked the disassembly for blk_mq_get_tag() as a sample - which I
>>> assume is one hot path function which you care about - and the cost of
>>> the indirection is a load instruction instead of an add, denoted by ***,
>>> below:
>>
> 
> Hi Jens,
> 
>> I'm not that worried about an extra instruction, my worry is the extra
>> load is from different memory. When it's embedded in the struct, we're
>> on the same cache line or adjacent.
> 
> Right, so the earlier iteration of this series kept the embedded struct
> and we simply pointed at that, so I wouldn't expect a caching issue of
> different memory in that case.

That would be a much better solution for the common case; my concern
here is slowing down the fast path for devices that don't need shared
tags.

Would be interesting to check the generated code for that, ideally we'd
get rid of the extra load for that case, even if it is in the same
cacheline.
John Garry Nov. 26, 2019, 6:08 p.m. UTC | #6
On 26/11/2019 17:25, Jens Axboe wrote:
> On 11/26/19 10:23 AM, John Garry wrote:
>> On 26/11/2019 17:11, Jens Axboe wrote:
>>> On 11/26/19 9:54 AM, John Garry wrote:
>>>> On 26/11/2019 15:14, Jens Axboe wrote:
>>>>> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>>>>>> Instead of allocating the tag bitmap in place we should be using a
>>>>>> pointer. This is in preparation for shared host-wide bitmaps.
>>>>>
>>>>> Not a huge fan of this, it's an extra indirection in the hot path
>>>>> of both submission and completion.
>>>>
>>>> Hi Jens,
>>>>
>>>> Thanks for having a look.
>>>>
>>>> I checked the disassembly for blk_mq_get_tag() as a sample - which I
>>>> assume is one hot path function which you care about - and the cost of
>>>> the indirection is a load instruction instead of an add, denoted by ***,
>>>> below:
>>>
>>
>> Hi Jens,
>>
>>> I'm not that worried about an extra instruction, my worry is the extra
>>> load is from different memory. When it's embedded in the struct, we're
>>> on the same cache line or adjacent.
>>
>> Right, so the earlier iteration of this series kept the embedded struct
>> and we simply pointed at that, so I wouldn't expect a caching issue of
>> different memory in that case.
> 

Hi Jens,

> That would be a much better solution for the common case, my concern
> here is slowing down the fast path for device that don't need shared
> tags.
> 
> Would be interesting to check the generated code for that, ideally we'd
> get rid of the extra load for that case, even if it is in the same
> cacheline.
> 

I checked the disassembly and we still have the load instead of the add.

This is not surprising, as the compiler would not know for certain that 
we point to a field within the same struct. But at least we should still 
be pointing to nearby memory.

Note that the pointer could be dropped, which would remove the load, but 
then we have many if-elses which could be slower, not to mention that 
the blk-mq-tag code deals in bitmap pointers anyway.
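
For illustration, without the pointer every hot-path site would need to 
open-code something like this (rough sketch only - the shared-case check 
and where the host-wide bitmap would live are made up):

	if (blk_mq_tags_shared(hctx))	/* hypothetical check */
		bt = &hctx->queue->tag_set->shared_bitmap_tags;	/* hypothetical field */
	else
		bt = &hctx->tags->bitmap_tags;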

Thanks,
John
Jens Axboe Nov. 27, 2019, 1:46 a.m. UTC | #7
On 11/26/19 11:08 AM, John Garry wrote:
> On 26/11/2019 17:25, Jens Axboe wrote:
>> On 11/26/19 10:23 AM, John Garry wrote:
>>> On 26/11/2019 17:11, Jens Axboe wrote:
>>>> On 11/26/19 9:54 AM, John Garry wrote:
>>>>> On 26/11/2019 15:14, Jens Axboe wrote:
>>>>>> On 11/26/19 2:14 AM, Hannes Reinecke wrote:
>>>>>>> Instead of allocating the tag bitmap in place we should be using a
>>>>>>> pointer. This is in preparation for shared host-wide bitmaps.
>>>>>>
>>>>>> Not a huge fan of this, it's an extra indirection in the hot path
>>>>>> of both submission and completion.
>>>>>
>>>>> Hi Jens,
>>>>>
>>>>> Thanks for having a look.
>>>>>
>>>>> I checked the disassembly for blk_mq_get_tag() as a sample - which I
>>>>> assume is one hot path function which you care about - and the cost of
>>>>> the indirection is a load instruction instead of an add, denoted by ***,
>>>>> below:
>>>>
>>>
>>> Hi Jens,
>>>
>>>> I'm not that worried about an extra instruction, my worry is the extra
>>>> load is from different memory. When it's embedded in the struct, we're
>>>> on the same cache line or adjacent.
>>>
>>> Right, so the earlier iteration of this series kept the embedded struct
>>> and we simply pointed at that, so I wouldn't expect a caching issue of
>>> different memory in that case.
>>
> 
> Hi Jens,
> 
>> That would be a much better solution for the common case, my concern
>> here is slowing down the fast path for device that don't need shared
>> tags.
>>
>> Would be interesting to check the generated code for that, ideally we'd
>> get rid of the extra load for that case, even if it is in the same
>> cacheline.
>>
> 
> I checked the disassembly and we still have the load instead of the add.
> 
> This is not surprising, as the compiler would not know for certain that
> we point to a field within the same struct. But at least we still should
> point to a close memory.
> 
> Note that the pointer could be dropped, which would remove the load, but
> then we have many if-elses which could be slower, not to mention that
> the blk-mq-tag code deals in bitmap pointers anyway.

It might still be worthwhile to do:

if (tags->ptr == &tags->__default)
	foo(&tags->__default);

to make it clear, as that branch will predict easily. If it can be done
in a nice enough fashion and not sprinkled everywhere, in some fashion.
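
E.g. a single inline helper could keep that check in one spot (untested 
sketch; the ptr/__default names just follow the pseudo-code above and 
are made up):

static inline struct sbitmap_queue *blk_mq_tags_bitmap(struct blk_mq_tags *tags)
{
	/* non-shared case: result is just tags + offsetof(__default) */
	if (tags->ptr == &tags->__default)
		return &tags->__default;
	/* shared case: use the separately allocated bitmap */
	return tags->ptr;
}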

Should be testable, though.
John Garry Nov. 27, 2019, 1:05 p.m. UTC | #8
On 27/11/2019 01:46, Jens Axboe wrote:
>>> Would be interesting to check the generated code for that, ideally we'd
>>> get rid of the extra load for that case, even if it is in the same
>>> cacheline.
>>>
>> I checked the disassembly and we still have the load instead of the add.
>>
>> This is not surprising, as the compiler would not know for certain that
>> we point to a field within the same struct. But at least we still should
>> point to a close memory.
>>
>> Note that the pointer could be dropped, which would remove the load, but
>> then we have many if-elses which could be slower, not to mention that
>> the blk-mq-tag code deals in bitmap pointers anyway.

Hi Jens,

> It might still be worthwhile to do:
> 
> if (tags->ptr == &tags->__default)
> 	foo(&tags->__default);
> 
> to make it clear, as that branch will predict easily. 

Not sure. So this code does produce the same assembly, as we still need 
to do the tags->ptr load for the comparison.

And then if you consider blk_mq_get_tag() as an example, there is no 
other hot value available to indicate whether the shared tags are in 
use, which we could check to decide whether to use &tags->__default.

I'll consider it more.

> If it can be done
> in a nice enough fashion and not sprinkled everywhere, in some fashion.
> 
> Should be testable, though.
> 
> -- Jens Axboe

Thanks,
John
Hannes Reinecke Nov. 27, 2019, 1:12 p.m. UTC | #9
On 11/27/19 2:05 PM, John Garry wrote:
> On 27/11/2019 01:46, Jens Axboe wrote:
>>>> Would be interesting to check the generated code for that, ideally we'd
>>>> get rid of the extra load for that case, even if it is in the same
>>>> cacheline.
>>>>
>>> I checked the disassembly and we still have the load instead of the add.
>>>
>>> This is not surprising, as the compiler would not know for certain that
>>> we point to a field within the same struct. But at least we still should
>>> point to a close memory.
>>>
>>> Note that the pointer could be dropped, which would remove the load, but
>>> then we have many if-elses which could be slower, not to mention that
>>> the blk-mq-tag code deals in bitmap pointers anyway.
> 
> Hi Jens,
> 
>> It might still be worthwhile to do:
>>
>> if (tags->ptr == &tags->__default)
>>     foo(&tags->__default);
>>
>> to make it clear, as that branch will predict easily. 
> 
> Not sure. So this code does produce the same assembly, as we still need
> to do the tags->ptr load for the comparison.
> 
> And then if you consider blk_mq_get_tags() as an example, there is no
> other hot value available to indicate whether the shared tags are used
> to decide whether to use  &tags->__default.
> 
> I'll consider it more.
> 
After talking to our compiler folks I guess it should be possible to
resurrect your original patch (with both the embedded bitmap and the
pointer to the bitmap), linking the pointer to the embedded bitmap in the
non-shared case.

Then we can access the pointer in all cases, but in the non-shared case
that will then point to the embedded one, thus avoiding the possible
cache miss.
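
I.e. roughly this (untested sketch, field names only illustrative):

struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	atomic_t active_queues;

	struct sbitmap_queue *bitmap_tags;	/* what the hot path uses */
	struct sbitmap_queue *breserved_tags;
	struct sbitmap_queue __bitmap_tags;	/* embedded defaults */
	struct sbitmap_queue __breserved_tags;

	/* rqs, static_rqs etc. unchanged */
};

/* non-shared case: link the pointers back to the embedded bitmaps */
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;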

I'll see how that goes.

Cheers,

Hannes
Jens Axboe Nov. 27, 2019, 2:20 p.m. UTC | #10
On 11/27/19 6:12 AM, Hannes Reinecke wrote:
> On 11/27/19 2:05 PM, John Garry wrote:
>> On 27/11/2019 01:46, Jens Axboe wrote:
>>>>> Would be interesting to check the generated code for that, ideally we'd
>>>>> get rid of the extra load for that case, even if it is in the same
>>>>> cacheline.
>>>>>
>>>> I checked the disassembly and we still have the load instead of the add.
>>>>
>>>> This is not surprising, as the compiler would not know for certain that
>>>> we point to a field within the same struct. But at least we still should
>>>> point to a close memory.
>>>>
>>>> Note that the pointer could be dropped, which would remove the load, but
>>>> then we have many if-elses which could be slower, not to mention that
>>>> the blk-mq-tag code deals in bitmap pointers anyway.
>>
>> Hi Jens,
>>
>>> It might still be worthwhile to do:
>>>
>>> if (tags->ptr == &tags->__default)
>>>      foo(&tags->__default);
>>>
>>> to make it clear, as that branch will predict easily.
>>
>> Not sure. So this code does produce the same assembly, as we still need
>> to do the tags->ptr load for the comparison.
>>
>> And then if you consider blk_mq_get_tags() as an example, there is no
>> other hot value available to indicate whether the shared tags are used
>> to decide whether to use  &tags->__default.
>>
>> I'll consider it more.
>>
> After talking to our compiler folks I guess it should be possible to
> resurrect your original patch (with both the embedded bitmap and the
> point to the bitmap), linking the pointer to the embedded bitmap in the
> non-shared case.
> 
> Then we can access the pointer in all cases, but in the non-shared case
> that will then point to the embedded one, thus avoiding the possible
> cache miss.
> 
> I'll see how that goes.

That's exactly what I suggested yesterday; the discussion now is on how
to make that work in the cleanest way.
Jens Axboe Nov. 27, 2019, 2:21 p.m. UTC | #11
On 11/27/19 6:05 AM, John Garry wrote:
> On 27/11/2019 01:46, Jens Axboe wrote:
>>>> Would be interesting to check the generated code for that, ideally we'd
>>>> get rid of the extra load for that case, even if it is in the same
>>>> cacheline.
>>>>
>>> I checked the disassembly and we still have the load instead of the add.
>>>
>>> This is not surprising, as the compiler would not know for certain that
>>> we point to a field within the same struct. But at least we still should
>>> point to a close memory.
>>>
>>> Note that the pointer could be dropped, which would remove the load, but
>>> then we have many if-elses which could be slower, not to mention that
>>> the blk-mq-tag code deals in bitmap pointers anyway.
> 
> Hi Jens,
> 
>> It might still be worthwhile to do:
>>
>> if (tags->ptr == &tags->__default)
>> 	foo(&tags->__default);
>>
>> to make it clear, as that branch will predict easily.
> 
> Not sure. So this code does produce the same assembly, as we still need
> to do the tags->ptr load for the comparison.

How can it be the same? The approach in the patchset needs to load
*tags->ptr, this one needs tags->ptr. That's the big difference.
John Garry Nov. 27, 2019, 2:44 p.m. UTC | #12
On 27/11/2019 14:21, Jens Axboe wrote:
> On 11/27/19 6:05 AM, John Garry wrote:
>> On 27/11/2019 01:46, Jens Axboe wrote:
>>>>> Would be interesting to check the generated code for that, ideally 
>>>>> we'd
>>>>> get rid of the extra load for that case, even if it is in the same
>>>>> cacheline.
>>>>>
>>>> I checked the disassembly and we still have the load instead of the 
>>>> add.
>>>>
>>>> This is not surprising, as the compiler would not know for certain that
>>>> we point to a field within the same struct. But at least we still 
>>>> should
>>>> point to a close memory.
>>>>
>>>> Note that the pointer could be dropped, which would remove the load, 
>>>> but
>>>> then we have many if-elses which could be slower, not to mention that
>>>> the blk-mq-tag code deals in bitmap pointers anyway.
>>
>> Hi Jens,
>>
>>> It might still be worthwhile to do:
>>>
>>> if (tags->ptr == &tags->__default)
>>>     foo(&tags->__default);
>>>
>>> to make it clear, as that branch will predict easily.
>>
>> Not sure. So this code does produce the same assembly, as we still need
>> to do the tags->ptr load for the comparison.
> 

Hi Jens,

> How can it be the same? The approach in the patchset needs to load
> *tags->ptr, this one needs tags->ptr. That's the big difference.
> 

In the patch for this thread, we have:

@@ -121,10 +121,10 @@ unsigned int blk_mq_get_tag(struct 
blk_mq_alloc_data *data)
  			WARN_ON_ONCE(1);
  			return BLK_MQ_TAG_FAIL;
  		}
-		bt = &tags->breserved_tags;
+		bt = tags->breserved_tags;
  		tag_offset = 0;
  	} else {
-		bt = &tags->bitmap_tags;
+		bt = tags->bitmap_tags;
  		tag_offset = tags->nr_reserved_tags;
  	}


So current code gets bt pointer by simply offsetting a certain distance 
from tags pointer - that is the add I mention.

With the change in this patch, we need to load memory at address 
&tags->bitmap_tags to get bt - this is the load I mention.

So for this:

if (tags->ptr == &tags->__default)

We load &tags->ptr to get the pointer value for comparison vs 
&tags->__default.

There must be something I'm missing...

Thanks,
John
Hannes Reinecke Nov. 27, 2019, 4:52 p.m. UTC | #13
On 11/27/19 3:44 PM, John Garry wrote:
> On 27/11/2019 14:21, Jens Axboe wrote:
>> On 11/27/19 6:05 AM, John Garry wrote:
>>> On 27/11/2019 01:46, Jens Axboe wrote:
>>>>>> Would be interesting to check the generated code for that, ideally 
>>>>>> we'd
>>>>>> get rid of the extra load for that case, even if it is in the same
>>>>>> cacheline.
>>>>>>
>>>>> I checked the disassembly and we still have the load instead of the 
>>>>> add.
>>>>>
>>>>> This is not surprising, as the compiler would not know for certain 
>>>>> that
>>>>> we point to a field within the same struct. But at least we still 
>>>>> should
>>>>> point to a close memory.
>>>>>
>>>>> Note that the pointer could be dropped, which would remove the 
>>>>> load, but
>>>>> then we have many if-elses which could be slower, not to mention that
>>>>> the blk-mq-tag code deals in bitmap pointers anyway.
>>>
>>> Hi Jens,
>>>
>>>> It might still be worthwhile to do:
>>>>
>>>> if (tags->ptr == &tags->__default)
>>>>     foo(&tags->__default);
>>>>
>>>> to make it clear, as that branch will predict easily.
>>>
>>> Not sure. So this code does produce the same assembly, as we still need
>>> to do the tags->ptr load for the comparison.
>>
> 
> Hi Jens,
> 
>> How can it be the same? The approach in the patchset needs to load
>> *tags->ptr, this one needs tags->ptr. That's the big difference.
>>
> 
> In the patch for this thread, we have:
> 
> @@ -121,10 +121,10 @@ unsigned int blk_mq_get_tag(struct 
> blk_mq_alloc_data *data)
>               WARN_ON_ONCE(1);
>               return BLK_MQ_TAG_FAIL;
>           }
> -        bt = &tags->breserved_tags;
> +        bt = tags->breserved_tags;
>           tag_offset = 0;
>       } else {
> -        bt = &tags->bitmap_tags;
> +        bt = tags->bitmap_tags;
>           tag_offset = tags->nr_reserved_tags;
>       }
> 
> 
> So current code gets bt pointer by simply offsetting a certain distance 
> from tags pointer - that is the add I mention.
> 
> With the change in this patch, we need to load memory at address 
> &tags->bitmap_tags to get bt - this is the load I mention.
> 
> So for this:
> 
> if (tags->ptr == &tags->__default)
> 
> We load &tags->ptr to get the pointer value for comparison vs 
> &tags->__default.
> 
> There must be something I'm missing...
> 
The point here was that the load might refer to _other_ memory locations 
(as it's being allocated separately), thus incurring a cache miss.
With embedded tag bitmaps we'll load from the same cache line 
(hopefully), and won't get a performance hit.

Cheers,

Hannes
John Garry Nov. 27, 2019, 6:02 p.m. UTC | #14
On 27/11/2019 16:52, Hannes Reinecke wrote:
> On 11/27/19 3:44 PM, John Garry wrote:
>> On 27/11/2019 14:21, Jens Axboe wrote:
>>> On 11/27/19 6:05 AM, John Garry wrote:
>>>> On 27/11/2019 01:46, Jens Axboe wrote:
>>>>>>> Would be interesting to check the generated code for that, 
>>>>>>> ideally we'd
>>>>>>> get rid of the extra load for that case, even if it is in the same
>>>>>>> cacheline.
>>>>>>>
>>>>>> I checked the disassembly and we still have the load instead of 
>>>>>> the add.
>>>>>>
>>>>>> This is not surprising, as the compiler would not know for certain 
>>>>>> that
>>>>>> we point to a field within the same struct. But at least we still 
>>>>>> should
>>>>>> point to a close memory.
>>>>>>
>>>>>> Note that the pointer could be dropped, which would remove the 
>>>>>> load, but
>>>>>> then we have many if-elses which could be slower, not to mention that
>>>>>> the blk-mq-tag code deals in bitmap pointers anyway.
>>>>
>>>> Hi Jens,
>>>>
>>>>> It might still be worthwhile to do:
>>>>>
>>>>> if (tags->ptr == &tags->__default)
>>>>>     foo(&tags->__default);
>>>>>
>>>>> to make it clear, as that branch will predict easily.
>>>>
>>>> Not sure. So this code does produce the same assembly, as we still need
>>>> to do the tags->ptr load for the comparison.
>>>
>>
>> Hi Jens,
>>
>>> How can it be the same? The approach in the patchset needs to load
>>> *tags->ptr, this one needs tags->ptr. That's the big difference.
>>>
>>
>> In the patch for this thread, we have:
>>
>> @@ -121,10 +121,10 @@ unsigned int blk_mq_get_tag(struct 
>> blk_mq_alloc_data *data)
>>               WARN_ON_ONCE(1);
>>               return BLK_MQ_TAG_FAIL;
>>           }
>> -        bt = &tags->breserved_tags;
>> +        bt = tags->breserved_tags;
>>           tag_offset = 0;
>>       } else {
>> -        bt = &tags->bitmap_tags;
>> +        bt = tags->bitmap_tags;
>>           tag_offset = tags->nr_reserved_tags;
>>       }
>>
>>
>> So current code gets bt pointer by simply offsetting a certain 
>> distance from tags pointer - that is the add I mention.
>>
>> With the change in this patch, we need to load memory at address 
>> &tags->bitmap_tags to get bt - this is the load I mention.
>>
>> So for this:
>>
>> if (tags->ptr == &tags->__default)
>>
>> We load &tags->ptr to get the pointer value for comparison vs 
>> &tags->__default.
>>
>> There must be something I'm missing...
>>
> The point here was that the load might refer to _other_ memory locations 
> (as it's being allocated separately),

I think that we're talking about something different.

> thus incurring a cache miss.
> With embedded tag bitmaps we'll load from the same cache line 
> (hopefully), and won't get a performance hit.

But I'll just wait to see what you come up with.

Thanks,
John

Patch

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0319d6339822..ca89d0c34994 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6327,8 +6327,8 @@  static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
 	struct blk_mq_tags *tags = hctx->sched_tags;
 	unsigned int min_shallow;
 
-	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
-	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+	min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
+	sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
 }
 
 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 33a40ae1d60f..fca87470e267 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -449,11 +449,11 @@  static void blk_mq_debugfs_tags_show(struct seq_file *m,
 		   atomic_read(&tags->active_queues));
 
 	seq_puts(m, "\nbitmap_tags:\n");
-	sbitmap_queue_show(&tags->bitmap_tags, m);
+	sbitmap_queue_show(tags->bitmap_tags, m);
 
 	if (tags->nr_reserved_tags) {
 		seq_puts(m, "\nbreserved_tags:\n");
-		sbitmap_queue_show(&tags->breserved_tags, m);
+		sbitmap_queue_show(tags->breserved_tags, m);
 	}
 }
 
@@ -484,7 +484,7 @@  static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
 	if (res)
 		goto out;
 	if (hctx->tags)
-		sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
+		sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
 	mutex_unlock(&q->sysfs_lock);
 
 out:
@@ -518,7 +518,7 @@  static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
 	if (res)
 		goto out;
 	if (hctx->sched_tags)
-		sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
+		sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
 	mutex_unlock(&q->sysfs_lock);
 
 out:
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d7aa23c82dbf..332d71cd3976 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -20,7 +20,7 @@  bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 	if (!tags)
 		return true;
 
-	return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
+	return sbitmap_any_bit_clear(&tags->bitmap_tags->sb);
 }
 
 /*
@@ -43,9 +43,9 @@  bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
  */
 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 {
-	sbitmap_queue_wake_all(&tags->bitmap_tags);
+	sbitmap_queue_wake_all(tags->bitmap_tags);
 	if (include_reserve)
-		sbitmap_queue_wake_all(&tags->breserved_tags);
+		sbitmap_queue_wake_all(tags->breserved_tags);
 }
 
 /*
@@ -121,10 +121,10 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 			WARN_ON_ONCE(1);
 			return BLK_MQ_TAG_FAIL;
 		}
-		bt = &tags->breserved_tags;
+		bt = tags->breserved_tags;
 		tag_offset = 0;
 	} else {
-		bt = &tags->bitmap_tags;
+		bt = tags->bitmap_tags;
 		tag_offset = tags->nr_reserved_tags;
 	}
 
@@ -170,9 +170,9 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 						data->ctx);
 		tags = blk_mq_tags_from_data(data);
 		if (data->flags & BLK_MQ_REQ_RESERVED)
-			bt = &tags->breserved_tags;
+			bt = tags->breserved_tags;
 		else
-			bt = &tags->bitmap_tags;
+			bt = tags->bitmap_tags;
 
 		/*
 		 * If destination hw queue is changed, fake wake up on
@@ -198,10 +198,10 @@  void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);
-		sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
+		sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
 	} else {
 		BUG_ON(tag >= tags->nr_reserved_tags);
-		sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
+		sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
 	}
 }
 
@@ -329,8 +329,8 @@  static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
 		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
-		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
-	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
+		bt_tags_for_each(tags, tags->breserved_tags, fn, priv, true);
+	bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, false);
 }
 
 /**
@@ -427,8 +427,8 @@  void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 			continue;
 
 		if (tags->nr_reserved_tags)
-			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
-		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+			bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
+		bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
 	}
 	blk_queue_exit(q);
 }
@@ -440,24 +440,34 @@  static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
 				       node);
 }
 
-static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
-						   int node, int alloc_policy)
+static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+				   int node, int alloc_policy)
 {
 	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
 	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
 
-	if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
+	tags->bitmap_tags = kzalloc(sizeof(struct sbitmap_queue), GFP_KERNEL);
+	if (!tags->bitmap_tags)
+		return -ENOMEM;
+	if (bt_alloc(tags->bitmap_tags, depth, round_robin, node))
 		goto free_tags;
-	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
-		     node))
+
+	tags->breserved_tags = kzalloc(sizeof(struct sbitmap_queue),
+				       GFP_KERNEL);
+	if (!tags->breserved_tags)
 		goto free_bitmap_tags;
+	if (bt_alloc(tags->breserved_tags, tags->nr_reserved_tags,
+		     round_robin, node))
+		goto free_reserved_tags;
 
-	return tags;
+	return 0;
+free_reserved_tags:
+	kfree(tags->breserved_tags);
 free_bitmap_tags:
-	sbitmap_queue_free(&tags->bitmap_tags);
+	sbitmap_queue_free(tags->bitmap_tags);
 free_tags:
-	kfree(tags);
-	return NULL;
+	kfree(tags->bitmap_tags);
+	return -ENOMEM;
 }
 
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
@@ -478,13 +488,19 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
-	return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
+	if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
+		kfree(tags);
+		tags = NULL;
+	}
+	return tags;
 }
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
-	sbitmap_queue_free(&tags->bitmap_tags);
-	sbitmap_queue_free(&tags->breserved_tags);
+	sbitmap_queue_free(tags->bitmap_tags);
+	kfree(tags->bitmap_tags);
+	sbitmap_queue_free(tags->breserved_tags);
+	kfree(tags->breserved_tags);
 	kfree(tags);
 }
 
@@ -534,7 +550,7 @@  int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 		 * Don't need (or can't) update reserved tags here, they
 		 * remain static and should never need resizing.
 		 */
-		sbitmap_queue_resize(&tags->bitmap_tags,
+		sbitmap_queue_resize(tags->bitmap_tags,
 				tdepth - tags->nr_reserved_tags);
 	}
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 6c0f7c9ce9f6..10b66fd4664a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -13,8 +13,8 @@  struct blk_mq_tags {
 
 	atomic_t active_queues;
 
-	struct sbitmap_queue bitmap_tags;
-	struct sbitmap_queue breserved_tags;
+	struct sbitmap_queue *bitmap_tags;
+	struct sbitmap_queue *breserved_tags;
 
 	struct request **rqs;
 	struct request **static_rqs;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6b39cf0efdcd..f0c40fcbd8ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1095,7 +1095,7 @@  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 		struct sbitmap_queue *sbq;
 
 		list_del_init(&wait->entry);
-		sbq = &hctx->tags->bitmap_tags;
+		sbq = hctx->tags->bitmap_tags;
 		atomic_dec(&sbq->ws_active);
 	}
 	spin_unlock(&hctx->dispatch_wait_lock);
@@ -1113,7 +1113,7 @@  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 				 struct request *rq)
 {
-	struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
+	struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
 	struct wait_queue_head *wq;
 	wait_queue_entry_t *wait;
 	bool ret;
@@ -2081,7 +2081,6 @@  void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 	tags->rqs = NULL;
 	kfree(tags->static_rqs);
 	tags->static_rqs = NULL;
-
 	blk_mq_free_tags(tags);
 }
 
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 34dcea0ef637..a7a537501d70 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -359,7 +359,7 @@  static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 	 * All of the hardware queues have the same depth, so we can just grab
 	 * the shift of the first one.
 	 */
-	return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
+	return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
@@ -502,7 +502,7 @@  static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	khd->batching = 0;
 
 	hctx->sched_data = khd;
-	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+	sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
 					kqd->async_depth);
 
 	return 0;