[RFC] io_uring/rsrc: add last-lookup cache hit to io_rsrc_node_lookup()

Message ID db316d73-cb32-4f7f-beb0-68f253f5e0c5@kernel.dk (mailing list archive)
State New
Series [RFC] io_uring/rsrc: add last-lookup cache hit to io_rsrc_node_lookup()

Commit Message

Jens Axboe Oct. 30, 2024, 4:58 p.m. UTC
This avoids array_index_nospec() for repeated lookups on the same node,
which can be quite common (and costly). If a cached node is removed from
the given table, it'll get cleared in the cache as well.
io_reset_rsrc_node() takes care of that, and it is used in the spots
that replace a node.

Note: need to double check this is 100% safe wrt speculation, but I
believe it should be as we're not using the passed in value to index
any arrays (directly).

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

Sending this out as an RFC, as array_index_nospec() can cause stalls for
frequent lookups. For buffers, it's not unusual to have larger regions
registered, which means hitting the same resource node lookup all the
time.

At the same time, I'm not 100% certain on the sanity of this. Before
you'd always do:

index = array_index_nospec(index, max_nr);
node = some_table[index];

and now you can do:

if (index == last_index)
	return last_node;
last_node = some_table[array_index_nospec(index, max_nr)];
last_index = index;
return last_node;

which _seems_ like it should be safe as no array indexing occurs. Hence
the Jann CC :-)

Comments

Jann Horn Oct. 30, 2024, 5:20 p.m. UTC | #1
On Wed, Oct 30, 2024 at 5:58 PM Jens Axboe <axboe@kernel.dk> wrote:
> This avoids array_index_nospec() for repeated lookups on the same node,
> which can be quite common (and costly). If a cached node is removed from

You're saying array_index_nospec() can be quite costly - which
architecture is this on? Is this the cost of the compare+subtract+and
making the critical path longer?
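
For reference, the generic fallback in include/linux/nospec.h is
essentially the branchless sequence below (a simplified sketch; the
kernel version also hides the index from the optimizer, and x86 and
arm64 both override it with asm versions):

static inline unsigned long array_index_mask_nospec(unsigned long index,
						    unsigned long size)
{
	/*
	 * The sign bit of (index | (size - 1 - index)) is clear iff
	 * index < size; inverting it and arithmetic-shifting it across
	 * the word yields ~0UL in bounds and 0 out of bounds, so
	 * "index & mask" can never speculatively escape the table.
	 */
	return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}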

> the given table, it'll get cleared in the cache as well.
> io_reset_rsrc_node() takes care of that, and it is used in the spots
> that replace a node.
>
> Note: need to double check this is 100% safe wrt speculation, but I
> believe it should be as we're not using the passed in value to index
> any arrays (directly).
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>
> ---
>
> Sending this out as an RFC, as array_index_nospec() can cause stalls for
> frequent lookups. For buffers, it's not unusual to have larger regions
> registered, which means hitting the same resource node lookup all the
> time.
>
> At the same time, I'm not 100% certain on the sanity of this. Before
> you'd always do:
>
> index = array_index_nospec(index, max_nr);
> node = some_table[index];
>
> and now you can do:
>
> if (index == last_index)
>         return last_node;
> last_node = some_table[array_index_nospec(index, max_nr)];
> last_index = index;
> return last_node;
>
> which _seems_ like it should be safe as no array indexing occurs. Hence
> the Jann CC :-)

I guess the overall approach should be safe as long as you make sure
that last_node is always valid or NULL, though I wonder if it wouldn't
be a more straightforward improvement to make sure the table has a
power-of-two size and then use a bitwise AND to truncate the
index... with that I think you'd maybe just have a single-cycle
lengthening of the critical path? Though we would need a new helper
for that in nospec.h.
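
Something like this hypothetical helper, say array_index_pow2() (not an
existing nospec.h API, just a sketch of the idea; and per the later
discussion in this thread, a plain AND may not be enough on arm64 due
to data value prediction):

static inline unsigned int array_index_pow2(unsigned int index,
					    unsigned int size_pow2)
{
	/* size_pow2 must be a power of two; the AND then clamps any
	 * index, architecturally or speculatively, to within the table */
	return index & (size_pow2 - 1);
}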

> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 77fd508d043a..c283179b0c89 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -57,6 +57,8 @@ struct io_wq_work {
>
>  struct io_rsrc_data {
>         unsigned int                    nr;
> +       unsigned int                    last_index;
> +       struct io_rsrc_node             *last_node;
>         struct io_rsrc_node             **nodes;
>  };
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 9829c51105ed..413d003bc5d7 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -139,6 +139,8 @@ __cold void io_rsrc_data_free(struct io_rsrc_data *data)
>                 if (data->nodes[data->nr])
>                         io_put_rsrc_node(data->nodes[data->nr]);
>         }
> +       data->last_node = NULL;
> +       data->last_index = -1U;
>         kvfree(data->nodes);
>         data->nodes = NULL;
>         data->nr = 0;
> @@ -150,6 +152,7 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>                                         GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>         if (data->nodes) {
>                 data->nr = nr;
> +               data->last_index = -1U;
>                 return 0;
>         }
>         return -ENOMEM;
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index a40fad783a69..e2795daa877d 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -70,8 +70,16 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
>  static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
>                                                        int index)
>  {
> -       if (index < data->nr)
> -               return data->nodes[array_index_nospec(index, data->nr)];
> +       if (index < data->nr) {
> +               if (index != data->last_index) {
> +                       index = array_index_nospec(index, data->nr);
> +                       if (data->nodes[index]) {

I guess I'm not sure if eliding the array_index_nospec() is worth
adding a new branch here that you could mispredict... probably depends
on your workload, I guess?

> +                               data->last_index = index;
> +                               data->last_node = data->nodes[index];

This seems a bit functionally broken - if data->nodes[index] is NULL,
you just leave data->last_node set to its previous value and return
that?


> +                       }
> +               }
> +               return data->last_node;
> +       }
>         return NULL;
>  }
>
> @@ -85,8 +93,14 @@ static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
>  {
>         struct io_rsrc_node *node = data->nodes[index];
>
> -       if (!node)
> +       if (!node) {
> +               WARN_ON_ONCE(index == data->last_index);
>                 return false;
> +       }
> +       if (index == data->last_index) {
> +               data->last_node = NULL;
> +               data->last_index = -1U;
> +       }
>         io_put_rsrc_node(node);
>         data->nodes[index] = NULL;
>         return true;
>
> --
> Jens Axboe
>
Jens Axboe Oct. 30, 2024, 8:25 p.m. UTC | #2
On 10/30/24 11:20 AM, Jann Horn wrote:
> On Wed, Oct 30, 2024 at 5:58 PM Jens Axboe <axboe@kernel.dk> wrote:
>> This avoids array_index_nospec() for repeated lookups on the same node,
>> which can be quite common (and costly). If a cached node is removed from
> 
> You're saying array_index_nospec() can be quite costly - which
> architecture is this on? Is this the cost of the compare+subtract+and
> making the critical path longer?

Tested this on arm64, in a vm to be specific. Let me try and generate
some numbers/profiles on x86-64 as well. It's noticeable there as well,
though not quite as bad as the below example. For arm64, with the patch,
we get roughly 8.7% of the time spent getting a resource - without it, it's
66% of the time. This is just doing a microbenchmark, but it clearly
shows that anything following the barrier on arm64 is very costly:

  0.98 │       ldr   x21, [x0, #96]
       │     ↓ tbnz  w2, #1, b8
  1.04 │       ldr   w1, [x21, #144]
       │       cmp   w1, w19
       │     ↓ b.ls  a0
       │ 30:   mov   w1, w1
       │       sxtw  x0, w19
       │       cmp   x0, x1
       │       ngc   x0, xzr
       │       csdb
       │       ldr   x1, [x21, #160]
       │       and   w19, w19, w0
 93.98 │       ldr   x19, [x1, w19, sxtw #3]

and accounts for most of that 66% of the total cost of the micro bench,
even though it's doing a ton more stuff than simply getting this node
via a lookup.

>> the given table, it'll get cleared in the cache as well.
>> io_reset_rsrc_node() takes care of that, and it is used in the spots
>> that replace a node.
>>
>> Note: need to double check this is 100% safe wrt speculation, but I
>> believe it should be as we're not using the passed in value to index
>> any arrays (directly).
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>
>> ---
>>
>> Sending this out as an RFC, as array_index_nospec() can cause stalls for
>> frequent lookups. For buffers, it's not unusual to have larger regions
>> registered, which means hitting the same resource node lookup all the
>> time.
>>
>> At the same time, I'm not 100% certain on the sanity of this. Before
>> you'd always do:
>>
>> index = array_index_nospec(index, max_nr);
>> node = some_table[index];
>>
>> and now you can do:
>>
>> if (index == last_index)
>>         return last_node;
>> last_node = some_table[array_index_nospec(index, max_nr)];
>> last_index = index;
>> return last_node;
>>
>> which _seems_ like it should be safe as no array indexing occurs. Hence
>> the Jann CC :-)
> 
> I guess the overall approach should be safe as long as you make sure
> that last_node is always valid or NULL, though I wonder if it wouldn't

Right, that obviously has to be true.

> be a more straightforward improvement to make sure the table has a
> power-of-two size and then use a bitwise AND to truncate the
> index... with that I think you'd maybe just have a single-cycle
> lengthening of the critical path? Though we would need a new helper
> for that in nospec.h.

That might work too. I don't necessarily control the size of the array,
but I could indeed just oversize it to ensure it's a power of two.
That'd certainly help code generation for the truncation, but not get
rid of the csdb()?

>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index 77fd508d043a..c283179b0c89 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -57,6 +57,8 @@ struct io_wq_work {
>>
>>  struct io_rsrc_data {
>>         unsigned int                    nr;
>> +       unsigned int                    last_index;
>> +       struct io_rsrc_node             *last_node;
>>         struct io_rsrc_node             **nodes;
>>  };
>>
>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>> index 9829c51105ed..413d003bc5d7 100644
>> --- a/io_uring/rsrc.c
>> +++ b/io_uring/rsrc.c
>> @@ -139,6 +139,8 @@ __cold void io_rsrc_data_free(struct io_rsrc_data *data)
>>                 if (data->nodes[data->nr])
>>                         io_put_rsrc_node(data->nodes[data->nr]);
>>         }
>> +       data->last_node = NULL;
>> +       data->last_index = -1U;
>>         kvfree(data->nodes);
>>         data->nodes = NULL;
>>         data->nr = 0;
>> @@ -150,6 +152,7 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>>                                         GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>>         if (data->nodes) {
>>                 data->nr = nr;
>> +               data->last_index = -1U;
>>                 return 0;
>>         }
>>         return -ENOMEM;
>> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
>> index a40fad783a69..e2795daa877d 100644
>> --- a/io_uring/rsrc.h
>> +++ b/io_uring/rsrc.h
>> @@ -70,8 +70,16 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
>>  static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
>>                                                        int index)
>>  {
>> -       if (index < data->nr)
>> -               return data->nodes[array_index_nospec(index, data->nr)];
>> +       if (index < data->nr) {
>> +               if (index != data->last_index) {
>> +                       index = array_index_nospec(index, data->nr);
>> +                       if (data->nodes[index]) {
> 
> I guess I'm not sure if eliding the array_index_nospec() is worth
> adding a new branch here that you could mispredict... probably depends
> on your workload, I guess?
> 
>> +                               data->last_index = index;
>> +                               data->last_node = data->nodes[index];
> 
> This seems a bit functionally broken - if data->nodes[index] is NULL,
> you just leave data->last_node set to its previous value and return
> that?

Ah true, yeah it should always clear it. Should set it regardless, that
branch can go away.
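
IOW, something like this sketch (untested), with the cache updated
unconditionally so last_node can never be left pointing at a stale
non-NULL node:

static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
						       int index)
{
	if (index < data->nr) {
		if (index != data->last_index) {
			index = array_index_nospec(index, data->nr);
			/* cache whatever is at this index, including NULL */
			data->last_index = index;
			data->last_node = data->nodes[index];
		}
		return data->last_node;
	}
	return NULL;
}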
Jens Axboe Oct. 30, 2024, 8:52 p.m. UTC | #3
On 10/30/24 2:25 PM, Jens Axboe wrote:
> On 10/30/24 11:20 AM, Jann Horn wrote:
>> On Wed, Oct 30, 2024 at 5:58 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> This avoids array_index_nospec() for repeated lookups on the same node,
>>> which can be quite common (and costly). If a cached node is removed from
>>
>> You're saying array_index_nospec() can be quite costly - which
>> architecture is this on? Is this the cost of the compare+subtract+and
>> making the critical path longer?
> 
> Tested this on arm64, in a vm to be specific. Let me try and generate
> some numbers/profiles on x86-64 as well. It's noticeable there as well,
> though not quite as bad as the below example. For arm64, with the patch,
> we get roughly 8.7% of the time spent getting a resource - without it, it's
> 66% of the time. This is just doing a microbenchmark, but it clearly
> shows that anything following the barrier on arm64 is very costly:
> 
>   0.98 │       ldr   x21, [x0, #96]
>        │     ↓ tbnz  w2, #1, b8
>   1.04 │       ldr   w1, [x21, #144]
>        │       cmp   w1, w19
>        │     ↓ b.ls  a0
>        │ 30:   mov   w1, w1
>        │       sxtw  x0, w19
>        │       cmp   x0, x1
>        │       ngc   x0, xzr
>        │       csdb
>        │       ldr   x1, [x21, #160]
>        │       and   w19, w19, w0
>  93.98 │       ldr   x19, [x1, w19, sxtw #3]
> 
> and accounts for most of that 66% of the total cost of the micro bench,
> even though it's doing a ton more stuff than simply getting this node
> via a lookup.

Ran some x86-64 testing, and there's no such effect there. So this is
mostly useful on archs with a more expensive array_index_nospec().
There's obviously a cost associated with it, but it's more of an even
trade-off in terms of having the extra branch vs the nospec indexing.
Which means at that point you may as well not add the extra cache, as
this particular case always hits it, and hence it's a best-case kind of
test.
Jann Horn Oct. 30, 2024, 9:01 p.m. UTC | #4
On Wed, Oct 30, 2024 at 9:25 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/30/24 11:20 AM, Jann Horn wrote:
> > On Wed, Oct 30, 2024 at 5:58 PM Jens Axboe <axboe@kernel.dk> wrote:
> >> This avoids array_index_nospec() for repeated lookups on the same node,
> >> which can be quite common (and costly). If a cached node is removed from
> >
> > You're saying array_index_nospec() can be quite costly - which
> > architecture is this on? Is this the cost of the compare+subtract+and
> > making the critical path longer?
>
> Tested this on arm64, in a vm to be specific. Let me try and generate
> some numbers/profiles on x86-64 as well. It's noticeable there as well,
> though not quite as bad as the below example. For arm64, with the patch,
> we get roughly 8.7% of the time spent getting a resource - without it, it's
> 66% of the time. This is just doing a microbenchmark, but it clearly
> shows that anything following the barrier on arm64 is very costly:
>
>   0.98 │       ldr   x21, [x0, #96]
>        │     ↓ tbnz  w2, #1, b8
>   1.04 │       ldr   w1, [x21, #144]
>        │       cmp   w1, w19
>        │     ↓ b.ls  a0
>        │ 30:   mov   w1, w1
>        │       sxtw  x0, w19
>        │       cmp   x0, x1
>        │       ngc   x0, xzr
>        │       csdb
>        │       ldr   x1, [x21, #160]
>        │       and   w19, w19, w0
>  93.98 │       ldr   x19, [x1, w19, sxtw #3]
>
> and accounts for most of that 66% of the total cost of the micro bench,
> even though it's doing a ton more stuff than simply getting this node
> via a lookup.

Ah, actually... a difference between x86 and arm64 is that arm64 does
an extra Speculative Data Barrier here, while x86 just does some
arithmetic. Which I think is to work around "data value predictions",
in which case my idea of using bitwise AND probably isn't valid.

https://developer.arm.com/documentation/102816/0205/ section "Software
Mitigations" says "Such code sequences are based around specific data
processing operations (for example conditional select or conditional
move) and a new barrier instruction (CSDB). The combination of both a
conditional select/conditional move and the new barrier are sufficient
to address this problem on ALL Arm implementations, both current and
future".
Jens Axboe Oct. 30, 2024, 9:04 p.m. UTC | #5
On 10/30/24 3:01 PM, Jann Horn wrote:
> On Wed, Oct 30, 2024 at 9:25 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 10/30/24 11:20 AM, Jann Horn wrote:
>>> On Wed, Oct 30, 2024 at 5:58 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> This avoids array_index_nospec() for repeated lookups on the same node,
>>>> which can be quite common (and costly). If a cached node is removed from
>>>
>>> You're saying array_index_nospec() can be quite costly - which
>>> architecture is this on? Is this the cost of the compare+subtract+and
>>> making the critical path longer?
>>
>> Tested this on arm64, in a vm to be specific. Let me try and generate
>> some numbers/profiles on x86-64 as well. It's noticeable there as well,
>> though not quite as bad as the below example. For arm64, with the patch,
>> we get roughly 8.7% of the time spent getting a resource - without it, it's
>> 66% of the time. This is just doing a microbenchmark, but it clearly
>> shows that anything following the barrier on arm64 is very costly:
>>
>>   0.98 │       ldr   x21, [x0, #96]
>>        │     ↓ tbnz  w2, #1, b8
>>   1.04 │       ldr   w1, [x21, #144]
>>        │       cmp   w1, w19
>>        │     ↓ b.ls  a0
>>        │ 30:   mov   w1, w1
>>        │       sxtw  x0, w19
>>        │       cmp   x0, x1
>>        │       ngc   x0, xzr
>>        │       csdb
>>        │       ldr   x1, [x21, #160]
>>        │       and   w19, w19, w0
>>  93.98 │       ldr   x19, [x1, w19, sxtw #3]
>>
>> and accounts for most of that 66% of the total cost of the micro bench,
>> even though it's doing a ton more stuff than simply getting this node
>> via a lookup.
> 
> Ah, actually... a difference between x86 and arm64 is that arm64 does
> an extra Speculative Data Barrier here, while x86 just does some
> arithmetic. Which I think is to work around "data value predictions",
> in which case my idea of using bitwise AND probably isn't valid.
> 
> https://developer.arm.com/documentation/102816/0205/ section "Software
> Mitigations" says "Such code sequences are based around specific data
> processing operations (for example conditional select or conditional
> move) and a new barrier instruction (CSDB). The combination of both a
> conditional select/conditional move and the new barrier are sufficient
> to address this problem on ALL Arm implementations, both current and
> future".

Yep, see my followup on the x86-64 side too. Don't think it's worth
doing something just because it's expensive on arm64, in fact any kind
of higher frequency array_index_nospec() will be expensive on arm64 :/

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 77fd508d043a..c283179b0c89 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -57,6 +57,8 @@  struct io_wq_work {
 
 struct io_rsrc_data {
 	unsigned int			nr;
+	unsigned int			last_index;
+	struct io_rsrc_node		*last_node;
 	struct io_rsrc_node		**nodes;
 };
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9829c51105ed..413d003bc5d7 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -139,6 +139,8 @@  __cold void io_rsrc_data_free(struct io_rsrc_data *data)
 		if (data->nodes[data->nr])
 			io_put_rsrc_node(data->nodes[data->nr]);
 	}
+	data->last_node = NULL;
+	data->last_index = -1U;
 	kvfree(data->nodes);
 	data->nodes = NULL;
 	data->nr = 0;
@@ -150,6 +152,7 @@  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 	if (data->nodes) {
 		data->nr = nr;
+		data->last_index = -1U;
 		return 0;
 	}
 	return -ENOMEM;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index a40fad783a69..e2795daa877d 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -70,8 +70,16 @@  int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
 						       int index)
 {
-	if (index < data->nr)
-		return data->nodes[array_index_nospec(index, data->nr)];
+	if (index < data->nr) {
+		if (index != data->last_index) {
+			index = array_index_nospec(index, data->nr);
+			if (data->nodes[index]) {
+				data->last_index = index;
+				data->last_node = data->nodes[index];
+			}
+		}
+		return data->last_node;
+	}
 	return NULL;
 }
 
@@ -85,8 +93,14 @@  static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
 {
 	struct io_rsrc_node *node = data->nodes[index];
 
-	if (!node)
+	if (!node) {
+		WARN_ON_ONCE(index == data->last_index);
 		return false;
+	}
+	if (index == data->last_index) {
+		data->last_node = NULL;
+		data->last_index = -1U;
+	}
 	io_put_rsrc_node(node);
 	data->nodes[index] = NULL;
 	return true;