diff mbox series

drm: a6xx: avoid excessive stack usage

Message ID 20241018151143.3543939-1-arnd@kernel.org (mailing list archive)
State New, archived
Headers show
Series drm: a6xx: avoid excessive stack usage | expand

Commit Message

Arnd Bergmann Oct. 18, 2024, 3:11 p.m. UTC
From: Arnd Bergmann <arnd@arndb.de>

Clang-19 and above sometimes end up with multiple copies of the large
a6xx_hfi_msg_bw_table structure on the stack. The problem is that
a6xx_hfi_send_bw_table() calls a number of device specific functions to
fill the structure, but these create another copy of the structure on
the stack which gets copied to the first.

If the functions get inlined, that busts the warning limit:

drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]

Mark all of them as 'noinline_for_stack' ensure we only have one copy
of the structure per function.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

Comments

Akhil P Oommen Oct. 19, 2024, 9:31 a.m. UTC | #1
On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> Clang-19 and above sometimes end up with multiple copies of the large
> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> fill the structure, but these create another copy of the structure on
> the stack which gets copied to the first.
> 
> If the functions get inlined, that busts the warning limit:
> 
> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]

Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?

-Akhil.

> 
> Mark all of them as 'noinline_for_stack' ensure we only have one copy
> of the structure per function.
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 21 +++++++++++----------
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> index cdb3f6e74d3e..5699e0420eb8 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> @@ -259,7 +259,8 @@ static int a6xx_hfi_send_perf_table(struct a6xx_gmu *gmu)
>  		NULL, 0);
>  }
>  
> -static void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +/* noinline to avoid having multiple copies of 'msg' on stack */
> +static noinline_for_stack void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/* Send a single "off" entry since the 618 GMU doesn't do bus scaling */
>  	msg->bw_level_num = 1;
> @@ -287,7 +288,7 @@ static void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] =  0x60000001;
>  }
>  
> -static void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	msg->bw_level_num = 13;
>  
> @@ -346,7 +347,7 @@ static void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[0][0] = 0x40000000;
>  }
>  
> -static void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/*
>  	 * Send a single "off" entry just to get things running
> @@ -385,7 +386,7 @@ static void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][2] =  0x60000001;
>  }
>  
> -static void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/*
>  	 * Send a single "off" entry just to get things running
> @@ -416,7 +417,7 @@ static void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] =  0x60000001;
>  }
>  
> -static void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/*
>  	 * Send a single "off" entry just to get things running
> @@ -447,7 +448,7 @@ static void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] =  0x60000001;
>  }
>  
> -static void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/*
>  	 * Send a single "off" entry just to get things running
> @@ -478,7 +479,7 @@ static void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] =  0x60000001;
>  }
>  
> -static void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/*
>  	 * Send a single "off" entry just to get things running
> @@ -509,7 +510,7 @@ static void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] =  0x60000001;
>  }
>  
> -static void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	msg->bw_level_num = 12;
>  
> @@ -565,7 +566,7 @@ static void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] = 0x60000001;
>  }
>  
> -static void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	msg->bw_level_num = 1;
>  
> @@ -590,7 +591,7 @@ static void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  	msg->cnoc_cmds_data[1][0] = 0x60000001;
>  }
>  
> -static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> +static noinline_for_stack void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  {
>  	/* Send a single "off" entry since the 630 GMU doesn't do bus scaling */
>  	msg->bw_level_num = 1;
> -- 
> 2.39.5
>
Dmitry Baryshkov Oct. 19, 2024, 1:14 p.m. UTC | #2
On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
> On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
> > From: Arnd Bergmann <arnd@arndb.de>
> > 
> > Clang-19 and above sometimes end up with multiple copies of the large
> > a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> > a6xx_hfi_send_bw_table() calls a number of device specific functions to
> > fill the structure, but these create another copy of the structure on
> > the stack which gets copied to the first.
> > 
> > If the functions get inlined, that busts the warning limit:
> > 
> > drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> 
> Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?

Kernel stacks are expected to be space limited, so 1024 is a logical
limit for a single function.
Akhil P Oommen Oct. 21, 2024, 9:25 a.m. UTC | #3
On Sat, Oct 19, 2024 at 04:14:13PM +0300, Dmitry Baryshkov wrote:
> On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
> > On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
> > > From: Arnd Bergmann <arnd@arndb.de>
> > > 
> > > Clang-19 and above sometimes end up with multiple copies of the large
> > > a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> > > a6xx_hfi_send_bw_table() calls a number of device specific functions to
> > > fill the structure, but these create another copy of the structure on
> > > the stack which gets copied to the first.
> > > 
> > > If the functions get inlined, that busts the warning limit:
> > > 
> > > drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> > 
> > Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?
> 
> Kernel stacks are expected to be space limited, so 1024 is a logical
> limit for a single function.

Thanks for the clarification. I think it is better to move this table to
struct a6xx_gmu which is required anyway when we implement dynamic generation
of bw table. Also, we can skip initializing it in subsequent gpu wake ups.

Arnd, do you think that would be sufficient? I can send that patch if you
want help.

-Akhil

> 
> 
> -- 
> With best wishes
> Dmitry
Konrad Dybcio Oct. 21, 2024, 9:31 a.m. UTC | #4
On 21.10.2024 11:25 AM, Akhil P Oommen wrote:
> On Sat, Oct 19, 2024 at 04:14:13PM +0300, Dmitry Baryshkov wrote:
>> On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
>>> On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
>>>> From: Arnd Bergmann <arnd@arndb.de>
>>>>
>>>> Clang-19 and above sometimes end up with multiple copies of the large
>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>>> fill the structure, but these create another copy of the structure on
>>>> the stack which gets copied to the first.
>>>>
>>>> If the functions get inlined, that busts the warning limit:
>>>>
>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>>
>>> Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?
>>
>> Kernel stacks are expected to be space limited, so 1024 is a logical
>> limit for a single function.
> 
> Thanks for the clarification. I think it is better to move this table to
> struct a6xx_gmu which is required anyway when we implement dynamic generation
> of bw table. Also, we can skip initializing it in subsequent gpu wake ups.
> 
> Arnd, do you think that would be sufficient? I can send that patch if you
> want help.

FWIW I implemented this at one point.. ended up only rebasing it for months
as I kept delaying GMU voting until we get an in-tree dram frequency LUT
retrieving driver..

https://github.com/SoMainline/linux/commits/konrad/longbois-next/drivers/gpu/drm/msm/adreno

worth noting that this used to be my R&D branch so this is very much err..
"provided as-is".. but it did work on 8250!

Konrad
Arnd Bergmann Oct. 21, 2024, 10:01 a.m. UTC | #5
On Mon, Oct 21, 2024, at 09:25, Akhil P Oommen wrote:
> On Sat, Oct 19, 2024 at 04:14:13PM +0300, Dmitry Baryshkov wrote:
>> On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
>> > On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
>> > > From: Arnd Bergmann <arnd@arndb.de>
>> > > 
>> > > Clang-19 and above sometimes end up with multiple copies of the large
>> > > a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>> > > a6xx_hfi_send_bw_table() calls a number of device specific functions to
>> > > fill the structure, but these create another copy of the structure on
>> > > the stack which gets copied to the first.
>> > > 
>> > > If the functions get inlined, that busts the warning limit:
>> > > 
>> > > drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>> > 
>> > Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?
>> 
>> Kernel stacks are expected to be space limited, so 1024 is a logical
>> limit for a single function.
>
> Thanks for the clarification. I think it is better to move this table to
> struct a6xx_gmu which is required anyway when we implement dynamic generation
> of bw table. Also, we can skip initializing it in subsequent gpu wake ups.
>
> Arnd, do you think that would be sufficient? I can send that patch if you
> want help.

Yes, that should work. I actually tried first to turn the model
specific data into static const structures but that ended up 
not working because some of them have a couple of dynamically
computed values. I think that would have been even nicer.

      Arnd
Akhil P Oommen Oct. 23, 2024, 7:42 p.m. UTC | #6
On 10/21/2024 3:01 PM, Konrad Dybcio wrote:
> On 21.10.2024 11:25 AM, Akhil P Oommen wrote:
>> On Sat, Oct 19, 2024 at 04:14:13PM +0300, Dmitry Baryshkov wrote:
>>> On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
>>>> On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
>>>>> From: Arnd Bergmann <arnd@arndb.de>
>>>>>
>>>>> Clang-19 and above sometimes end up with multiple copies of the large
>>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>>>> fill the structure, but these create another copy of the structure on
>>>>> the stack which gets copied to the first.
>>>>>
>>>>> If the functions get inlined, that busts the warning limit:
>>>>>
>>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>>>
>>>> Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?
>>>
>>> Kernel stacks are expected to be space limited, so 1024 is a logical
>>> limit for a single function.
>>
>> Thanks for the clarification. I think it is better to move this table to
>> struct a6xx_gmu which is required anyway when we implement dynamic generation
>> of bw table. Also, we can skip initializing it in subsequent gpu wake ups.
>>
>> Arnd, do you think that would be sufficient? I can send that patch if you
>> want help.
> 
> FWIW I implemented this at one point.. ended up only rebasing it for months
> as I kept delaying GMU voting until we get an in-tree dram frequency LUT
> retrieving driver..
> 
> https://github.com/SoMainline/linux/commits/konrad/longbois-next/drivers/gpu/drm/msm/adreno
> 
> worth noting that this used to be my R&D branch so this is very much err..
> "provided as-is".. but it did work on 8250!
> 

Thanks, Konrad. "IB vote via GMU" support is there in my bucket list. I can help
out if you want to clean this up and send out.

-Akhil
 
> Konrad
Akhil P Oommen Oct. 27, 2024, 6:14 p.m. UTC | #7
On 10/21/2024 3:31 PM, Arnd Bergmann wrote:
> On Mon, Oct 21, 2024, at 09:25, Akhil P Oommen wrote:
>> On Sat, Oct 19, 2024 at 04:14:13PM +0300, Dmitry Baryshkov wrote:
>>> On Sat, Oct 19, 2024 at 03:01:46PM +0530, Akhil P Oommen wrote:
>>>> On Fri, Oct 18, 2024 at 03:11:38PM +0000, Arnd Bergmann wrote:
>>>>> From: Arnd Bergmann <arnd@arndb.de>
>>>>>
>>>>> Clang-19 and above sometimes end up with multiple copies of the large
>>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>>>> fill the structure, but these create another copy of the structure on
>>>>> the stack which gets copied to the first.
>>>>>
>>>>> If the functions get inlined, that busts the warning limit:
>>>>>
>>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>>>
>>>> Why does this warning says that the limit is 1024? 1024 bytes is too small, isn't it?
>>>
>>> Kernel stacks are expected to be space limited, so 1024 is a logical
>>> limit for a single function.
>>
>> Thanks for the clarification. I think it is better to move this table to
>> struct a6xx_gmu which is required anyway when we implement dynamic generation
>> of bw table. Also, we can skip initializing it in subsequent gpu wake ups.
>>
>> Arnd, do you think that would be sufficient? I can send that patch if you
>> want help.
> 
> Yes, that should work. I actually tried first to turn the model
> specific data into static const structures but that ended up 
> not working because some of them have a couple of dynamically
> computed values. I think that would have been even nicer.
> 
>       Arnd

https://patchwork.freedesktop.org/patch/621814/

Posted the fix. Btw, I have copied most of the commit text from this patch.

-Akhil.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
index cdb3f6e74d3e..5699e0420eb8 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
@@ -259,7 +259,8 @@  static int a6xx_hfi_send_perf_table(struct a6xx_gmu *gmu)
 		NULL, 0);
 }
 
-static void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+/* noinline to avoid having multiple copies of 'msg' on stack */
+static noinline_for_stack void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/* Send a single "off" entry since the 618 GMU doesn't do bus scaling */
 	msg->bw_level_num = 1;
@@ -287,7 +288,7 @@  static void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] =  0x60000001;
 }
 
-static void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	msg->bw_level_num = 13;
 
@@ -346,7 +347,7 @@  static void a619_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[0][0] = 0x40000000;
 }
 
-static void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/*
 	 * Send a single "off" entry just to get things running
@@ -385,7 +386,7 @@  static void a640_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][2] =  0x60000001;
 }
 
-static void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/*
 	 * Send a single "off" entry just to get things running
@@ -416,7 +417,7 @@  static void a650_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] =  0x60000001;
 }
 
-static void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/*
 	 * Send a single "off" entry just to get things running
@@ -447,7 +448,7 @@  static void a690_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] =  0x60000001;
 }
 
-static void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/*
 	 * Send a single "off" entry just to get things running
@@ -478,7 +479,7 @@  static void a660_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] =  0x60000001;
 }
 
-static void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/*
 	 * Send a single "off" entry just to get things running
@@ -509,7 +510,7 @@  static void adreno_7c3_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] =  0x60000001;
 }
 
-static void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	msg->bw_level_num = 12;
 
@@ -565,7 +566,7 @@  static void a730_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] = 0x60000001;
 }
 
-static void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	msg->bw_level_num = 1;
 
@@ -590,7 +591,7 @@  static void a740_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 	msg->cnoc_cmds_data[1][0] = 0x60000001;
 }
 
-static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
+static noinline_for_stack void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 {
 	/* Send a single "off" entry since the 630 GMU doesn't do bus scaling */
 	msg->bw_level_num = 1;