diff mbox series

[RFC,04/11] drm/bridge: ti-sn65dsi86: Use bitmask to store valid rates

Message ID 20210322030128.2283-5-laurent.pinchart+renesas@ideasonboard.com (mailing list archive)
State New
Delegated to: Kieran Bingham
Headers show
Series drm/bridge: ti-sn65dsi86: Support DisplayPort mode | expand

Commit Message

Laurent Pinchart March 22, 2021, 3:01 a.m. UTC
The valid rates are stored in an array of 8 booleans. Replace it with a
bitmask to save space.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
---
 drivers/gpu/drm/bridge/ti-sn65dsi86.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

Comments

Stephen Boyd March 23, 2021, 7:11 a.m. UTC | #1
Quoting Laurent Pinchart (2021-03-21 20:01:21)
> The valid rates are stored in an array of 8 booleans. Replace it with a
> bitmask to save space.
> 
> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> ---

Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Doug Anderson March 23, 2021, 9:08 p.m. UTC | #2
Hi,

On Sun, Mar 21, 2021 at 8:02 PM Laurent Pinchart
<laurent.pinchart+renesas@ideasonboard.com> wrote:
>
> The valid rates are stored in an array of 8 booleans. Replace it with a
> bitmask to save space.

I'm curious: do you have evidence that this does anything useful? I
guess you're expecting it to save .text space, right? Stack usage and
execution time differences should be irrelevant--it's not in a
critical section and the difference should be tiny anyway. As far as
.text segment goes, it's not obvious to me that the compiler will use
fewer instructions to manipulate bits compared to booleans.

Doing a super simple "ls -ah" on vmlinux (unstripped):

Before: 224820232 bytes
After: 224820376 bytes

...so your change made it _bigger_.   OK, so running "strip
--strip-debug" on those:

Before: 26599464 bytes
After: 26599464 bytes

...so exactly the same. I tried finding some evidence using "readelf -ah":

Before:
  [ 2] .text             PROGBITS         ffffffc010010000  00020000
       0000000000b03508  0000000000000000 WAX       0     0     65536
  [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
       00000000002e84b3  0000000000000000 WAMS       0     0     4096

After:
  [ 2] .text             PROGBITS         ffffffc010010000  00020000
       0000000000b03508  0000000000000000 WAX       0     0     65536
  [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
       00000000002e84b3  0000000000000000 WAMS       0     0     4096

Maybe you have some evidence showing an improvement? Ah, OK. I
disassembled ti_sn_bridge_enable() and your patch saves 12 bytes, but
I guess maybe alignment washes it out in reality...


In terms of readability / conventions, I personally find this change a
bit of a wash. I mean, I guess I originally implemented it as an array
and I thought that was the most readable, but I like bitfields fine
too. If everyone loves it then I won't object, but to me it feels like
touching lines of code for something that's personal preference. ;-)


> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> ---
>  drivers/gpu/drm/bridge/ti-sn65dsi86.c | 24 +++++++++++++-----------
>  1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> index c45420a50e73..1d1be791d5ba 100644
> --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> @@ -557,9 +557,9 @@ static int ti_sn_bridge_calc_min_dp_rate_idx(struct ti_sn_bridge *pdata)
>         return i;
>  }
>
> -static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
> -                                         bool rate_valid[])
> +static unsigned int ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata)
>  {
> +       unsigned int valid_rates = 0;
>         unsigned int rate_per_200khz;
>         unsigned int rate_mhz;
>         u8 dpcd_val;
> @@ -599,13 +599,13 @@ static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
>                              j < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
>                              j++) {
>                                 if (ti_sn_bridge_dp_rate_lut[j] == rate_mhz)
> -                                       rate_valid[j] = true;
> +                                       valid_rates |= BIT(j);
>                         }
>                 }
>
>                 for (i = 0; i < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut); i++) {
> -                       if (rate_valid[i])
> -                               return;
> +                       if (valid_rates & BIT(i))
> +                               return valid_rates;
>                 }
>                 DRM_DEV_ERROR(pdata->dev,
>                               "No matching eDP rates in table; falling back\n");
> @@ -627,15 +627,17 @@ static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
>                               (int)dpcd_val);
>                 fallthrough;
>         case DP_LINK_BW_5_4:
> -               rate_valid[7] = 1;
> +               valid_rates |= BIT(7);
>                 fallthrough;
>         case DP_LINK_BW_2_7:
> -               rate_valid[4] = 1;
> +               valid_rates |= BIT(4);
>                 fallthrough;
>         case DP_LINK_BW_1_62:
> -               rate_valid[1] = 1;
> +               valid_rates |= BIT(1);
>                 break;
>         }
> +
> +       return valid_rates;
>  }
>
>  static void ti_sn_bridge_set_video_timings(struct ti_sn_bridge *pdata)
> @@ -753,8 +755,8 @@ static int ti_sn_link_training(struct ti_sn_bridge *pdata, int dp_rate_idx,
>  static void ti_sn_bridge_enable(struct drm_bridge *bridge)
>  {
>         struct ti_sn_bridge *pdata = bridge_to_ti_sn_bridge(bridge);
> -       bool rate_valid[ARRAY_SIZE(ti_sn_bridge_dp_rate_lut)] = { };
>         const char *last_err_str = "No supported DP rate";
> +       unsigned int valid_rates;
>         int dp_rate_idx;
>         unsigned int val;
>         int ret = -EINVAL;
> @@ -793,13 +795,13 @@ static void ti_sn_bridge_enable(struct drm_bridge *bridge)
>         regmap_update_bits(pdata->regmap, SN_SSC_CONFIG_REG, DP_NUM_LANES_MASK,
>                            val);
>
> -       ti_sn_bridge_read_valid_rates(pdata, rate_valid);
> +       valid_rates = ti_sn_bridge_read_valid_rates(pdata);
>
>         /* Train until we run out of rates */
>         for (dp_rate_idx = ti_sn_bridge_calc_min_dp_rate_idx(pdata);
>              dp_rate_idx < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
>              dp_rate_idx++) {
> -               if (!rate_valid[dp_rate_idx])
> +               if (!(valid_rates & BIT(dp_rate_idx)))
>                         continue;
>
>                 ret = ti_sn_link_training(pdata, dp_rate_idx, &last_err_str);

In any case, since it does save 12 bytes:

Reviewed-by: Douglas Anderson <dianders@chromium.org>
Laurent Pinchart March 23, 2021, 9:45 p.m. UTC | #3
Hi Doug,

On Tue, Mar 23, 2021 at 02:08:55PM -0700, Doug Anderson wrote:
> On Sun, Mar 21, 2021 at 8:02 PM Laurent Pinchart wrote:
> >
> > The valid rates are stored in an array of 8 booleans. Replace it with a
> > bitmask to save space.
> 
> I'm curious: do you have evidence that this does anything useful? I
> guess you're expecting it to save .text space, right? Stack usage and
> execution time differences should be irrelevant--it's not in a
> critical section and the difference should be tiny anyway. As far as
> .text segment goes, it's not obvious to me that the compiler will use
> fewer instructions to manipulate bits compared to booleans.
> 
> Doing a super simple "ls -ah" on vmlinux (unstripped):
> 
> Before: 224820232 bytes
> After: 224820376 bytes
> 
> ...so your change made it _bigger_.   OK, so running "strip
> --strip-debug" on those:
> 
> Before: 26599464 bytes
> After: 26599464 bytes
> 
> ...so exactly the same. I tried finding some evidence using "readelf -ah":
> 
> Before:
>   [ 2] .text             PROGBITS         ffffffc010010000  00020000
>        0000000000b03508  0000000000000000 WAX       0     0     65536
>   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
>        00000000002e84b3  0000000000000000 WAMS       0     0     4096
> 
> After:
>   [ 2] .text             PROGBITS         ffffffc010010000  00020000
>        0000000000b03508  0000000000000000 WAX       0     0     65536
>   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
>        00000000002e84b3  0000000000000000 WAMS       0     0     4096
> 
> Maybe you have some evidence showing an improvement? Ah, OK. I
> disassembled ti_sn_bridge_enable() and your patch saves 12 bytes, but
> I guess maybe alignment washes it out in reality...
> 
> 
> In terms of readability / conventions, I personally find this change a
> bit of a wash. I mean, I guess I originally implemented it as an array
> and I thought that was the most readable, but I like bitfields fine
> too. If everyone loves it then I won't object, but to me it feels like
> touching lines of code for something that's personal preference. ;-)

You're right that the .text and CPU time improvements were not my
target. I was focussed on stack usage, as that's a limited resource in
the kernel. I don't have any evidence that we would be close to any
limit, so it's tiny, and if you or anyone else have a strong opinion
that an array of booleans is better due to readability concerns, I can
drop this change. I only thought about those poor 7 bits in every bool
that sat there unused, feeling useless :-)

> > Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > ---
> >  drivers/gpu/drm/bridge/ti-sn65dsi86.c | 24 +++++++++++++-----------
> >  1 file changed, 13 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> > index c45420a50e73..1d1be791d5ba 100644
> > --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> > +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
> > @@ -557,9 +557,9 @@ static int ti_sn_bridge_calc_min_dp_rate_idx(struct ti_sn_bridge *pdata)
> >         return i;
> >  }
> >
> > -static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
> > -                                         bool rate_valid[])
> > +static unsigned int ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata)
> >  {
> > +       unsigned int valid_rates = 0;
> >         unsigned int rate_per_200khz;
> >         unsigned int rate_mhz;
> >         u8 dpcd_val;
> > @@ -599,13 +599,13 @@ static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
> >                              j < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
> >                              j++) {
> >                                 if (ti_sn_bridge_dp_rate_lut[j] == rate_mhz)
> > -                                       rate_valid[j] = true;
> > +                                       valid_rates |= BIT(j);
> >                         }
> >                 }
> >
> >                 for (i = 0; i < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut); i++) {
> > -                       if (rate_valid[i])
> > -                               return;
> > +                       if (valid_rates & BIT(i))
> > +                               return valid_rates;
> >                 }
> >                 DRM_DEV_ERROR(pdata->dev,
> >                               "No matching eDP rates in table; falling back\n");
> > @@ -627,15 +627,17 @@ static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
> >                               (int)dpcd_val);
> >                 fallthrough;
> >         case DP_LINK_BW_5_4:
> > -               rate_valid[7] = 1;
> > +               valid_rates |= BIT(7);
> >                 fallthrough;
> >         case DP_LINK_BW_2_7:
> > -               rate_valid[4] = 1;
> > +               valid_rates |= BIT(4);
> >                 fallthrough;
> >         case DP_LINK_BW_1_62:
> > -               rate_valid[1] = 1;
> > +               valid_rates |= BIT(1);
> >                 break;
> >         }
> > +
> > +       return valid_rates;
> >  }
> >
> >  static void ti_sn_bridge_set_video_timings(struct ti_sn_bridge *pdata)
> > @@ -753,8 +755,8 @@ static int ti_sn_link_training(struct ti_sn_bridge *pdata, int dp_rate_idx,
> >  static void ti_sn_bridge_enable(struct drm_bridge *bridge)
> >  {
> >         struct ti_sn_bridge *pdata = bridge_to_ti_sn_bridge(bridge);
> > -       bool rate_valid[ARRAY_SIZE(ti_sn_bridge_dp_rate_lut)] = { };
> >         const char *last_err_str = "No supported DP rate";
> > +       unsigned int valid_rates;
> >         int dp_rate_idx;
> >         unsigned int val;
> >         int ret = -EINVAL;
> > @@ -793,13 +795,13 @@ static void ti_sn_bridge_enable(struct drm_bridge *bridge)
> >         regmap_update_bits(pdata->regmap, SN_SSC_CONFIG_REG, DP_NUM_LANES_MASK,
> >                            val);
> >
> > -       ti_sn_bridge_read_valid_rates(pdata, rate_valid);
> > +       valid_rates = ti_sn_bridge_read_valid_rates(pdata);
> >
> >         /* Train until we run out of rates */
> >         for (dp_rate_idx = ti_sn_bridge_calc_min_dp_rate_idx(pdata);
> >              dp_rate_idx < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
> >              dp_rate_idx++) {
> > -               if (!rate_valid[dp_rate_idx])
> > +               if (!(valid_rates & BIT(dp_rate_idx)))
> >                         continue;
> >
> >                 ret = ti_sn_link_training(pdata, dp_rate_idx, &last_err_str);
> 
> In any case, since it does save 12 bytes:
> 
> Reviewed-by: Douglas Anderson <dianders@chromium.org>
Doug Anderson March 23, 2021, 10:45 p.m. UTC | #4
Hi,

On Tue, Mar 23, 2021 at 2:46 PM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> Hi Doug,
>
> On Tue, Mar 23, 2021 at 02:08:55PM -0700, Doug Anderson wrote:
> > On Sun, Mar 21, 2021 at 8:02 PM Laurent Pinchart wrote:
> > >
> > > The valid rates are stored in an array of 8 booleans. Replace it with a
> > > bitmask to save space.
> >
> > I'm curious: do you have evidence that this does anything useful? I
> > guess you're expecting it to save .text space, right? Stack usage and
> > execution time differences should be irrelevant--it's not in a
> > critical section and the difference should be tiny anyway. As far as
> > .text segment goes, it's not obvious to me that the compiler will use
> > fewer instructions to manipulate bits compared to booleans.
> >
> > Doing a super simple "ls -ah" on vmlinux (unstripped):
> >
> > Before: 224820232 bytes
> > After: 224820376 bytes
> >
> > ...so your change made it _bigger_.   OK, so running "strip
> > --strip-debug" on those:
> >
> > Before: 26599464 bytes
> > After: 26599464 bytes
> >
> > ...so exactly the same. I tried finding some evidence using "readelf -ah":
> >
> > Before:
> >   [ 2] .text             PROGBITS         ffffffc010010000  00020000
> >        0000000000b03508  0000000000000000 WAX       0     0     65536
> >   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
> >        00000000002e84b3  0000000000000000 WAMS       0     0     4096
> >
> > After:
> >   [ 2] .text             PROGBITS         ffffffc010010000  00020000
> >        0000000000b03508  0000000000000000 WAX       0     0     65536
> >   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
> >        00000000002e84b3  0000000000000000 WAMS       0     0     4096
> >
> > Maybe you have some evidence showing an improvement? Ah, OK. I
> > disassembled ti_sn_bridge_enable() and your patch saves 12 bytes, but
> > I guess maybe alignment washes it out in reality...
> >
> >
> > In terms of readability / conventions, I personally find this change a
> > bit of a wash. I mean, I guess I originally implemented it as an array
> > and I thought that was the most readable, but I like bitfields fine
> > too. If everyone loves it then I won't object, but to me it feels like
> > touching lines of code for something that's personal preference. ;-)
>
> You're right that the .text and CPU time improvements were not my
> target. I was focussed on stack usage, as that's a limited resource in
> the kernel. I don't have any evidence that we would be close to any
> limit, so it's tiny, and if you or anyone else have a strong opinion
> that an array of booleans is better due to readability concerns, I can
> drop this change. I only thought about those poor 7 bits in every bool
> that sat there unused, feeling useless :-)

LOL. Thinking about it a bit more, I guess I feel a bit lame saying
that the array of booleans is more readable. I guess I'd call them
equivalently readable. So I guess the downside of this patch is just
churn. If someone is maintaining a downstream kernel, it's an extra
patch to take. If someone is running "git blame" it's an extra layer
to walk back to find the history of the code. That being said, it's
really not a big deal. I'll leave it up to you if you want to include
the patch in your next version or if my arguments have convinced you.
;-)

-Doug
Geert Uytterhoeven March 24, 2021, 8:47 a.m. UTC | #5
Hi Doug,

On Tue, Mar 23, 2021 at 10:10 PM Doug Anderson <dianders@chromium.org> wrote:
> On Sun, Mar 21, 2021 at 8:02 PM Laurent Pinchart
> <laurent.pinchart+renesas@ideasonboard.com> wrote:
> >
> > The valid rates are stored in an array of 8 booleans. Replace it with a
> > bitmask to save space.
>
> I'm curious: do you have evidence that this does anything useful? I
> guess you're expecting it to save .text space, right? Stack usage and
> execution time differences should be irrelevant--it's not in a
> critical section and the difference should be tiny anyway. As far as
> .text segment goes, it's not obvious to me that the compiler will use
> fewer instructions to manipulate bits compared to booleans.
>
> Doing a super simple "ls -ah" on vmlinux (unstripped):
>
> Before: 224820232 bytes
> After: 224820376 bytes
>
> ...so your change made it _bigger_.   OK, so running "strip
> --strip-debug" on those:
>
> Before: 26599464 bytes
> After: 26599464 bytes

I've been surprised by the counter-intuitive impact of similar changes
before, too.  The result may also differ a lot between arm32 or arm64.

> ...so exactly the same. I tried finding some evidence using "readelf -ah":
>
> Before:
>   [ 2] .text             PROGBITS         ffffffc010010000  00020000
>        0000000000b03508  0000000000000000 WAX       0     0     65536
>   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
>        00000000002e84b3  0000000000000000 WAMS       0     0     4096
>
> After:
>   [ 2] .text             PROGBITS         ffffffc010010000  00020000
>        0000000000b03508  0000000000000000 WAX       0     0     65536
>   [ 3] .rodata           PROGBITS         ffffffc010b20000  00b30000
>        00000000002e84b3  0000000000000000 WAMS       0     0     4096
>
> Maybe you have some evidence showing an improvement? Ah, OK. I
> disassembled ti_sn_bridge_enable() and your patch saves 12 bytes, but
> I guess maybe alignment washes it out in reality...

Yes, arm64 is bad w.r.t. this.

Gr{oetje,eeting}s,

                        Geert
diff mbox series

Patch

diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index c45420a50e73..1d1be791d5ba 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -557,9 +557,9 @@  static int ti_sn_bridge_calc_min_dp_rate_idx(struct ti_sn_bridge *pdata)
 	return i;
 }
 
-static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
-					  bool rate_valid[])
+static unsigned int ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata)
 {
+	unsigned int valid_rates = 0;
 	unsigned int rate_per_200khz;
 	unsigned int rate_mhz;
 	u8 dpcd_val;
@@ -599,13 +599,13 @@  static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
 			     j < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
 			     j++) {
 				if (ti_sn_bridge_dp_rate_lut[j] == rate_mhz)
-					rate_valid[j] = true;
+					valid_rates |= BIT(j);
 			}
 		}
 
 		for (i = 0; i < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut); i++) {
-			if (rate_valid[i])
-				return;
+			if (valid_rates & BIT(i))
+				return valid_rates;
 		}
 		DRM_DEV_ERROR(pdata->dev,
 			      "No matching eDP rates in table; falling back\n");
@@ -627,15 +627,17 @@  static void ti_sn_bridge_read_valid_rates(struct ti_sn_bridge *pdata,
 			      (int)dpcd_val);
 		fallthrough;
 	case DP_LINK_BW_5_4:
-		rate_valid[7] = 1;
+		valid_rates |= BIT(7);
 		fallthrough;
 	case DP_LINK_BW_2_7:
-		rate_valid[4] = 1;
+		valid_rates |= BIT(4);
 		fallthrough;
 	case DP_LINK_BW_1_62:
-		rate_valid[1] = 1;
+		valid_rates |= BIT(1);
 		break;
 	}
+
+	return valid_rates;
 }
 
 static void ti_sn_bridge_set_video_timings(struct ti_sn_bridge *pdata)
@@ -753,8 +755,8 @@  static int ti_sn_link_training(struct ti_sn_bridge *pdata, int dp_rate_idx,
 static void ti_sn_bridge_enable(struct drm_bridge *bridge)
 {
 	struct ti_sn_bridge *pdata = bridge_to_ti_sn_bridge(bridge);
-	bool rate_valid[ARRAY_SIZE(ti_sn_bridge_dp_rate_lut)] = { };
 	const char *last_err_str = "No supported DP rate";
+	unsigned int valid_rates;
 	int dp_rate_idx;
 	unsigned int val;
 	int ret = -EINVAL;
@@ -793,13 +795,13 @@  static void ti_sn_bridge_enable(struct drm_bridge *bridge)
 	regmap_update_bits(pdata->regmap, SN_SSC_CONFIG_REG, DP_NUM_LANES_MASK,
 			   val);
 
-	ti_sn_bridge_read_valid_rates(pdata, rate_valid);
+	valid_rates = ti_sn_bridge_read_valid_rates(pdata);
 
 	/* Train until we run out of rates */
 	for (dp_rate_idx = ti_sn_bridge_calc_min_dp_rate_idx(pdata);
 	     dp_rate_idx < ARRAY_SIZE(ti_sn_bridge_dp_rate_lut);
 	     dp_rate_idx++) {
-		if (!rate_valid[dp_rate_idx])
+		if (!(valid_rates & BIT(dp_rate_idx)))
 			continue;
 
 		ret = ti_sn_link_training(pdata, dp_rate_idx, &last_err_str);