diff mbox series

[4/4] sm501: Optimise 1 pixel 2d ops

Message ID 9fab4fe6513fe8b921ce86a57f4ff7e0d5a3c3f9.1591471056.git.balaton@eik.bme.hu (mailing list archive)
State New, archived
Headers show
Series More sm501 fixes and optimisations | expand

Commit Message

BALATON Zoltan June 6, 2020, 7:17 p.m. UTC
Some guests do 1x1 blits which is faster to do directly than calling a
function for it so avoid overhead in this case.

Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
---
 hw/display/sm501.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

Comments

Peter Maydell June 15, 2020, 3:42 p.m. UTC | #1
On Sat, 6 Jun 2020 at 20:22, BALATON Zoltan <balaton@eik.bme.hu> wrote:
>
> Some guests do 1x1 blits which is faster to do directly than calling a
> function for it so avoid overhead in this case.

How much does the performance improve by ?

> Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
> ---
>  hw/display/sm501.c | 40 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/hw/display/sm501.c b/hw/display/sm501.c
> index 3397ca9fbf..59693fbb5c 100644
> --- a/hw/display/sm501.c
> +++ b/hw/display/sm501.c
> @@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
>                  src_x == dst_x && src_y == dst_y) {
>                  break;
>              }
> +            /* Some clients also do 1 pixel blits, avoid overhead for these */
> +            if (width == 1 && height == 1) {
> +                unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
> +                unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
> +                switch (format) {
> +                case 0:
> +                    s->local_mem[dst_base + di] = s->local_mem[src_base + si];
> +                    break;
> +                case 1:
> +                    *(uint16_t *)&s->local_mem[dst_base + di] =
> +                                    *(uint16_t *)&s->local_mem[src_base + si];
> +                    break;
> +                case 2:
> +                    *(uint32_t *)&s->local_mem[dst_base + di] =
> +                                    *(uint32_t *)&s->local_mem[src_base + si];
> +                    break;
> +                }

You could write this more compactly as
                   stn_he_p(&s->local_mem[dst_base + di], 1 << format,
                            ldn_he_p(&s->local_mem[src_base + si], 1
<< format));

(which handles the length-cases for you and also doesn't rely on
casting a uint8_t* giving you something correctly aligned for a
wider access).

> +                break;
> +            }
>              /* Check for overlaps, this could be made more exact */
>              uint32_t sb, se, db, de;
>              sb = src_base + src_x + src_y * (width + src_pitch);
> @@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
>              color = cpu_to_le16(color);
>          }
>
> -        pixman_fill((uint32_t *)&s->local_mem[dst_base],
> -                    dst_pitch * (1 << format) / sizeof(uint32_t),
> -                    8 * (1 << format), dst_x, dst_y, width, height, color);
> +        if (width == 1 && height == 1) {
> +            unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
> +            switch (format) {
> +            case 0:
> +                s->local_mem[dst_base + i] = color & 0xff;
> +                break;
> +            case 1:
> +                *(uint16_t *)&s->local_mem[dst_base + i] = color & 0xffff;
> +                break;
> +            case 2:
> +                *(uint32_t *)&s->local_mem[dst_base + i] = color;
> +                break;
> +            }

               stn_he_p(&s->local_mem[dst_base + i], 1 << format, color);


> +        } else {
> +            pixman_fill((uint32_t *)&s->local_mem[dst_base],
> +                        dst_pitch * (1 << format) / sizeof(uint32_t),
> +                        8 * (1 << format), dst_x, dst_y, width, height, color);
> +        }
>          break;
>      }
>      default:

thanks
-- PMM
BALATON Zoltan June 15, 2020, 5 p.m. UTC | #2
On Mon, 15 Jun 2020, Peter Maydell wrote:
> On Sat, 6 Jun 2020 at 20:22, BALATON Zoltan <balaton@eik.bme.hu> wrote:
>>
>> Some guests do 1x1 blits which is faster to do directly than calling a
>> function for it so avoid overhead in this case.
>
> How much does the performance improve by ?

I haven't actually measured due to lack of time experimenting with it and 
those who I've asked to check it only reported it felt a bit faster but no 
numerical measurements so that does not prove anything. Anyway these 
special cases should not hurt and simple enough to have it here even if 
may not improve performance very much. The biggest loss is probably 
converting 16 bit screen on every display update to 32 bit because the 
guest driver does not allow higher bit depth than 16 for some reason. But 
I don't plan to spend more time with improving sm501, only done it because 
I had to touch it anyway. Longer term plan is to finish ati-vga which 
should have better guest drivers and better performance.

>> Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
>> ---
>>  hw/display/sm501.c | 40 +++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 37 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/display/sm501.c b/hw/display/sm501.c
>> index 3397ca9fbf..59693fbb5c 100644
>> --- a/hw/display/sm501.c
>> +++ b/hw/display/sm501.c
>> @@ -793,6 +793,25 @@ static void sm501_2d_operation(SM501State *s)
>>                  src_x == dst_x && src_y == dst_y) {
>>                  break;
>>              }
>> +            /* Some clients also do 1 pixel blits, avoid overhead for these */
>> +            if (width == 1 && height == 1) {
>> +                unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
>> +                unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
>> +                switch (format) {
>> +                case 0:
>> +                    s->local_mem[dst_base + di] = s->local_mem[src_base + si];
>> +                    break;
>> +                case 1:
>> +                    *(uint16_t *)&s->local_mem[dst_base + di] =
>> +                                    *(uint16_t *)&s->local_mem[src_base + si];
>> +                    break;
>> +                case 2:
>> +                    *(uint32_t *)&s->local_mem[dst_base + di] =
>> +                                    *(uint32_t *)&s->local_mem[src_base + si];
>> +                    break;
>> +                }
>
> You could write this more compactly as
>                   stn_he_p(&s->local_mem[dst_base + di], 1 << format,
>                            ldn_he_p(&s->local_mem[src_base + si], 1
> << format));
>
> (which handles the length-cases for you and also doesn't rely on
> casting a uint8_t* giving you something correctly aligned for a
> wider access).

OK, I never know these and they are a bit hard to find because they are 
defined as glued together macros so grepping for it does not show the 
definition. Maybe adding a comment with the names where these are defined 
in bswap.h might help. stn_he_p seems to ultimately call __builtin_memcpy 
that probably does the same direct assignment for short values. I wonder 
how much overhead that has going through all the function calls but 
hopefully those are inlined.

Regards,
BALATON Zoltan

>> +                break;
>> +            }
>>              /* Check for overlaps, this could be made more exact */
>>              uint32_t sb, se, db, de;
>>              sb = src_base + src_x + src_y * (width + src_pitch);
>> @@ -841,9 +860,24 @@ static void sm501_2d_operation(SM501State *s)
>>              color = cpu_to_le16(color);
>>          }
>>
>> -        pixman_fill((uint32_t *)&s->local_mem[dst_base],
>> -                    dst_pitch * (1 << format) / sizeof(uint32_t),
>> -                    8 * (1 << format), dst_x, dst_y, width, height, color);
>> +        if (width == 1 && height == 1) {
>> +            unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
>> +            switch (format) {
>> +            case 0:
>> +                s->local_mem[dst_base + i] = color & 0xff;
>> +                break;
>> +            case 1:
>> +                *(uint16_t *)&s->local_mem[dst_base + i] = color & 0xffff;
>> +                break;
>> +            case 2:
>> +                *(uint32_t *)&s->local_mem[dst_base + i] = color;
>> +                break;
>> +            }
>
>               stn_he_p(&s->local_mem[dst_base + i], 1 << format, color);
diff mbox series

Patch

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 3397ca9fbf..59693fbb5c 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -793,6 +793,25 @@  static void sm501_2d_operation(SM501State *s)
                 src_x == dst_x && src_y == dst_y) {
                 break;
             }
+            /* Some clients also do 1 pixel blits, avoid overhead for these */
+            if (width == 1 && height == 1) {
+                unsigned int si = (src_x + src_y * src_pitch) * (1 << format);
+                unsigned int di = (dst_x + dst_y * dst_pitch) * (1 << format);
+                switch (format) {
+                case 0:
+                    s->local_mem[dst_base + di] = s->local_mem[src_base + si];
+                    break;
+                case 1:
+                    *(uint16_t *)&s->local_mem[dst_base + di] =
+                                    *(uint16_t *)&s->local_mem[src_base + si];
+                    break;
+                case 2:
+                    *(uint32_t *)&s->local_mem[dst_base + di] =
+                                    *(uint32_t *)&s->local_mem[src_base + si];
+                    break;
+                }
+                break;
+            }
             /* Check for overlaps, this could be made more exact */
             uint32_t sb, se, db, de;
             sb = src_base + src_x + src_y * (width + src_pitch);
@@ -841,9 +860,24 @@  static void sm501_2d_operation(SM501State *s)
             color = cpu_to_le16(color);
         }
 
-        pixman_fill((uint32_t *)&s->local_mem[dst_base],
-                    dst_pitch * (1 << format) / sizeof(uint32_t),
-                    8 * (1 << format), dst_x, dst_y, width, height, color);
+        if (width == 1 && height == 1) {
+            unsigned int i = (dst_x + dst_y * dst_pitch) * (1 << format);
+            switch (format) {
+            case 0:
+                s->local_mem[dst_base + i] = color & 0xff;
+                break;
+            case 1:
+                *(uint16_t *)&s->local_mem[dst_base + i] = color & 0xffff;
+                break;
+            case 2:
+                *(uint32_t *)&s->local_mem[dst_base + i] = color;
+                break;
+            }
+        } else {
+            pixman_fill((uint32_t *)&s->local_mem[dst_base],
+                        dst_pitch * (1 << format) / sizeof(uint32_t),
+                        8 * (1 << format), dst_x, dst_y, width, height, color);
+        }
         break;
     }
     default: