Message ID | 20241215104508.191237-13-geert@linux-m68k.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Export feature and large ellipses support | expand |
On 12/15/24 11:45, Geert Uytterhoeven wrote: > "test002" crashes when run with a display resolution of e.g. 2560x1440 > pixels, due to 32-bit overflow in the ellipse drawing routine. > > Fix this by creating a copy that uses 64-bit arithmetic. Use a > heuristic to pick either the 32-bit or the 64-bit version, to avoid the > overhead of the 64-bit version on small systems with small displays. I see you always build the 32- and 64-bit versions, so when you mean overhead you mean runtime overhead, not compiled binary size overhead. So, just wondering: Did you maybe measured how much slower the 64-bit version is on slow 32-bit systems? I'm fine with your decision to build both, but I'm wondering if it's really necessary to keep two versions for a "test tool"? Helge > Replace (unsigned) int by u32/s32 in the 32-bit version for clarity. > > Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> > --- > drawops/generic.c | 127 +++++++++++++++++++++++++++++++++++++++++----- > include/types.h | 3 ++ > 2 files changed, 116 insertions(+), 14 deletions(-) > > diff --git a/drawops/generic.c b/drawops/generic.c > index 5fd971b59bc698fe..c4cfad3223773a23 100644 > --- a/drawops/generic.c > +++ b/drawops/generic.c > @@ -305,8 +305,98 @@ static void fill_ellipse_points(u32 cx, u32 cy, u32 x, u32 y, pixel_t pixel) > } > } > > -static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > - draw_func_t draw_inner, draw_func_t draw_outer) > +static void do_ellipse_32(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > + draw_func_t draw_inner, draw_func_t draw_outer) > +{ > + u32 a2 = a*a; > + u32 b2 = b*b; > + > + if (a <= b) { > + u32 x1 = 0; > + u32 y1 = b; > + s32 S = a2*(1-2*b)+2*b2; > + s32 T = b2-2*a2*(2*b-1); > + u32 dT1 = 4*b2; > + u32 dS1 = dT1+2*b2; > + s32 dS2 = -4*a2*(b-1); > + s32 dT2 = dS2+2*a2; > + > + while (1) { > + if (S < 0) { > + if (draw_inner) > + draw_inner(x, y, x1, y1, pixel); > + S += dS1; > + T += dT1; > + dS1 += 4*b2; > + dT1 += 4*b2; > + x1++; > + } else if (T < 0) { > + draw_outer(x, y, x1, y1, pixel); > + if (y1 == 0) > + break; > + S += dS1+dS2; > + T += dT1+dT2; > + dS1 += 4*b2; > + dT1 += 4*b2; > + dS2 += 4*a2; > + dT2 += 4*a2; > + x1++; > + y1--; > + } else { > + draw_outer(x, y, x1, y1, pixel); > + if (y1 == 0) > + break; > + S += dS2; > + T += dT2; > + dS2 += 4*a2; > + dT2 += 4*a2; > + y1--; > + } > + } > + } else { > + u32 x1 = a; > + u32 y1 = 0; > + s32 S = b2*(1-2*a)+2*a2; > + s32 T = a2-2*b2*(2*a-1); > + u32 dT1 = 4*a2; > + u32 dS1 = dT1+2*a2; > + s32 dS2 = -4*b2*(a-1); > + s32 dT2 = dS2+2*b2; > + > + draw_outer(x, y, x1, y1, pixel); > + do { > + if (S < 0) { > + S += dS1; > + T += dT1; > + dS1 += 4*a2; > + dT1 += 4*a2; > + y1++; > + draw_outer(x, y, x1, y1, pixel); > + } else if (T < 0) { > + S += dS1+dS2; > + T += dT1+dT2; > + dS1 += 4*a2; > + dT1 += 4*a2; > + dS2 += 4*b2; > + dT2 += 4*b2; > + x1--; > + y1++; > + draw_outer(x, y, x1, y1, pixel); > + } else { > + S += dS2; > + T += dT2; > + dS2 += 4*b2; > + dT2 += 4*b2; > + x1--; > + if (draw_inner) > + draw_inner(x, y, x1, y1, pixel); > + } > + } while (x1 > 0); > + } > +} > + > +static void do_ellipse_64(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > + draw_func_t draw_inner, draw_func_t draw_outer) > { > u32 a2 = a*a; > u32 b2 = b*b; > @@ -314,12 +404,12 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > if (a <= b) { > u32 x1 = 0; > u32 y1 = b; > - int S = a2*(1-2*b)+2*b2; > - int T = b2-2*a2*(2*b-1); > - unsigned int dT1 = 4*b2; > - unsigned int dS1 = dT1+2*b2; > - int dS2 = -4*a2*(b-1); > - int dT2 = dS2+2*a2; > + s64 S = a2*(1-2LL*b)+2LL*b2; > + s64 T = b2-2LL*a2*(2LL*b-1); > + u64 dT1 = 4*b2; > + u64 dS1 = dT1+2*b2; > + s64 dS2 = -4LL*a2*(b-1); > + s64 dT2 = dS2+2*a2; > > while (1) { > if (S < 0) { > @@ -356,12 +446,12 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > } else { > u32 x1 = a; > u32 y1 = 0; > - int S = b2*(1-2*a)+2*a2; > - int T = a2-2*b2*(2*a-1); > - unsigned int dT1 = 4*a2; > - unsigned int dS1 = dT1+2*a2; > - int dS2 = -4*b2*(a-1); > - int dT2 = dS2+2*b2; > + s64 S = b2*(1-2LL*a)+2LL*a2; > + s64 T = a2-2LL*b2*(2LL*a-1); > + u64 dT1 = 4*a2; > + u64 dS1 = dT1+2*a2; > + s64 dS2 = -4LL*b2*(a-1); > + s64 dT2 = dS2+2*b2; > > draw_outer(x, y, x1, y1, pixel); > do { > @@ -395,6 +485,15 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > } > } > > +static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, > + draw_func_t draw_inner, draw_func_t draw_outer) > +{ > + if ((a + 576) * (b + 576) < 1440000) > + do_ellipse_32(x, y, a, b, pixel, draw_inner, draw_outer); > + else > + do_ellipse_64(x, y, a, b, pixel, draw_inner, draw_outer); > +} > + > void generic_draw_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel) > { > if (a == b) > diff --git a/include/types.h b/include/types.h > index 9112ba6855b61eaa..0e3c76521469912f 100644 > --- a/include/types.h > +++ b/include/types.h > @@ -21,6 +21,9 @@ typedef unsigned short u16; > typedef unsigned int u32; > typedef unsigned long long u64; > > +typedef int s32; > +typedef long long s64; > + > #if defined(__LP64__) || defined(__alpha__) || defined(__ia64__) || \ > defined(__mips64__) || defined(__powerpc64__) || defined(__s390x__) || \ > defined(__sparc64__) || defined(__x86_64__) || defined(__hppa64__)
Hi Helge, On Sun, Dec 15, 2024 at 4:08 PM Helge Deller <deller@gmx.de> wrote: > On 12/15/24 11:45, Geert Uytterhoeven wrote: > > "test002" crashes when run with a display resolution of e.g. 2560x1440 > > pixels, due to 32-bit overflow in the ellipse drawing routine. > > > > Fix this by creating a copy that uses 64-bit arithmetic. Use a > > heuristic to pick either the 32-bit or the 64-bit version, to avoid the > > overhead of the 64-bit version on small systems with small displays. > > I see you always build the 32- and 64-bit versions, so when you mean > overhead you mean runtime overhead, not compiled binary size overhead. Exactly. > So, just wondering: > Did you maybe measured how much slower the 64-bit version is on slow 32-bit systems? > I'm fine with your decision to build both, but I'm wondering if it's really necessary > to keep two versions for a "test tool"? On ARM Cortex-A9, draw_ellipse(400, 240, 300, 239, ...) with a dummy (empty) set_pixel() method using the 64-bit version takes 44% longer than the 32-bit version, so I think it is worthwhile to have both versions. Gr{oetje,eeting}s, Geert
On 12/18/24 17:29, Geert Uytterhoeven wrote: > Hi Helge, > > On Sun, Dec 15, 2024 at 4:08 PM Helge Deller <deller@gmx.de> wrote: >> On 12/15/24 11:45, Geert Uytterhoeven wrote: >>> "test002" crashes when run with a display resolution of e.g. 2560x1440 >>> pixels, due to 32-bit overflow in the ellipse drawing routine. >>> >>> Fix this by creating a copy that uses 64-bit arithmetic. Use a >>> heuristic to pick either the 32-bit or the 64-bit version, to avoid the >>> overhead of the 64-bit version on small systems with small displays. >> >> I see you always build the 32- and 64-bit versions, so when you mean >> overhead you mean runtime overhead, not compiled binary size overhead. > > Exactly. > >> So, just wondering: >> Did you maybe measured how much slower the 64-bit version is on slow 32-bit systems? >> I'm fine with your decision to build both, but I'm wondering if it's really necessary >> to keep two versions for a "test tool"? > > On ARM Cortex-A9, draw_ellipse(400, 240, 300, 239, ...) with a > dummy (empty) set_pixel() method using the 64-bit version takes 44% > longer than the 32-bit version, so I think it is worthwhile to have > both versions. Oh, I didn't expect that much. I agree it's worthwhile to have both versions. Helge
diff --git a/drawops/generic.c b/drawops/generic.c index 5fd971b59bc698fe..c4cfad3223773a23 100644 --- a/drawops/generic.c +++ b/drawops/generic.c @@ -305,8 +305,98 @@ static void fill_ellipse_points(u32 cx, u32 cy, u32 x, u32 y, pixel_t pixel) } } -static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, - draw_func_t draw_inner, draw_func_t draw_outer) +static void do_ellipse_32(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, + draw_func_t draw_inner, draw_func_t draw_outer) +{ + u32 a2 = a*a; + u32 b2 = b*b; + + if (a <= b) { + u32 x1 = 0; + u32 y1 = b; + s32 S = a2*(1-2*b)+2*b2; + s32 T = b2-2*a2*(2*b-1); + u32 dT1 = 4*b2; + u32 dS1 = dT1+2*b2; + s32 dS2 = -4*a2*(b-1); + s32 dT2 = dS2+2*a2; + + while (1) { + if (S < 0) { + if (draw_inner) + draw_inner(x, y, x1, y1, pixel); + S += dS1; + T += dT1; + dS1 += 4*b2; + dT1 += 4*b2; + x1++; + } else if (T < 0) { + draw_outer(x, y, x1, y1, pixel); + if (y1 == 0) + break; + S += dS1+dS2; + T += dT1+dT2; + dS1 += 4*b2; + dT1 += 4*b2; + dS2 += 4*a2; + dT2 += 4*a2; + x1++; + y1--; + } else { + draw_outer(x, y, x1, y1, pixel); + if (y1 == 0) + break; + S += dS2; + T += dT2; + dS2 += 4*a2; + dT2 += 4*a2; + y1--; + } + } + } else { + u32 x1 = a; + u32 y1 = 0; + s32 S = b2*(1-2*a)+2*a2; + s32 T = a2-2*b2*(2*a-1); + u32 dT1 = 4*a2; + u32 dS1 = dT1+2*a2; + s32 dS2 = -4*b2*(a-1); + s32 dT2 = dS2+2*b2; + + draw_outer(x, y, x1, y1, pixel); + do { + if (S < 0) { + S += dS1; + T += dT1; + dS1 += 4*a2; + dT1 += 4*a2; + y1++; + draw_outer(x, y, x1, y1, pixel); + } else if (T < 0) { + S += dS1+dS2; + T += dT1+dT2; + dS1 += 4*a2; + dT1 += 4*a2; + dS2 += 4*b2; + dT2 += 4*b2; + x1--; + y1++; + draw_outer(x, y, x1, y1, pixel); + } else { + S += dS2; + T += dT2; + dS2 += 4*b2; + dT2 += 4*b2; + x1--; + if (draw_inner) + draw_inner(x, y, x1, y1, pixel); + } + } while (x1 > 0); + } +} + +static void do_ellipse_64(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, + draw_func_t draw_inner, draw_func_t draw_outer) { u32 a2 = a*a; u32 b2 = b*b; @@ -314,12 +404,12 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, if (a <= b) { u32 x1 = 0; u32 y1 = b; - int S = a2*(1-2*b)+2*b2; - int T = b2-2*a2*(2*b-1); - unsigned int dT1 = 4*b2; - unsigned int dS1 = dT1+2*b2; - int dS2 = -4*a2*(b-1); - int dT2 = dS2+2*a2; + s64 S = a2*(1-2LL*b)+2LL*b2; + s64 T = b2-2LL*a2*(2LL*b-1); + u64 dT1 = 4*b2; + u64 dS1 = dT1+2*b2; + s64 dS2 = -4LL*a2*(b-1); + s64 dT2 = dS2+2*a2; while (1) { if (S < 0) { @@ -356,12 +446,12 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, } else { u32 x1 = a; u32 y1 = 0; - int S = b2*(1-2*a)+2*a2; - int T = a2-2*b2*(2*a-1); - unsigned int dT1 = 4*a2; - unsigned int dS1 = dT1+2*a2; - int dS2 = -4*b2*(a-1); - int dT2 = dS2+2*b2; + s64 S = b2*(1-2LL*a)+2LL*a2; + s64 T = a2-2LL*b2*(2LL*a-1); + u64 dT1 = 4*a2; + u64 dS1 = dT1+2*a2; + s64 dS2 = -4LL*b2*(a-1); + s64 dT2 = dS2+2*b2; draw_outer(x, y, x1, y1, pixel); do { @@ -395,6 +485,15 @@ static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, } } +static void do_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel, + draw_func_t draw_inner, draw_func_t draw_outer) +{ + if ((a + 576) * (b + 576) < 1440000) + do_ellipse_32(x, y, a, b, pixel, draw_inner, draw_outer); + else + do_ellipse_64(x, y, a, b, pixel, draw_inner, draw_outer); +} + void generic_draw_ellipse(u32 x, u32 y, u32 a, u32 b, pixel_t pixel) { if (a == b) diff --git a/include/types.h b/include/types.h index 9112ba6855b61eaa..0e3c76521469912f 100644 --- a/include/types.h +++ b/include/types.h @@ -21,6 +21,9 @@ typedef unsigned short u16; typedef unsigned int u32; typedef unsigned long long u64; +typedef int s32; +typedef long long s64; + #if defined(__LP64__) || defined(__alpha__) || defined(__ia64__) || \ defined(__mips64__) || defined(__powerpc64__) || defined(__s390x__) || \ defined(__sparc64__) || defined(__x86_64__) || defined(__hppa64__)
"test002" crashes when run with a display resolution of e.g. 2560x1440 pixels, due to 32-bit overflow in the ellipse drawing routine. Fix this by creating a copy that uses 64-bit arithmetic. Use a heuristic to pick either the 32-bit or the 64-bit version, to avoid the overhead of the 64-bit version on small systems with small displays. Replace (unsigned) int by u32/s32 in the 32-bit version for clarity. Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> --- drawops/generic.c | 127 +++++++++++++++++++++++++++++++++++++++++----- include/types.h | 3 ++ 2 files changed, 116 insertions(+), 14 deletions(-)