diff mbox series

ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x

Message ID 20200504192954.1387-1-luoyonggang@gmail.com (mailing list archive)
State New, archived
Headers show
Series ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x | expand

Commit Message

Yonggang Luo May 4, 2020, 7:29 p.m. UTC
From: Yonggang Luo <luoyonggang@gmail.com>

Just post as an idea to improve PPC fp performance.
With this idea, we have no need to adjust the helper orders.

Signed-off-by: Yonggang Luo <luoyonggang@gmail.com>
---
 target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

Comments

Yonggang Luo May 4, 2020, 8:02 p.m. UTC | #1
Bench result;
orignal:
-> FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 27.768 sec
MFLOPS: 38.65
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 28.359 sec
MFLOPS: 37.84


soft-hard-float:

GCC version: 4.3.3
Ops count: 1073217024
Time spent: 14.874 sec
MFLOPS: 72.15
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 14.249 sec
MFLOPS: 75.32

direct-hard-float:

-> FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 13.021 sec
MFLOPS: 82.42
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 12.472 sec
MFLOPS: 86.05
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 11.803 sec
MFLOPS: 90.93
FLOPS 3.00
GCC version: 4.3.3
Ops count: 1073217024
Time spent: 11.945 sec
MFLOPS: 89.85

bench program:

```
#include <stdio.h>
#include <stdlib.h>
#ifdef __vxworks
#include <sys/resource.h>
#include <vxworks.h>
#include <timers.h>
#include <time.h>
#elif defined(_MSC_VER)
#include <Windows.h>
#include <time.h>
#else
#include <time.h>
#endif
/*
cl -O2 test_flops.c
gcc -O2 test_flops.c -o test_flops

*/
#ifndef DIM
#define DIM 1024
const long long int nop = 1073217024;
#else
#define COUNT
long long int nop = 0;
#endif

void printm(double A[DIM][DIM])
{
int i,j;
for (i=0; i<DIM; i++) {
for (j=0; j<DIM; j++)
printf("%6.3f", A[i][j]);
printf("\n");
}
}

void initm(double A[DIM][DIM])
{
int i,j;
srand(38741);
for (i = 0; i < DIM; i++)
for (j = 0; j < DIM; j++)
A[i][j] = (double)rand() / (double)RAND_MAX - 0.5;
}

void dge(double A[DIM][DIM])
{
int i, j, k;
double c;
for (k = 1; k < DIM; k++) {
for (i = k; i < DIM; i++) {
c = A[i][k-1] / A[k-1][k-1];
#ifdef COUNT
nop += 1;
#endif
for (j = 0; j < DIM; j++) {
A[i][j] -= c * A[k-1][j];
#ifdef COUNT
nop += 2;
#endif
}
}
}
}

double X[DIM][DIM];

/*
 * return a timestamp with sub-second precision
 * QueryPerformanceCounter and clock_gettime have an undefined starting
point (null/zero)
 * and can wrap around, i.e. be nulled again.
 */
double get_seconds()
{
#ifdef _MSC_VER
  static LARGE_INTEGER frequency;
  if (frequency.QuadPart == 0)
    QueryPerformanceFrequency(&frequency);
  LARGE_INTEGER now;
  QueryPerformanceCounter(&now);
  return (now.QuadPart * 1.0) / frequency.QuadPart;
#else
  struct timespec now;
  clock_gettime(CLOCK_REALTIME, &now);
  return now.tv_sec + now.tv_nsec * 1e-9;
#endif
}

int main (int argc, char **argv)
{
double a = 1.0;
double b = 2.0;
double c = a + b;
double t;
int count = 1;
int i;
printf("FLOPS %.2lf\n", c);
#ifdef _MSC_VER
printf("MSC_VER version: %d\n", _MSC_VER);
#else
printf("GCC version: " __VERSION__ "\n");
#endif
initm(X);
t = get_seconds();
#ifndef __vxworks
if (argc > 1) {
sscanf(argv[1], "%d", &count);
}
#endif
for (i = 0; i < count; i += 1) {
dge(X);
}
t = get_seconds() - t;
printf("Ops count: %llu\n", nop * count);
printf("Time spent: %.3lf sec\n", t);
printf("MFLOPS: %.2f\n", 1e-6 * nop * count / t );
#ifdef PRINTM
printm(X);
#endif
return 0;
}
```

On Tue, May 5, 2020 at 3:30 AM <luoyonggang@gmail.com> wrote:

> From: Yonggang Luo <luoyonggang@gmail.com>
>
> Just post as an idea to improve PPC fp performance.
> With this idea, we have no need to adjust the helper orders.
>
> Signed-off-by: Yonggang Luo <luoyonggang@gmail.com>
> ---
>  target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>
> diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
> index 2bd49a2cdf..79051e4540 100644
> --- a/target/ppc/fpu_helper.c
> +++ b/target/ppc/fpu_helper.c
> @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env,
> bool set_fpcc,
>  /* fadd - fadd. */
>  float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd + u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_add(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1,
> float64 arg2)
>  /* fsub - fsub. */
>  float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd - u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_sub(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env,
> bool set_fprc,
>  /* fmul - fmul. */
>  float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd * u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_mul(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env,
> bool set_fprc,
>  /* fdiv - fdiv. */
>  float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd / u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_div(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> --
> 2.23.0.windows.1
>
>
Richard Henderson May 4, 2020, 10:10 p.m. UTC | #2
On 5/4/20 12:29 PM, luoyonggang@gmail.com wrote:

> Re: [PATCH] ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x than the one enable hard-float in soft-float.c Both version are not considerate inexact fp exception yet.

Use a return after the one sentence title to separate it from the body of the
description.


>  float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd + u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }

First, you need to verify that the current rounding mode is
float_round_nearest_even.  Otherwise you are actively computing wrong results
for other rounding modes.

Second, including zero result in your acceptance test misses out on underflow
exceptions.

Third, what is your plan for inexact?  There's no point in continuing this
thread unless you fill in the TODO a bit more.

https://cafehayek.com/wp-content/uploads/2014/03/miracle_cartoon.jpg


r~
Aleksandar Markovic May 4, 2020, 11:03 p.m. UTC | #3
пон, 4. мај 2020. у 21:31 <luoyonggang@gmail.com> је написао/ла:
>
> From: Yonggang Luo <luoyonggang@gmail.com>
>
> Just post as an idea to improve PPC fp performance.
> With this idea, we have no need to adjust the helper orders.
>
> Signed-off-by: Yonggang Luo <luoyonggang@gmail.com>
> ---
>  target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>
> diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
> index 2bd49a2cdf..79051e4540 100644
> --- a/target/ppc/fpu_helper.c
> +++ b/target/ppc/fpu_helper.c
> @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, bool set_fpcc,
>  /* fadd - fadd. */
>  float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd + u2.nd;

Besides what Richard mentioned, you neglect here "flush-denormals-to-zero"
property of FPUs. You implicitly assume that the host has the same behavior
as the target (ppc). But that simply may not be the case, leading to the wrong
result.

Yours,
Aleksandar

> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_add(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
>  /* fsub - fsub. */
>  float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd - u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_sub(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, bool set_fprc,
>  /* fmul - fmul. */
>  float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd * u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_mul(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, bool set_fprc,
>  /* fdiv - fdiv. */
>  float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2)
>  {
> +    CPU_DoubleU u1, u2;
> +
> +    u1.d = arg1;
> +    u2.d = arg2;
> +    CPU_DoubleU retDouble;
> +    retDouble.nd = u1.nd / u2.nd;
> +    if (likely(float64_is_zero_or_normal(retDouble.d)))
> +    {
> +        /* TODO: Handling inexact */
> +        return retDouble.d;
> +    }
>      float64 ret = float64_div(arg1, arg2, &env->fp_status);
>      int status = get_float_exception_flags(&env->fp_status);
>
> --
> 2.23.0.windows.1
>
>
diff mbox series

Patch

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 2bd49a2cdf..79051e4540 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -926,6 +926,17 @@  static void float_invalid_op_addsub(CPUPPCState *env, bool set_fpcc,
 /* fadd - fadd. */
 float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
 {
+    CPU_DoubleU u1, u2;
+
+    u1.d = arg1;
+    u2.d = arg2;
+    CPU_DoubleU retDouble;
+    retDouble.nd = u1.nd + u2.nd;
+    if (likely(float64_is_zero_or_normal(retDouble.d)))
+    {
+        /* TODO: Handling inexact */
+        return retDouble.d;
+    }
     float64 ret = float64_add(arg1, arg2, &env->fp_status);
     int status = get_float_exception_flags(&env->fp_status);
 
@@ -941,6 +952,17 @@  float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
 /* fsub - fsub. */
 float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2)
 {
+    CPU_DoubleU u1, u2;
+
+    u1.d = arg1;
+    u2.d = arg2;
+    CPU_DoubleU retDouble;
+    retDouble.nd = u1.nd - u2.nd;
+    if (likely(float64_is_zero_or_normal(retDouble.d)))
+    {
+        /* TODO: Handling inexact */
+        return retDouble.d;
+    }
     float64 ret = float64_sub(arg1, arg2, &env->fp_status);
     int status = get_float_exception_flags(&env->fp_status);
 
@@ -967,6 +989,17 @@  static void float_invalid_op_mul(CPUPPCState *env, bool set_fprc,
 /* fmul - fmul. */
 float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2)
 {
+    CPU_DoubleU u1, u2;
+
+    u1.d = arg1;
+    u2.d = arg2;
+    CPU_DoubleU retDouble;
+    retDouble.nd = u1.nd * u2.nd;
+    if (likely(float64_is_zero_or_normal(retDouble.d)))
+    {
+        /* TODO: Handling inexact */
+        return retDouble.d;
+    }
     float64 ret = float64_mul(arg1, arg2, &env->fp_status);
     int status = get_float_exception_flags(&env->fp_status);
 
@@ -997,6 +1030,17 @@  static void float_invalid_op_div(CPUPPCState *env, bool set_fprc,
 /* fdiv - fdiv. */
 float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2)
 {
+    CPU_DoubleU u1, u2;
+
+    u1.d = arg1;
+    u2.d = arg2;
+    CPU_DoubleU retDouble;
+    retDouble.nd = u1.nd / u2.nd;
+    if (likely(float64_is_zero_or_normal(retDouble.d)))
+    {
+        /* TODO: Handling inexact */
+        return retDouble.d;
+    }
     float64 ret = float64_div(arg1, arg2, &env->fp_status);
     int status = get_float_exception_flags(&env->fp_status);