diff mbox series

[03/11] softfloat: Introduce float_flag_inorm_denormal

Message ID 20210527041405.391567-4-richard.henderson@linaro.org (mailing list archive)
State New, archived
Headers show
Series softfloat: Improve denormal handling | expand

Commit Message

Richard Henderson May 27, 2021, 4:13 a.m. UTC
Create a new exception flag for reporting input denormals that are not
flushed to zero, they are normalized and treated as normal numbers.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-types.h | 15 ++++---
 fpu/softfloat.c               | 84 +++++++++++------------------------
 fpu/softfloat-parts.c.inc     |  1 +
 3 files changed, 36 insertions(+), 64 deletions(-)

Comments

Michael Morrell May 28, 2021, 5:41 p.m. UTC | #1
I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"?

Couldn't the code that sets these flags set just a single flag for all denormal inputs and the code that checks these flags check that single flag
combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do?

   Michael

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Wednesday, May 26, 2021 9:14 PM
To: qemu-devel@nongnu.org
Cc: Michael Morrell <mmorrell@tachyum.com>; alex.bennee@linaro.org
Subject: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal

Create a new exception flag for reporting input denormals that are not flushed to zero, they are normalized and treated as normal numbers.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-types.h | 15 ++++---
 fpu/softfloat.c               | 84 +++++++++++------------------------
 fpu/softfloat-parts.c.inc     |  1 +
 3 files changed, 36 insertions(+), 64 deletions(-)

diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h index e2d70ff556..174100e50e 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) {
  */
 
 enum {
-    float_flag_invalid   =  1,
-    float_flag_divbyzero =  4,
-    float_flag_overflow  =  8,
-    float_flag_underflow = 16,
-    float_flag_inexact   = 32,
-    float_flag_iflush_denormal = 64,
-    float_flag_oflush_denormal = 128
+    float_flag_invalid         = 0x0001,
+    float_flag_divbyzero       = 0x0002,
+    float_flag_overflow        = 0x0004,
+    float_flag_underflow       = 0x0008,
+    float_flag_inexact         = 0x0010,
+    float_flag_inorm_denormal  = 0x0020,  /* denormal input, normalized */
+    float_flag_iflush_denormal = 0x0040,  /* denormal input, flushed to zero */
+    float_flag_oflush_denormal = 0x0080,  /* denormal result, flushed 
+ to zero */
 };
 
 /*
diff --git a/fpu/softfloat.c b/fpu/softfloat.c index cb077cf111..e54cdb274d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -126,61 +126,23 @@ this code that are retained.
  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
  * and the result is < the minimum normal.
  */
-#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
+
+#define GEN_INPUT_FLUSH(name, soft_t)                                   \
     static inline void name(soft_t *a, float_status *s)                 \
     {                                                                   \
         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
-            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
-                                     soft_t ## _is_neg(*a));            \
-            float_raise(float_flag_iflush_denormal, s);                  \
+            if (s->flush_inputs_to_zero) {                              \
+                *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a));     \
+                float_raise(float_flag_iflush_denormal, s);             \
+            } else {                                                    \
+                float_raise(float_flag_inorm_denormal, s);              \
+            }                                                           \
         }                                                               \
     }
 
-GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) -GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) -#undef GEN_INPUT_FLUSH__NOCHECK
-
-#define GEN_INPUT_FLUSH1(name, soft_t)                  \
-    static inline void name(soft_t *a, float_status *s) \
-    {                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {         \
-            return;                                     \
-        }                                               \
-        soft_t ## _input_flush__nocheck(a, s);          \
-    }
-
-GEN_INPUT_FLUSH1(float32_input_flush1, float32) -GEN_INPUT_FLUSH1(float64_input_flush1, float64) -#undef GEN_INPUT_FLUSH1
-
-#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
-    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
-    {                                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {                         \
-            return;                                                     \
-        }                                                               \
-        soft_t ## _input_flush__nocheck(a, s);                          \
-        soft_t ## _input_flush__nocheck(b, s);                          \
-    }
-
-GEN_INPUT_FLUSH2(float32_input_flush2, float32) -GEN_INPUT_FLUSH2(float64_input_flush2, float64) -#undef GEN_INPUT_FLUSH2
-
-#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
-    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
-    {                                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {                         \
-            return;                                                     \
-        }                                                               \
-        soft_t ## _input_flush__nocheck(a, s);                          \
-        soft_t ## _input_flush__nocheck(b, s);                          \
-        soft_t ## _input_flush__nocheck(c, s);                          \
-    }
-
-GEN_INPUT_FLUSH3(float32_input_flush3, float32) -GEN_INPUT_FLUSH3(float64_input_flush3, float64) -#undef GEN_INPUT_FLUSH3
+GEN_INPUT_FLUSH(float32_input_flush, float32) 
+GEN_INPUT_FLUSH(float64_input_flush, float64) #undef GEN_INPUT_FLUSH
 
 /*
  * Choose whether to use fpclassify or float32/64_* primitives in the generated @@ -353,7 +315,8 @@ float32_gen2(float32 xa, float32 xb, float_status *s,
         goto soft;
     }
 
-    float32_input_flush2(&ua.s, &ub.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
     if (unlikely(!pre(ua, ub))) {
         goto soft;
     }
@@ -384,7 +347,8 @@ float64_gen2(float64 xa, float64 xb, float_status *s,
         goto soft;
     }
 
-    float64_input_flush2(&ua.s, &ub.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
     if (unlikely(!pre(ua, ub))) {
         goto soft;
     }
@@ -2161,7 +2125,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
         goto soft;
     }
 
-    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
+    float32_input_flush(&uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
         goto soft;
     }
@@ -2232,7 +2198,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
         goto soft;
     }
 
-    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
+    float64_input_flush(&uc.s, s);
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
         goto soft;
     }
@@ -3988,7 +3956,8 @@ float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float32_input_flush2(&ua.s, &ub.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater; @@ -4038,7 +4007,8 @@ float64_hs_compare(float64 xa, float64 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float64_input_flush2(&ua.s, &ub.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater; @@ -4230,7 +4200,7 @@ float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
         goto soft;
     }
 
-    float32_input_flush1(&ua.s, s);
+    float32_input_flush(&ua.s, s);
     if (QEMU_HARDFLOAT_1F32_USE_FP) {
         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                        fpclassify(ua.h) == FP_ZERO) || @@ -4257,7 +4227,7 @@ float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
         goto soft;
     }
 
-    float64_input_flush1(&ua.s, s);
+    float64_input_flush(&ua.s, s);
     if (QEMU_HARDFLOAT_1F64_USE_FP) {
         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                        fpclassify(ua.h) == FP_ZERO) || diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc index 72e2b24539..16d4425419 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -119,6 +119,7 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
             int shift = frac_normalize(p);
             p->cls = float_class_normal;
             p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
+            float_raise(float_flag_inorm_denormal, status);
         }
     } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
         p->cls = float_class_normal;
--
2.25.1
Richard Henderson May 29, 2021, 3:21 p.m. UTC | #2
On 5/28/21 10:41 AM, Michael Morrell wrote:
> I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"?
> 
> Couldn't the code that sets these flags set just a single flag for all denormal inputs and the code that checks these flags check that single flag
> combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do?

The thing that you're missing is that many guests leave the accumulated 
softfloat exceptions in the float_status structure until the guest FPSCR 
register is read. Unless the guest needs to raise an exception immediately, 
there's no reason to do otherwise.

With this setup, you have no temporal connection between "any denormal" and 
"flush-to-zero is set", thus two flags.


r~
Alex Bennée June 7, 2021, 3:35 p.m. UTC | #3
Richard Henderson <richard.henderson@linaro.org> writes:

> Create a new exception flag for reporting input denormals that are not
> flushed to zero, they are normalized and treated as normal numbers.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/fpu/softfloat-types.h | 15 ++++---
>  fpu/softfloat.c               | 84 +++++++++++------------------------
>  fpu/softfloat-parts.c.inc     |  1 +
>  3 files changed, 36 insertions(+), 64 deletions(-)
>
> diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
> index e2d70ff556..174100e50e 100644
> --- a/include/fpu/softfloat-types.h
> +++ b/include/fpu/softfloat-types.h
> @@ -143,13 +143,14 @@ typedef enum __attribute__((__packed__)) {
>   */
>  
>  enum {
> -    float_flag_invalid   =  1,
> -    float_flag_divbyzero =  4,
> -    float_flag_overflow  =  8,
> -    float_flag_underflow = 16,
> -    float_flag_inexact   = 32,
> -    float_flag_iflush_denormal = 64,
> -    float_flag_oflush_denormal = 128
> +    float_flag_invalid         = 0x0001,
> +    float_flag_divbyzero       = 0x0002,
> +    float_flag_overflow        = 0x0004,
> +    float_flag_underflow       = 0x0008,
> +    float_flag_inexact         = 0x0010,
> +    float_flag_inorm_denormal  = 0x0020,  /* denormal input, normalized */
> +    float_flag_iflush_denormal = 0x0040,  /* denormal input, flushed to zero */
> +    float_flag_oflush_denormal = 0x0080,  /* denormal result, flushed to zero */
>  };
>  
>  /*
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index cb077cf111..e54cdb274d 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -126,61 +126,23 @@ this code that are retained.
>   * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
>   * and the result is < the minimum normal.
>   */
> -#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
> +
> +#define GEN_INPUT_FLUSH(name, soft_t)                                   \
>      static inline void name(soft_t *a, float_status *s)                 \
>      {                                                                   \
>          if (unlikely(soft_t ## _is_denormal(*a))) {                     \
> -            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
> -                                     soft_t ## _is_neg(*a));            \
> -            float_raise(float_flag_iflush_denormal, s);                  \
> +            if (s->flush_inputs_to_zero) {                              \
> +                *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a));     \
> +                float_raise(float_flag_iflush_denormal, s);             \
> +            } else {                                                    \
> +                float_raise(float_flag_inorm_denormal, s);              \
> +            }                                                           \
>          }                                                               \
>      }

So I'm guessing Emilio had the original flush code split was to avoid
multiple checks against s->flush_inputs_to_zero in the code. The was
possibly a good reason, comparing the before/after of float32_mul:

  Dump of assembler code for function float32_mul:
     0x0000000000934240 <+0>:	movzbl 0x1(%rdx),%eax
     0x0000000000934244 <+4>:	test   $0x20,%al
     0x0000000000934246 <+6>:	je     0x9342b0 <float32_mul+112>
     0x0000000000934248 <+8>:	cmpb   $0x0,(%rdx)
     0x000000000093424b <+11>:	jne    0x9342b0 <float32_mul+112>
     0x000000000093424d <+13>:	cmpb   $0x0,0x5(%rdx)
     0x0000000000934251 <+17>:	jne    0x9342d0 <float32_mul+144>
     0x0000000000934253 <+19>:	mov    %edi,%eax
     0x0000000000934255 <+21>:	shr    $0x17,%eax
     0x0000000000934258 <+24>:	add    $0x1,%eax
     0x000000000093425b <+27>:	test   $0xfe,%al
     0x000000000093425d <+29>:	je     0x9342a8 <float32_mul+104>
     0x000000000093425f <+31>:	mov    %esi,%eax
     0x0000000000934261 <+33>:	shr    $0x17,%eax
     0x0000000000934264 <+36>:	add    $0x1,%eax
     0x0000000000934267 <+39>:	test   $0xfe,%al
     0x0000000000934269 <+41>:	jne    0x934273 <float32_mul+51>
     0x000000000093426b <+43>:	test   $0x7fffffff,%esi
     0x0000000000934271 <+49>:	jne    0x9342b0 <float32_mul+112>
     0x0000000000934273 <+51>:	mov    %esi,-0xc(%rsp)
     0x0000000000934277 <+55>:	movss  -0xc(%rsp),%xmm0
     0x000000000093427d <+61>:	mov    %edi,-0xc(%rsp)
     0x0000000000934281 <+65>:	movss  -0xc(%rsp),%xmm2
     0x0000000000934287 <+71>:	mulss  %xmm2,%xmm0
     0x000000000093428b <+75>:	movd   %xmm0,%eax
     0x000000000093428f <+79>:	andps  0x3b805a(%rip),%xmm0        # 0xcec2f0
     0x0000000000934296 <+86>:	ucomiss 0x3b8047(%rip),%xmm0        # 0xcec2e4
     0x000000000093429d <+93>:	jbe    0x9342b8 <float32_mul+120>
     0x000000000093429f <+95>:	orb    $0x8,0x1(%rdx)
     0x00000000009342a3 <+99>:	retq   
     0x00000000009342a4 <+100>:	nopl   0x0(%rax)
     0x00000000009342a8 <+104>:	test   $0x7fffffff,%edi
     0x00000000009342ae <+110>:	je     0x93425f <float32_mul+31>
     0x00000000009342b0 <+112>:	jmpq   0x9290d0 <soft_f32_mul>
     0x00000000009342b5 <+117>:	nopl   (%rax)
     0x00000000009342b8 <+120>:	movss  0x3b8020(%rip),%xmm1        # 0xcec2e0
     0x00000000009342c0 <+128>:	comiss %xmm0,%xmm1
     0x00000000009342c3 <+131>:	jae    0x934320 <float32_mul+224>
     0x00000000009342c5 <+133>:	retq   
     0x00000000009342c6 <+134>:	nopw   %cs:0x0(%rax,%rax,1)
     0x00000000009342d0 <+144>:	test   $0x7f800000,%edi
     0x00000000009342d6 <+150>:	jne    0x9342f0 <float32_mul+176>
     0x00000000009342d8 <+152>:	test   $0x7fffffff,%edi
     0x00000000009342de <+158>:	je     0x9342f0 <float32_mul+176>
     0x00000000009342e0 <+160>:	or     $0x40,%eax
     0x00000000009342e3 <+163>:	and    $0x80000000,%edi
     0x00000000009342e9 <+169>:	mov    %al,0x1(%rdx)
     0x00000000009342ec <+172>:	nopl   0x0(%rax)
     0x00000000009342f0 <+176>:	test   $0x7f800000,%esi
     0x00000000009342f6 <+182>:	jne    0x934253 <float32_mul+19>
     0x00000000009342fc <+188>:	test   $0x7fffffff,%esi
     0x0000000000934302 <+194>:	je     0x934253 <float32_mul+19>
     0x0000000000934308 <+200>:	and    $0x80000000,%esi
     0x000000000093430e <+206>:	orb    $0x40,0x1(%rdx)
     0x0000000000934312 <+210>:	jmpq   0x934253 <float32_mul+19>
     0x0000000000934317 <+215>:	nopw   0x0(%rax,%rax,1)
     0x0000000000934320 <+224>:	mov    %edi,%ecx
     0x0000000000934322 <+226>:	or     %esi,%ecx
     0x0000000000934324 <+228>:	and    $0x7fffffff,%ecx
     0x000000000093432a <+234>:	jne    0x9342b0 <float32_mul+112>
     0x000000000093432c <+236>:	jmp    0x9342c5 <float32_mul+133>
  End of assembler dump.

And after this change:

  Dump of assembler code for function float32_mul:
     0x0000000000895d60 <+0>:	movzbl 0x1(%rdx),%eax
     0x0000000000895d64 <+4>:	test   $0x10,%al
     0x0000000000895d66 <+6>:	je     0x895e30 <float32_mul+208>
     0x0000000000895d6c <+12>:	cmpb   $0x0,(%rdx)
     0x0000000000895d6f <+15>:	jne    0x895e30 <float32_mul+208>
     0x0000000000895d75 <+21>:	test   $0x7f800000,%edi
     0x0000000000895d7b <+27>:	jne    0x895da0 <float32_mul+64>
     0x0000000000895d7d <+29>:	test   $0x7fffffff,%edi
     0x0000000000895d83 <+35>:	je     0x895da0 <float32_mul+64>
     0x0000000000895d85 <+37>:	cmpb   $0x0,0x5(%rdx)
     0x0000000000895d89 <+41>:	je     0x895e60 <float32_mul+256>
     0x0000000000895d8f <+47>:	or     $0x40,%eax
     0x0000000000895d92 <+50>:	and    $0x80000000,%edi
     0x0000000000895d98 <+56>:	mov    %al,0x1(%rdx)
     0x0000000000895d9b <+59>:	nopl   0x0(%rax,%rax,1)
     0x0000000000895da0 <+64>:	test   $0x7f800000,%esi
     0x0000000000895da6 <+70>:	jne    0x895dd0 <float32_mul+112>
     0x0000000000895da8 <+72>:	test   $0x7fffffff,%esi
     0x0000000000895dae <+78>:	je     0x895dd0 <float32_mul+112>
     0x0000000000895db0 <+80>:	cmpb   $0x0,0x5(%rdx)
     0x0000000000895db4 <+84>:	movzbl 0x1(%rdx),%eax
     0x0000000000895db8 <+88>:	je     0x895e50 <float32_mul+240>
     0x0000000000895dbe <+94>:	or     $0x40,%eax
     0x0000000000895dc1 <+97>:	and    $0x80000000,%esi
     0x0000000000895dc7 <+103>:	mov    %al,0x1(%rdx)
     0x0000000000895dca <+106>:	nopw   0x0(%rax,%rax,1)
     0x0000000000895dd0 <+112>:	mov    %edi,%eax
     0x0000000000895dd2 <+114>:	shr    $0x17,%eax
     0x0000000000895dd5 <+117>:	add    $0x1,%eax
     0x0000000000895dd8 <+120>:	test   $0xfe,%al
     0x0000000000895dda <+122>:	je     0x895e28 <float32_mul+200>
     0x0000000000895ddc <+124>:	mov    %esi,%eax
     0x0000000000895dde <+126>:	shr    $0x17,%eax
     0x0000000000895de1 <+129>:	add    $0x1,%eax
     0x0000000000895de4 <+132>:	test   $0xfe,%al
     0x0000000000895de6 <+134>:	jne    0x895df0 <float32_mul+144>
     0x0000000000895de8 <+136>:	test   $0x7fffffff,%esi
     0x0000000000895dee <+142>:	jne    0x895e30 <float32_mul+208>
     0x0000000000895df0 <+144>:	mov    %esi,-0xc(%rsp)
     0x0000000000895df4 <+148>:	movss  -0xc(%rsp),%xmm0
     0x0000000000895dfa <+154>:	mov    %edi,-0xc(%rsp)
     0x0000000000895dfe <+158>:	movss  -0xc(%rsp),%xmm2
     0x0000000000895e04 <+164>:	mulss  %xmm2,%xmm0
     0x0000000000895e08 <+168>:	movd   %xmm0,%eax
     0x0000000000895e0c <+172>:	andps  0x46bb5d(%rip),%xmm0        # 0xd01970
     0x0000000000895e13 <+179>:	ucomiss 0x46bb4a(%rip),%xmm0        # 0xd01964
     0x0000000000895e1a <+186>:	jbe    0x895e38 <float32_mul+216>
     0x0000000000895e1c <+188>:	orb    $0x4,0x1(%rdx)
     0x0000000000895e20 <+192>:	retq   
     0x0000000000895e21 <+193>:	nopl   0x0(%rax)
     0x0000000000895e28 <+200>:	test   $0x7fffffff,%edi
     0x0000000000895e2e <+206>:	je     0x895ddc <float32_mul+124>
     0x0000000000895e30 <+208>:	jmpq   0x88a8c0 <soft_f32_mul>
     0x0000000000895e35 <+213>:	nopl   (%rax)
     0x0000000000895e38 <+216>:	movss  0x46bb20(%rip),%xmm1        # 0xd01960
     0x0000000000895e40 <+224>:	comiss %xmm0,%xmm1
     0x0000000000895e43 <+227>:	jae    0x895e70 <float32_mul+272>
     0x0000000000895e45 <+229>:	retq   
     0x0000000000895e46 <+230>:	nopw   %cs:0x0(%rax,%rax,1)
     0x0000000000895e50 <+240>:	or     $0x20,%eax
     0x0000000000895e53 <+243>:	mov    %al,0x1(%rdx)
     0x0000000000895e56 <+246>:	jmpq   0x895dd0 <float32_mul+112>
     0x0000000000895e5b <+251>:	nopl   0x0(%rax,%rax,1)
     0x0000000000895e60 <+256>:	or     $0x20,%eax
     0x0000000000895e63 <+259>:	mov    %al,0x1(%rdx)
     0x0000000000895e66 <+262>:	jmpq   0x895da0 <float32_mul+64>
     0x0000000000895e6b <+267>:	nopl   0x0(%rax,%rax,1)
     0x0000000000895e70 <+272>:	mov    %esi,%ecx
     0x0000000000895e72 <+274>:	or     %edi,%ecx
     0x0000000000895e74 <+276>:	and    $0x7fffffff,%ecx
     0x0000000000895e7a <+282>:	jne    0x895e30 <float32_mul+208>
     0x0000000000895e7c <+284>:	jmp    0x895e45 <float32_mul+229>
  End of assembler dump.

However I'm not sure how much of that increase is down to the change of
macro expansion and how much is due to the extra leg for the flushing.

Anyway other than that observation seems OK to me:

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Alex Bennée June 7, 2021, 4:28 p.m. UTC | #4
Alex Bennée <alex.bennee@linaro.org> writes:

> Richard Henderson <richard.henderson@linaro.org> writes:
>
>> Create a new exception flag for reporting input denormals that are not
>> flushed to zero, they are normalized and treated as normal numbers.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
<snip>
>
> Anyway other than that observation seems OK to me:
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

I of course meant:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Richard Henderson June 7, 2021, 4:41 p.m. UTC | #5
On 6/7/21 8:35 AM, Alex Bennée wrote:
> So I'm guessing Emilio had the original flush code split was to avoid
> multiple checks against s->flush_inputs_to_zero in the code. The was
> possibly a good reason, comparing the before/after of float32_mul:

I assumed that the most important thing now is that we test floatN_is_denormal 
only once -- the test for flush_inputs_to_zero is fairly trivial.

If you've got a better ordering of operations for this, do tell.


r~
Alex Bennée June 7, 2021, 5:19 p.m. UTC | #6
Richard Henderson <richard.henderson@linaro.org> writes:

> On 6/7/21 8:35 AM, Alex Bennée wrote:
>> So I'm guessing Emilio had the original flush code split was to avoid
>> multiple checks against s->flush_inputs_to_zero in the code. The was
>> possibly a good reason, comparing the before/after of float32_mul:
>
> I assumed that the most important thing now is that we test
> floatN_is_denormal only once -- the test for flush_inputs_to_zero is
> fairly trivial.
>
> If you've got a better ordering of operations for this, do tell.

What I really want is to know which instructions translate into the if
(s->flush_inputs_to_zero) and verifying that is only checked once. Maybe
I'm just suspicious of compilers ability to optimise things away...
Richard Henderson June 7, 2021, 8:52 p.m. UTC | #7
On 6/7/21 10:19 AM, Alex Bennée wrote:
>> If you've got a better ordering of operations for this, do tell.
> 
> What I really want is to know which instructions translate into the if
> (s->flush_inputs_to_zero) and verifying that is only checked once. Maybe
> I'm just suspicious of compilers ability to optimise things away...




>   Dump of assembler code for function float32_mul:
>      0x0000000000895d60 <+0>:	movzbl 0x1(%rdx),%eax
>      0x0000000000895d64 <+4>:	test   $0x10,%al
>      0x0000000000895d66 <+6>:	je     0x895e30 <float32_mul+208>

s->float_exception_flags & float_flag_inexact

>      0x0000000000895d6c <+12>:	cmpb   $0x0,(%rdx)
>      0x0000000000895d6f <+15>:	jne    0x895e30 <float32_mul+208>

s->float_rounding_mode == float_round_nearest_even

>      0x0000000000895d75 <+21>:	test   $0x7f800000,%edi
>      0x0000000000895d7b <+27>:	jne    0x895da0 <float32_mul+64>
>      0x0000000000895d7d <+29>:	test   $0x7fffffff,%edi
>      0x0000000000895d83 <+35>:	je     0x895da0 <float32_mul+64>

float32_is_denormal

>      0x0000000000895d85 <+37>:	cmpb   $0x0,0x5(%rdx)
>      0x0000000000895d89 <+41>:	je     0x895e60 <float32_mul+256>

s->flush_inputs_to_zero

>      0x0000000000895d8f <+47>:	or     $0x40,%eax
>      0x0000000000895d92 <+50>:	and    $0x80000000,%edi
>      0x0000000000895d98 <+56>:	mov    %al,0x1(%rdx)

flush-to-zero and set iflush_denormal

>      0x0000000000895da0 <+64>:	test   $0x7f800000,%esi
>      0x0000000000895da6 <+70>:	jne    0x895dd0 <float32_mul+112>
>      0x0000000000895da8 <+72>:	test   $0x7fffffff,%esi
>      0x0000000000895dae <+78>:	je     0x895dd0 <float32_mul+112>

float32_is_denormal (second operand)

>      0x0000000000895db0 <+80>:	cmpb   $0x0,0x5(%rdx)
>      0x0000000000895db4 <+84>:	movzbl 0x1(%rdx),%eax
>      0x0000000000895db8 <+88>:	je     0x895e50 <float32_mul+240>
>      0x0000000000895dbe <+94>:	or     $0x40,%eax
>      0x0000000000895dc1 <+97>:	and    $0x80000000,%esi

s->flush_inputs_to_zero,
flush-to-zero,
set iflush_denormal.

...

>      0x0000000000895e50 <+240>:	or     $0x20,%eax
>      0x0000000000895e53 <+243>:	mov    %al,0x1(%rdx)
>      0x0000000000895e56 <+246>:	jmpq   0x895dd0 <float32_mul+112>

set inorm_denormal (second operand)

>      0x0000000000895e60 <+256>:	or     $0x20,%eax
>      0x0000000000895e63 <+259>:	mov    %al,0x1(%rdx)
>      0x0000000000895e66 <+262>:	jmpq   0x895da0 <float32_mul+64>

set inorm_denormal (first operand)

There do seem to be 3 reads/writes to exception_flags for float_raise.


r~
Michael Morrell July 14, 2021, 4:44 p.m. UTC | #8
Just curious.  What's the expected timeline to get these denormal patches in the tree?

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Saturday, May 29, 2021 8:21 AM
To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org
Cc: alex.bennee@linaro.org
Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal

On 5/28/21 10:41 AM, Michael Morrell wrote:
> I'm probably missing something, but why do we need both "float_flag_inorm_denormal" and "float_flag_iflush_denormal"?
> 
> Couldn't the code that sets these flags set just a single flag for all 
> denormal inputs and the code that checks these flags check that single flag combined with the "flush_inputs_to_zero" flag to accomplish what the two separate "input denormal" flags do?

The thing that you're missing is that many guests leave the accumulated softfloat exceptions in the float_status structure until the guest FPSCR register is read. Unless the guest needs to raise an exception immediately, there's no reason to do otherwise.

With this setup, you have no temporal connection between "any denormal" and "flush-to-zero is set", thus two flags.


r~
Richard Henderson July 14, 2021, 4:57 p.m. UTC | #9
On 7/14/21 9:44 AM, Michael Morrell wrote:
> Just curious.  What's the expected timeline to get these denormal patches in the tree?

Next development cycle, at minimum.  I need to fix the problems vs NaNs that you identified.


r~
Michael Morrell July 14, 2021, 5:50 p.m. UTC | #10
OK, thanks for the update.  I also appreciate you looking at the NaN issue.

   Michael

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Wednesday, July 14, 2021 9:57 AM
To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org
Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal

On 7/14/21 9:44 AM, Michael Morrell wrote:
> Just curious.  What's the expected timeline to get these denormal patches in the tree?

Next development cycle, at minimum.  I need to fix the problems vs NaNs that you identified.


r~
Michael Morrell Jan. 12, 2022, 12:02 a.m. UTC | #11
Richard,

It's been 6 months so I thought I'd check in again.   Do you have an estimate of when this will go in?

   Michael

-----Original Message-----
From: Michael Morrell 
Sent: Wednesday, July 14, 2021 10:50 AM
To: 'Richard Henderson' <richard.henderson@linaro.org>; qemu-devel@nongnu.org
Subject: RE: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal

OK, thanks for the update.  I also appreciate you looking at the NaN issue.

   Michael

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Wednesday, July 14, 2021 9:57 AM
To: Michael Morrell <mmorrell@tachyum.com>; qemu-devel@nongnu.org
Subject: Re: [PATCH 03/11] softfloat: Introduce float_flag_inorm_denormal

On 7/14/21 9:44 AM, Michael Morrell wrote:
> Just curious.  What's the expected timeline to get these denormal patches in the tree?

Next development cycle, at minimum.  I need to fix the problems vs NaNs that you identified.


r~
diff mbox series

Patch

diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index e2d70ff556..174100e50e 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -143,13 +143,14 @@  typedef enum __attribute__((__packed__)) {
  */
 
 enum {
-    float_flag_invalid   =  1,
-    float_flag_divbyzero =  4,
-    float_flag_overflow  =  8,
-    float_flag_underflow = 16,
-    float_flag_inexact   = 32,
-    float_flag_iflush_denormal = 64,
-    float_flag_oflush_denormal = 128
+    float_flag_invalid         = 0x0001,
+    float_flag_divbyzero       = 0x0002,
+    float_flag_overflow        = 0x0004,
+    float_flag_underflow       = 0x0008,
+    float_flag_inexact         = 0x0010,
+    float_flag_inorm_denormal  = 0x0020,  /* denormal input, normalized */
+    float_flag_iflush_denormal = 0x0040,  /* denormal input, flushed to zero */
+    float_flag_oflush_denormal = 0x0080,  /* denormal result, flushed to zero */
 };
 
 /*
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index cb077cf111..e54cdb274d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -126,61 +126,23 @@  this code that are retained.
  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
  * and the result is < the minimum normal.
  */
-#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
+
+#define GEN_INPUT_FLUSH(name, soft_t)                                   \
     static inline void name(soft_t *a, float_status *s)                 \
     {                                                                   \
         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
-            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
-                                     soft_t ## _is_neg(*a));            \
-            float_raise(float_flag_iflush_denormal, s);                  \
+            if (s->flush_inputs_to_zero) {                              \
+                *a = soft_t ## _set_sign(0, soft_t ## _is_neg(*a));     \
+                float_raise(float_flag_iflush_denormal, s);             \
+            } else {                                                    \
+                float_raise(float_flag_inorm_denormal, s);              \
+            }                                                           \
         }                                                               \
     }
 
-GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
-GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
-#undef GEN_INPUT_FLUSH__NOCHECK
-
-#define GEN_INPUT_FLUSH1(name, soft_t)                  \
-    static inline void name(soft_t *a, float_status *s) \
-    {                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {         \
-            return;                                     \
-        }                                               \
-        soft_t ## _input_flush__nocheck(a, s);          \
-    }
-
-GEN_INPUT_FLUSH1(float32_input_flush1, float32)
-GEN_INPUT_FLUSH1(float64_input_flush1, float64)
-#undef GEN_INPUT_FLUSH1
-
-#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
-    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
-    {                                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {                         \
-            return;                                                     \
-        }                                                               \
-        soft_t ## _input_flush__nocheck(a, s);                          \
-        soft_t ## _input_flush__nocheck(b, s);                          \
-    }
-
-GEN_INPUT_FLUSH2(float32_input_flush2, float32)
-GEN_INPUT_FLUSH2(float64_input_flush2, float64)
-#undef GEN_INPUT_FLUSH2
-
-#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
-    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
-    {                                                                   \
-        if (likely(!s->flush_inputs_to_zero)) {                         \
-            return;                                                     \
-        }                                                               \
-        soft_t ## _input_flush__nocheck(a, s);                          \
-        soft_t ## _input_flush__nocheck(b, s);                          \
-        soft_t ## _input_flush__nocheck(c, s);                          \
-    }
-
-GEN_INPUT_FLUSH3(float32_input_flush3, float32)
-GEN_INPUT_FLUSH3(float64_input_flush3, float64)
-#undef GEN_INPUT_FLUSH3
+GEN_INPUT_FLUSH(float32_input_flush, float32)
+GEN_INPUT_FLUSH(float64_input_flush, float64)
+#undef GEN_INPUT_FLUSH
 
 /*
  * Choose whether to use fpclassify or float32/64_* primitives in the generated
@@ -353,7 +315,8 @@  float32_gen2(float32 xa, float32 xb, float_status *s,
         goto soft;
     }
 
-    float32_input_flush2(&ua.s, &ub.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
     if (unlikely(!pre(ua, ub))) {
         goto soft;
     }
@@ -384,7 +347,8 @@  float64_gen2(float64 xa, float64 xb, float_status *s,
         goto soft;
     }
 
-    float64_input_flush2(&ua.s, &ub.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
     if (unlikely(!pre(ua, ub))) {
         goto soft;
     }
@@ -2161,7 +2125,9 @@  float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
         goto soft;
     }
 
-    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
+    float32_input_flush(&uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
         goto soft;
     }
@@ -2232,7 +2198,9 @@  float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
         goto soft;
     }
 
-    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
+    float64_input_flush(&uc.s, s);
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
         goto soft;
     }
@@ -3988,7 +3956,8 @@  float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float32_input_flush2(&ua.s, &ub.s, s);
+    float32_input_flush(&ua.s, s);
+    float32_input_flush(&ub.s, s);
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater;
@@ -4038,7 +4007,8 @@  float64_hs_compare(float64 xa, float64 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float64_input_flush2(&ua.s, &ub.s, s);
+    float64_input_flush(&ua.s, s);
+    float64_input_flush(&ub.s, s);
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater;
@@ -4230,7 +4200,7 @@  float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
         goto soft;
     }
 
-    float32_input_flush1(&ua.s, s);
+    float32_input_flush(&ua.s, s);
     if (QEMU_HARDFLOAT_1F32_USE_FP) {
         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                        fpclassify(ua.h) == FP_ZERO) ||
@@ -4257,7 +4227,7 @@  float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
         goto soft;
     }
 
-    float64_input_flush1(&ua.s, s);
+    float64_input_flush(&ua.s, s);
     if (QEMU_HARDFLOAT_1F64_USE_FP) {
         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                        fpclassify(ua.h) == FP_ZERO) ||
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index 72e2b24539..16d4425419 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -119,6 +119,7 @@  static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
             int shift = frac_normalize(p);
             p->cls = float_class_normal;
             p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
+            float_raise(float_flag_inorm_denormal, status);
         }
     } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
         p->cls = float_class_normal;