[v8,28/50] x86emul: support AVX512F floating point manipulation insns

Message ID: 5C8B84C5020000780021F242@prv1-mh.provo.novell.com
State: New, archived
Series: x86emul: remaining AVX512 support

Commit Message

Jan Beulich March 15, 2019, 10:56 a.m. UTC
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Fix vector length check for scalar insns. ea.type == OP_* ->
    ea.type != OP_*. Re-base.
v5: New.

Comments

Andrew Cooper May 29, 2019, 12:51 p.m. UTC | #1
On 15/03/2019 10:56, Jan Beulich wrote:
> @@ -9681,6 +9696,21 @@ x86_emulate(
>          op_bytes = 4;
>          goto simd_imm8_zmm;
>  
> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x26): /* vgetmantp{s,d} $imm8,[xyz]mm/mem,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x54): /* vfixupimmp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
> +        host_and_vcpu_must_have(avx512f);
> +        if ( ea.type != OP_REG || !evex.brs )
> +            avx512_vlen_check(false);
> +        goto simd_imm8_zmm;
> +
> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x27): /* vgetmants{s,d} $imm8,xmm/mem,xmm,xmm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x55): /* vfixupimms{s,d} $imm8,xmm/mem,xmm,xmm{k} */
> +        host_and_vcpu_must_have(avx512f);
> +        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);

Why the ea.type != OP_REG restriction?  These four instructions do take
memory operands.

~Andrew

> +        if ( !evex.brs )
> +            avx512_vlen_check(true);
> +        goto simd_imm8_zmm;
> +
>      case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
>      case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
>          if ( !vex.w )
Jan Beulich May 29, 2019, 1:15 p.m. UTC | #2
>>> On 29.05.19 at 14:51, <andrew.cooper3@citrix.com> wrote:
> On 15/03/2019 10:56, Jan Beulich wrote:
>> @@ -9681,6 +9696,21 @@ x86_emulate(
>>          op_bytes = 4;
>>          goto simd_imm8_zmm;
>>  
>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x26): /* vgetmantp{s,d} $imm8,[xyz]mm/mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x54): /* vfixupimmp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>> +        host_and_vcpu_must_have(avx512f);
>> +        if ( ea.type != OP_REG || !evex.brs )
>> +            avx512_vlen_check(false);
>> +        goto simd_imm8_zmm;
>> +
>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x27): /* vgetmants{s,d} $imm8,xmm/mem,xmm,xmm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x55): /* vfixupimms{s,d} $imm8,xmm/mem,xmm,xmm{k} */
>> +        host_and_vcpu_must_have(avx512f);
>> +        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
> 
> Why the ea.type != OP_REG restriction?  These four instructions do take
> memory operands.

Did you perhaps read the && as ||? Scalar operations (not just the
ones here) don't support broadcast, which would require a memory
operand, but may support embedded rounding (or, as here, just SAE),
which requires a register operand. The exact same construct exists
e.g. at the simd_zmm_scalar_sae label (and this patch also adds to
the block of case labels ahead of it).

Jan
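
In code terms, the distinction Jan describes is roughly the following
- an illustrative sketch with invented names, not the emulator's
actual helper:

    #include <stdbool.h>

    /*
     * EVEX.b ("brs") is overloaded: with a memory operand it requests
     * broadcast; with a register operand it requests embedded rounding
     * or SAE.  Scalar insns have no broadcast form, so memory + EVEX.b
     * must raise #UD, whereas register + EVEX.b selects SAE and makes
     * the vector-length (EVEX.L'L) check unnecessary.
     */
    typedef enum { ACT_OK, ACT_UD, ACT_VLEN_CHECK } scalar_evex_action;

    static scalar_evex_action classify_scalar(bool mem_operand, bool evex_b)
    {
        if ( mem_operand && evex_b )
            return ACT_UD;          /* would-be broadcast: undefined */
        if ( !evex_b )
            return ACT_VLEN_CHECK;  /* no SAE: L'L still needs validating */
        return ACT_OK;              /* register + SAE: skip the check */
    }

This is exactly the shape of the checks in the hunk quoted above: the
packed forms skip the length check only for register + EVEX.b, while
the scalar forms first reject memory + EVEX.b outright.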
Andrew Cooper June 10, 2019, 2:01 p.m. UTC | #3
On 29/05/2019 14:15, Jan Beulich wrote:
>>>> On 29.05.19 at 14:51, <andrew.cooper3@citrix.com> wrote:
>> On 15/03/2019 10:56, Jan Beulich wrote:
>>> @@ -9681,6 +9696,21 @@ x86_emulate(
>>>          op_bytes = 4;
>>>          goto simd_imm8_zmm;
>>>  
>>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x26): /* vgetmantp{s,d} $imm8,[xyz]mm/mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x54): /* vfixupimmp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>>> +        host_and_vcpu_must_have(avx512f);
>>> +        if ( ea.type != OP_REG || !evex.brs )
>>> +            avx512_vlen_check(false);
>>> +        goto simd_imm8_zmm;
>>> +
>>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x27): /* vgetmants{s,d} $imm8,xmm/mem,xmm,xmm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x55): /* vfixupimms{s,d} $imm8,xmm/mem,xmm,xmm{k} */
>>> +        host_and_vcpu_must_have(avx512f);
>>> +        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
>> Why the ea.type != OP_REG restriction?  These four instructions do take
>> memory operands.
> Did you perhaps read the && as || ?

Oops - I did.

Sorry - I blame the jetlag.

~Andrew
Andrew Cooper June 10, 2019, 2:03 p.m. UTC | #4
On 15/03/2019 10:56, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
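
As a note on the test additions in the patch below: the new simd.c
loop checks a decomposition identity between the three insns. A
scalar model of that identity - a hedged sketch assuming the
harness's usual src[i] = i + 1 element initialisation (an assumption
about simd.c's main body, which this hunk doesn't show), with libm
standing in for the hardware behaviour:

    #include <math.h>
    #include <stdio.h>

    /*
     * Scalar models (illustrative only):
     *   vgetexp(x)          -> floor(log2(|x|))
     *   vgetmant(x, imm 0)  -> |x| / 2^floor(log2(|x|)), i.e. in [1, 2)
     *   vscalef(x, y)       -> x * 2^floor(y)
     * hence scalef(getmant(x), getexp(x)) == x for finite x > 0, which
     * is what the loop (and the subsequent scale() check) verifies.
     */
    static double m_getexp(double x) { return floor(log2(fabs(x))); }
    static double m_getmant(double x) { return fabs(x) / exp2(m_getexp(x)); }
    static double m_scalef(double x, double y) { return x * exp2(floor(y)); }

    int main(void)
    {
        for ( unsigned int i = 0; i < 16; ++i )
        {
            double s = i + 1;

            printf("%2u: exp=%g mant=%g roundtrip=%g\n", i,
                   m_getexp(s), m_getmant(s),
                   m_scalef(m_getmant(s), m_getexp(s)));  /* == s */
        }
        return 0;
    }

The exponent sequence 0, 1, 1, 2, 2, 2, 2, 3, ... is what the test's
j counter reproduces (incrementing whenever i + 2 is a power of two),
and the mantissa is exactly 1 whenever i + 1 is a power of two.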

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -140,6 +140,8 @@  static const struct test avx512f_all[] =
     INSN(cvtusi2sd,    f2,   0f, 7b,    el,   dq64, el),
     INSN(cvtusi2ss,    f3,   0f, 7b,    el,   dq64, el),
     INSN_FP(div,             0f, 5e),
+    INSN(fixupimm,     66, 0f3a, 54,    vl,     sd, vl),
+    INSN(fixupimm,     66, 0f3a, 55,    el,     sd, el),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
     INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
     INSN(fmadd213,     66, 0f38, a8,    vl,     sd, vl),
@@ -170,6 +172,10 @@  static const struct test avx512f_all[] =
     INSN(fnmsub213,    66, 0f38, af,    el,     sd, el),
     INSN(fnmsub231,    66, 0f38, be,    vl,     sd, vl),
     INSN(fnmsub231,    66, 0f38, bf,    el,     sd, el),
+    INSN(getexp,       66, 0f38, 42,    vl,     sd, vl),
+    INSN(getexp,       66, 0f38, 43,    el,     sd, el),
+    INSN(getmant,      66, 0f3a, 26,    vl,     sd, vl),
+    INSN(getmant,      66, 0f3a, 27,    el,     sd, el),
     INSN_FP(max,             0f, 5f),
     INSN_FP(min,             0f, 5d),
     INSN_SFP(mov,            0f, 10),
@@ -286,6 +292,8 @@  static const struct test avx512f_all[] =
     INSN(rndscaless,   66, 0f3a, 0a,    el,      d, el),
     INSN(rsqrt14,      66, 0f38, 4e,    vl,     sd, vl),
     INSN(rsqrt14,      66, 0f38, 4f,    el,     sd, el),
+    INSN(scalef,       66, 0f38, 2c,    vl,     sd, vl),
+    INSN(scalef,       66, 0f38, 2d,    el,     sd, el),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -174,6 +174,11 @@  static inline bool _to_bool(byte_vec_t b
     asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
     (vec_t){ r_[0] }; \
 })
+# define scalar_2op(x, y, op) ({ \
+    typeof((x)[0]) __attribute__((vector_size(16))) r_ = { x[0] }; \
+    asm ( op : [out] "=&x" (r_) : [in1] "[out]" (r_), [in2] "m" (y) ); \
+    (vec_t){ r_[0] }; \
+})
 #endif
 
 #if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__)
@@ -210,6 +215,8 @@  static inline vec_t movlhps(vec_t x, vec
 })
 #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
 # if FLOAT_SIZE == 4
+#  define getexp(x) scalar_1op(x, "vgetexpss %[in], %[out], %[out]")
+#  define getmant(x) scalar_1op(x, "vgetmantss $0, %[in], %[out], %[out]")
 #  ifdef __AVX512ER__
 #   define recip(x) scalar_1op(x, "vrcp28ss %[in], %[out], %[out]")
 #   define rsqrt(x) scalar_1op(x, "vrsqrt28ss %[in], %[out], %[out]")
@@ -217,9 +224,12 @@  static inline vec_t movlhps(vec_t x, vec
 #   define recip(x) scalar_1op(x, "vrcp14ss %[in], %[out], %[out]")
 #   define rsqrt(x) scalar_1op(x, "vrsqrt14ss %[in], %[out], %[out]")
 #  endif
+#  define scale(x, y) scalar_2op(x, y, "vscalefss %[in2], %[in1], %[out]")
 #  define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]")
 # elif FLOAT_SIZE == 8
+#  define getexp(x) scalar_1op(x, "vgetexpsd %[in], %[out], %[out]")
+#  define getmant(x) scalar_1op(x, "vgetmantsd $0, %[in], %[out], %[out]")
 #  ifdef __AVX512ER__
 #   define recip(x) scalar_1op(x, "vrcp28sd %[in], %[out], %[out]")
 #   define rsqrt(x) scalar_1op(x, "vrsqrt28sd %[in], %[out], %[out]")
@@ -227,6 +237,7 @@  static inline vec_t movlhps(vec_t x, vec
 #   define recip(x) scalar_1op(x, "vrcp14sd %[in], %[out], %[out]")
 #   define rsqrt(x) scalar_1op(x, "vrsqrt14sd %[in], %[out], %[out]")
 #  endif
+#  define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
 # endif
@@ -274,9 +285,12 @@  static inline vec_t movlhps(vec_t x, vec
 #   define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
 #   define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
 #  endif
+#  define getexp(x) BR(getexpps, _mask, x, undef(), ~0)
+#  define getmant(x) BR(getmantps, _mask, x, 0, undef(), ~0)
 #  define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#  define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
 #  if VEC_SIZE == 64 && defined(__AVX512ER__)
 #   define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
 #   define rsqrt(x) BR(rsqrt28ps, _mask, x, undef(), ~0)
@@ -336,9 +350,12 @@  static inline vec_t movlhps(vec_t x, vec
 #   define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
 #   define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
 #  endif
+#  define getexp(x) BR(getexppd, _mask, x, undef(), ~0)
+#  define getmant(x) BR(getmantpd, _mask, x, 0, undef(), ~0)
 #  define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#  define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
 #  if VEC_SIZE == 64 && defined(__AVX512ER__)
 #   define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
 #   define rsqrt(x) BR(rsqrt28pd, _mask, x, undef(), ~0)
@@ -1766,6 +1783,28 @@  int simd_test(void)
 # endif
 #endif
 
+#if defined(getexp) && defined(getmant)
+    touch(src);
+    x = getmant(src);
+    touch(src);
+    y = getexp(src);
+    touch(src);
+    for ( j = i = 0; i < ELEM_COUNT; ++i )
+    {
+        if ( y[i] != j ) return __LINE__;
+
+        if ( !((i + 1) & (i + 2)) )
+            ++j;
+
+        if ( !(i & (i + 1)) && x[i] != 1 ) return __LINE__;
+    }
+# ifdef scale
+    touch(y);
+    z = scale(x, y);
+    if ( !eq(src, z) ) return __LINE__;
+# endif
+#endif
+
 #if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
     (defined(__AVX512F__) && defined(FLOAT_SIZE))
     return -fma_test();
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3924,6 +3924,44 @@  int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vfixupimmpd $0,8(%edx){1to8},%zmm3,%zmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vfixupimmpd);
+        static const struct {
+            double d[4];
+        }
+        src = { { -1, 0, 1, 2 } },
+        dst = { { 3, 4, 5, 6 } },
+        out = { { .5, -1, 90, 2 } };
+
+        asm volatile ( "vbroadcastf64x4 %1, %%zmm3\n\t"
+                       "vbroadcastf64x4 %2, %%zmm4\n"
+                       put_insn(vfixupimmpd,
+                                "vfixupimmpd $0, 8(%0)%{1to8%}, %%zmm3, %%zmm4")
+                       :: "d" (NULL), "m" (src), "m" (dst) );
+
+        set_insn(vfixupimmpd);
+        /*
+         * Nibble (token) mapping (unused ones simply set to zero):
+         * 2 (ZERO)    ->  -1 (0x9)
+         * 3 (POS_ONE) ->  90 (0xc)
+         * 6 (NEG)     -> 1/2 (0xb)
+         * 7 (POS)     -> src (0x1)
+         */
+        res[2] = 0x1b00c900;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "vmovupd %%zmm4, %0" : "=m" (res[0]) );
+        if ( rc != X86EMUL_OKAY || !check_eip(vfixupimmpd) ||
+             memcmp(res + 0, &out, sizeof(out)) ||
+             memcmp(res + 8, &out, sizeof(out)) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -459,7 +459,8 @@  static const struct ext0f38_table {
     [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
+    [0x2c] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x2d] = { .simd_size = simd_packed_fp, .d8s = d8s_dq },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
@@ -470,6 +471,8 @@  static const struct ext0f38_table {
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x42] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0x43] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x4c] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
@@ -563,6 +566,8 @@  static const struct ext0f3a_table {
     [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 },
     [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x26] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+    [0x27] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128, .d8s = 4 },
     [0x3a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
@@ -577,6 +582,8 @@  static const struct ext0f3a_table {
     [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x54] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x55] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -2684,6 +2691,10 @@  x86_decode_0f38(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0, 0x2d): /* vscalefs{s,d} */
+        state->simd_size = simd_scalar_vexw;
+        break;
+
     case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
     case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */
     case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */
@@ -9095,6 +9106,8 @@  x86_emulate(
         host_and_vcpu_must_have(fma);
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -9118,6 +9131,8 @@  x86_emulate(
             avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
@@ -9681,6 +9696,21 @@  x86_emulate(
         op_bytes = 4;
         goto simd_imm8_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x26): /* vgetmantp{s,d} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x54): /* vfixupimmp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x27): /* vgetmants{s,d} $imm8,xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x55): /* vfixupimms{s,d} $imm8,xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
+        if ( !evex.brs )
+            avx512_vlen_check(true);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
         if ( !vex.w )
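
For readers decoding the vfixupimmpd test above: the imm32 stored at
res[2] is a per-token-class nibble table. The classification is
applied to the register source (%zmm3 here), the broadcast memory
operand supplies the table, and the selected nibble is a response
code. A hedged scalar model covering only the classes and response
codes this test exercises (encodings per the SDM):

    #include <stdio.h>

    /*
     * Reduced model of vfixupimmpd's table lookup.  The real insn
     * classifies into eight token classes (QNaN, SNaN, infinities,
     * ...); the test's inputs only hit ZERO (2), POS_ONE (3),
     * NEG (6) and POS (7).
     */
    static double fixup(double dest, double src1, unsigned int tbl)
    {
        unsigned int class = src1 < 0 ? 6 : src1 == 0 ? 2
                                          : src1 == 1 ? 3 : 7;

        switch ( (tbl >> (class * 4)) & 0xf )
        {
        case 0x1: return src1;  /* pass the classified source through */
        case 0x9: return -1.0;
        case 0xb: return 0.5;
        case 0xc: return 90.0;
        default:  return dest;  /* 0x0 and codes not modelled here */
        }
    }

    int main(void)
    {
        static const double src[4] = { -1, 0, 1, 2 }, dst[4] = { 3, 4, 5, 6 };

        for ( unsigned int i = 0; i < 4; ++i )
            printf("%g -> %g\n", src[i], fixup(dst[i], src[i], 0x1b00c900));
        /* -1 -> 0.5, 0 -> -1, 1 -> 90, 2 -> 2, matching 'out' above */
        return 0;
    }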