diff mbox series

[v8,19/50] x86emul: support AVX512F floating-point conversion insns

Message ID 5C8B82BF020000780021F1C7@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich March 15, 2019, 10:47 a.m. UTC
VCVTPS2PD, sharing its main opcode with others, needs a "manual"
override of disp8scale.

The simd_size change for twobyte_table[0x5a] is benign to pre-existing
code, but allows decode_disp8scale() to work as is here.

Also correct the comment on an AVX counterpart.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: ea.type == OP_* -> ea.type != OP_*. Re-base.
v6: Re-base over changes earlier in the series.
v5: Re-base over changes earlier in the series.
v4: New.

Comments

Andrew Cooper May 21, 2019, 11:33 a.m. UTC | #1
On 15/03/2019 10:47, Jan Beulich wrote:
> @@ -9312,7 +9386,8 @@ x86_emulate(
>  
>          if ( ea.type == OP_MEM )
>          {
> -            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
> +            rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
> +                            (void *)mmvalp + first_byte, op_bytes, ctxt);
>              if ( rc != X86EMUL_OKAY )
>              {
>                  asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );

This hunk doesn't appear to fit with the rest of the patch, because it
isn't the first use of first_byte.

Have we been subtly broken before?

~Andrew
Jan Beulich May 21, 2019, 3:46 p.m. UTC | #2
>>> On 21.05.19 at 13:33, <andrew.cooper3@citrix.com> wrote:
> On 15/03/2019 10:47, Jan Beulich wrote:
>> @@ -9312,7 +9386,8 @@ x86_emulate(
>>  
>>          if ( ea.type == OP_MEM )
>>          {
>> -            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
>> +            rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
>> +                            (void *)mmvalp + first_byte, op_bytes, ctxt);
>>              if ( rc != X86EMUL_OKAY )
>>              {
>>                  asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );
> 
> This hunk doesn't appear to fit with the rest of the patch, because it
> isn't the first use of first_byte.
> 
> Have we been subtly broken before?

I don't think so, no, but I admit I'm not sure I understand what
you're saying above. The use of first_byte here is of course not
the first use - it gets set in the hunk further up. The AVX form of
VCVTPS2PH does not support fault suppression (as that's an
AVX512 feature), and hence no such adjustment was needed
here before.

Jan
Andrew Cooper May 23, 2019, 4:08 p.m. UTC | #3
On 21/05/2019 16:46, Jan Beulich wrote:
>>>> On 21.05.19 at 13:33, <andrew.cooper3@citrix.com> wrote:
>> On 15/03/2019 10:47, Jan Beulich wrote:
>>> @@ -9312,7 +9386,8 @@ x86_emulate(
>>>  
>>>          if ( ea.type == OP_MEM )
>>>          {
>>> -            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
>>> +            rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
>>> +                            (void *)mmvalp + first_byte, op_bytes, ctxt);
>>>              if ( rc != X86EMUL_OKAY )
>>>              {
>>>                  asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );
>> This hunk doesn't appear to fit with the rest of the patch, because it
>> isn't the first use of first_byte.
>>
>> Have we been subtly broken before?
> I don't think so, no, but I admit I'm not sure I understand what
> you're saying above. The use of first_byte here is of course not
> the first use - it gets set in the hunk further up. The AVX form of
> VCVTPS2PH does not support fault suppression (as that's an
> AVX512 feature), and hence no such adjustment was needed
> here before.

Ah - what I hadn't spotted was that this is the special case for
vcvtps2ph, so this change is fine in context.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -109,6 +109,12 @@  static const struct test avx512f_all[] =
     INSN_FP(cmp,             0f, c2),
     INSN(comisd,       66,   0f, 2f,    el,      q, el),
     INSN(comiss,         ,   0f, 2f,    el,      d, el),
+    INSN(cvtpd2ps,     66,   0f, 5a,    vl,      q, vl),
+    INSN(cvtph2ps,     66, 0f38, 13,    vl_2, d_nb, vl),
+    INSN(cvtps2pd,       ,   0f, 5a,    vl_2,    d, vl),
+    INSN(cvtps2ph,     66, 0f3a, 1d,    vl_2, d_nb, vl),
+    INSN(cvtsd2ss,     f2,   0f, 5a,    el,      q, el),
+    INSN(cvtss2sd,     f3,   0f, 5a,    el,      d, el),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
     INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -181,7 +181,9 @@  static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#  define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
 #  define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+#  define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #   define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -68,6 +68,7 @@  typedef short __attribute__((vector_size
 typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
 #if VEC_SIZE >= 8
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
 #endif
 
 #if ELEM_SIZE == 1
@@ -93,6 +94,7 @@  typedef char __attribute__((vector_size(
 typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
 typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
 # endif
 
 # if ELEM_COUNT >= 4
@@ -328,6 +330,13 @@  REN(pandn, , d);
 REN(por, , d);
 REN(pxor, , d);
 #  endif
+OVR(cvtpd2psx);
+OVR(cvtpd2psy);
+OVR(cvtph2ps);
+OVR(cvtps2pd);
+OVR(cvtps2ph);
+OVR(cvtsd2ss);
+OVR(cvtss2sd);
 OVR(movddup);
 OVR(movntdq);
 OVR(movntdqa);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3871,6 +3871,49 @@  int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vcvtph2ps 32(%ecx),%zmm7{%k4}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vcvtph2ps);
+        decl_insn(evex_vcvtps2ph);
+
+        asm volatile ( "vpternlogd $0x81, %%zmm7, %%zmm7, %%zmm7\n\t"
+                       "kmovw %1,%%k4\n"
+                       put_insn(evex_vcvtph2ps, "vcvtph2ps 32(%0), %%zmm7%{%%k4%}")
+                       :: "c" (NULL), "r" (0x3333) );
+
+        set_insn(evex_vcvtph2ps);
+        memset(res, 0xff, 128);
+        res[8] = 0x40003c00; /* (1.0, 2.0) */
+        res[10] = 0x44004200; /* (3.0, 4.0) */
+        res[12] = 0x3400b800; /* (-.5, .25) */
+        res[14] = 0xbc000000; /* (0.0, -1.) */
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "vmovups %%zmm7, %0" : "=m" (res[16]) );
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtph2ps) )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vcvtps2ph $0,%zmm3,64(%edx){%k4}...");
+        asm volatile ( "vmovups %0, %%zmm3\n"
+                       put_insn(evex_vcvtps2ph, "vcvtps2ph $0, %%zmm3, 128(%1)%{%%k4%}")
+                       :: "m" (res[16]), "d" (NULL) );
+
+        set_insn(evex_vcvtps2ph);
+        regs.edx = (unsigned long)res;
+        memset(res + 32, 0xcc, 32);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtps2ph) )
+            goto fail;
+        res[15] = res[13] = res[11] = res[9] = 0xcccccccc;
+        if ( memcmp(res + 8, res + 32, 32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -310,7 +310,8 @@  static const struct twobyte_table {
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
     [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
-    [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+    [0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -437,7 +438,7 @@  static const struct ext0f38_table {
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x13] = { .simd_size = simd_other, .two_op = 1 },
+    [0x13] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
@@ -541,7 +542,7 @@  static const struct ext0f3a_table {
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
     [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
-    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none, .d8s = 0 },
     [0x21] = { .simd_size = simd_other, .d8s = 2 },
@@ -3071,6 +3072,11 @@  x86_decode(
                 modrm_mod = 3;
                 break;
 
+            case 0x5a: /* vcvtps2pd needs special casing */
+                if ( disp8scale && !evex.pfx && !evex.brs )
+                    --disp8scale;
+                break;
+
             case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
                 if ( disp8scale == 2 && evex.pfx == vex_f3 )
                     disp8scale = 3;
@@ -5998,6 +6004,7 @@  x86_emulate(
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    avx512f_all_fp:
         generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
                                (ea.type != OP_REG && evex.brs &&
                                 (evex.pfx & VEX_PREFIX_SCALAR_MASK))),
@@ -6557,7 +6564,7 @@  x86_emulate(
         goto simd_zmm;
 
     CASE_SIMD_ALL_FP(, 0x0f, 0x5a):        /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
-    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} {x,y}mm/mem,{x,y}mm */
                                            /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
         op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
                          !!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
@@ -6566,6 +6573,12 @@  x86_emulate(
             goto simd_0f_sse2;
         goto simd_0f_avx;
 
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5a):   /* vcvtp{s,d}2p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+                                           /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm{k} */
+        op_bytes = 4 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) +
+                         evex.w);
+        goto avx512f_all_fp;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x5b):     /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x5b):       /* cvttps2dq xmm/mem,xmm */
@@ -8455,6 +8468,15 @@  x86_emulate(
         op_bytes = 8 << vex.l;
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 8 << evex.lr;
+        elem_bytes = 2;
+        goto simd_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
         generate_exception_if(!vex.l || vex.w, EXC_UD);
@@ -9283,27 +9305,79 @@  x86_emulate(
         goto avx512f_imm8_no_sae;
 
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,[xyz]mm,{x,y}mm/mem{k} */
     {
         uint32_t mxcsr;
 
-        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
-        host_and_vcpu_must_have(f16c);
         fail_if(!ops->write);
+        if ( evex_encoded() )
+        {
+            generate_exception_if((evex.w || evex.reg != 0xf || !evex.RX ||
+                                   (ea.type != OP_REG && (evex.z || evex.brs))),
+                                  EXC_UD);
+            host_and_vcpu_must_have(avx512f);
+            avx512_vlen_check(false);
+            opc = init_evex(stub);
+        }
+        else
+        {
+            generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+            host_and_vcpu_must_have(f16c);
+            opc = init_prefixes(stub);
+        }
+
+        op_bytes = 8 << evex.lr;
 
-        opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         if ( ea.type == OP_MEM )
         {
             /* Convert memory operand to (%rAX). */
             vex.b = 1;
+            evex.b = 1;
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        insn_bytes = PFX_BYTES + 3;
+        if ( evex_encoded() )
+        {
+            unsigned int full = 0;
+
+            insn_bytes = EVEX_PFX_BYTES + 3;
+            copy_EVEX(opc, evex);
+
+            if ( ea.type == OP_MEM && evex.opmsk )
+            {
+                full = 0xffff >> (16 - op_bytes / 2);
+                op_mask &= full;
+                if ( !op_mask )
+                    goto complete_insn;
+
+                first_byte = __builtin_ctz(op_mask);
+                op_mask >>= first_byte;
+                full >>= first_byte;
+                first_byte <<= 1;
+                op_bytes = (32 - __builtin_clz(op_mask)) << 1;
+
+                /*
+                 * We may need to read (parts of) the memory operand for the
+                 * purpose of merging in order to avoid splitting the write
+                 * below into multiple ones.
+                 */
+                if ( op_mask != full &&
+                     (rc = ops->read(ea.mem.seg,
+                                     truncate_ea(ea.mem.off + first_byte),
+                                     (void *)mmvalp + first_byte, op_bytes,
+                                     ctxt)) != X86EMUL_OKAY )
+                    goto done;
+            }
+        }
+        else
+        {
+            insn_bytes = PFX_BYTES + 3;
+            copy_VEX(opc, vex);
+        }
         opc[3] = 0xc3;
 
-        copy_VEX(opc, vex);
         /* Latch MXCSR - we may need to restore it below. */
         invoke_stub("stmxcsr %[mxcsr]", "",
                     "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
@@ -9312,7 +9386,8 @@  x86_emulate(
 
         if ( ea.type == OP_MEM )
         {
-            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+            rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
+                            (void *)mmvalp + first_byte, op_bytes, ctxt);
             if ( rc != X86EMUL_OKAY )
             {
                 asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );