@@ -109,6 +109,12 @@ static const struct test avx512f_all[] =
INSN_FP(cmp, 0f, c2),
INSN(comisd, 66, 0f, 2f, el, q, el),
INSN(comiss, , 0f, 2f, el, d, el),
+ INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl),
+ INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl),
+ INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl),
+ INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl),
+ INSN(cvtsd2ss, f2, 0f, 5a, el, q, el),
+ INSN(cvtss2sd, f3, 0f, 5a, el, d, el),
INSN_FP(div, 0f, 5e),
INSN(fmadd132, 66, 0f38, 98, vl, sd, vl),
INSN(fmadd132, 66, 0f38, 99, el, sd, el),
@@ -181,7 +181,9 @@ static inline bool _to_bool(byte_vec_t b
# define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+# define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
# define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+# define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
# if VEC_SIZE == 16
# define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
@@ -68,6 +68,7 @@ typedef short __attribute__((vector_size
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
#endif
#if ELEM_SIZE == 1
@@ -93,6 +94,7 @@ typedef char __attribute__((vector_size(
typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
# endif
# if ELEM_COUNT >= 4
@@ -328,6 +330,13 @@ REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
# endif
+OVR(cvtpd2psx);
+OVR(cvtpd2psy);
+OVR(cvtph2ps);
+OVR(cvtps2pd);
+OVR(cvtps2ph);
+OVR(cvtsd2ss);
+OVR(cvtss2sd);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
@@ -3871,6 +3871,49 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vcvtph2ps 32(%ecx),%zmm7{%k4}...");
+ if ( stack_exec && cpu_has_avx512f )
+ {
+ decl_insn(evex_vcvtph2ps);
+ decl_insn(evex_vcvtps2ph);
+
+ asm volatile ( "vpternlogd $0x81, %%zmm7, %%zmm7, %%zmm7\n\t"
+ "kmovw %1,%%k4\n"
+ put_insn(evex_vcvtph2ps, "vcvtph2ps 32(%0), %%zmm7%{%%k4%}")
+ :: "c" (NULL), "r" (0x3333) );
+
+ set_insn(evex_vcvtph2ps);
+ memset(res, 0xff, 128);
+ res[8] = 0x40003c00; /* (1.0, 2.0) */
+ res[10] = 0x44004200; /* (3.0, 4.0) */
+ res[12] = 0x3400b800; /* (-.5, .25) */
+ res[14] = 0xbc000000; /* (0.0, -1.) */
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ asm volatile ( "vmovups %%zmm7, %0" : "=m" (res[16]) );
+ if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtph2ps) )
+ goto fail;
+ printf("okay\n");
+
+ printf("%-40s", "Testing vcvtps2ph $0,%zmm3,64(%edx){%k4}...");
+ asm volatile ( "vmovups %0, %%zmm3\n"
+ put_insn(evex_vcvtps2ph, "vcvtps2ph $0, %%zmm3, 128(%1)%{%%k4%}")
+ :: "m" (res[16]), "d" (NULL) );
+
+ set_insn(evex_vcvtps2ph);
+ regs.edx = (unsigned long)res;
+ memset(res + 32, 0xcc, 32);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtps2ph) )
+ goto fail;
+ res[15] = res[13] = res[11] = res[9] = 0xcccccccc;
+ if ( memcmp(res + 8, res + 32, 32) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#undef decl_insn
#undef put_insn
#undef set_insn
@@ -310,7 +310,8 @@ static const struct twobyte_table {
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
[0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
- [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+ [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
[0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -437,7 +438,7 @@ static const struct ext0f38_table {
[0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
- [0x13] = { .simd_size = simd_other, .two_op = 1 },
+ [0x13] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
@@ -541,7 +542,7 @@ static const struct ext0f3a_table {
[0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
[0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
[0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
- [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+ [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x20] = { .simd_size = simd_none, .d8s = 0 },
[0x21] = { .simd_size = simd_other, .d8s = 2 },
@@ -3071,6 +3072,11 @@ x86_decode(
modrm_mod = 3;
break;
+ case 0x5a: /* vcvtps2pd needs special casing */
+ if ( disp8scale && !evex.pfx && !evex.brs )
+ --disp8scale;
+ break;
+
case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
if ( disp8scale == 2 && evex.pfx == vex_f3 )
disp8scale = 3;
@@ -5998,6 +6004,7 @@ x86_emulate(
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ avx512f_all_fp:
generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
(ea.type != OP_REG && evex.brs &&
(evex.pfx & VEX_PREFIX_SCALAR_MASK))),
@@ -6557,7 +6564,7 @@ x86_emulate(
goto simd_zmm;
CASE_SIMD_ALL_FP(, 0x0f, 0x5a): /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
- CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} {x,y}mm/mem,{x,y}mm */
/* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
!!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
@@ -6566,6 +6573,12 @@ x86_emulate(
goto simd_0f_sse2;
goto simd_0f_avx;
+ CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+ /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm{k} */
+ op_bytes = 4 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) +
+ evex.w);
+ goto avx512f_all_fp;
+
CASE_SIMD_PACKED_FP(, 0x0f, 0x5b): /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
case X86EMUL_OPC_F3(0x0f, 0x5b): /* cvttps2dq xmm/mem,xmm */
@@ -8455,6 +8468,15 @@ x86_emulate(
op_bytes = 8 << vex.l;
goto simd_0f_ymm;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+ generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ if ( !evex.brs )
+ avx512_vlen_check(false);
+ op_bytes = 8 << evex.lr;
+ elem_bytes = 2;
+ goto simd_zmm;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
generate_exception_if(!vex.l || vex.w, EXC_UD);
@@ -9283,27 +9305,79 @@ x86_emulate(
goto avx512f_imm8_no_sae;
case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,[xyz]mm,{x,y}mm/mem{k} */
{
uint32_t mxcsr;
- generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
- host_and_vcpu_must_have(f16c);
fail_if(!ops->write);
+ if ( evex_encoded() )
+ {
+ generate_exception_if((evex.w || evex.reg != 0xf || !evex.RX ||
+ (ea.type != OP_REG && (evex.z || evex.brs))),
+ EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ avx512_vlen_check(false);
+ opc = init_evex(stub);
+ }
+ else
+ {
+ generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(f16c);
+ opc = init_prefixes(stub);
+ }
+
+ op_bytes = 8 << evex.lr;
- opc = init_prefixes(stub);
opc[0] = b;
opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* Convert memory operand to (%rAX). */
vex.b = 1;
+ evex.b = 1;
opc[1] &= 0x38;
}
opc[2] = imm1;
- insn_bytes = PFX_BYTES + 3;
+ if ( evex_encoded() )
+ {
+ unsigned int full = 0;
+
+ insn_bytes = EVEX_PFX_BYTES + 3;
+ copy_EVEX(opc, evex);
+
+ if ( ea.type == OP_MEM && evex.opmsk )
+ {
+ full = 0xffff >> (16 - op_bytes / 2);
+ op_mask &= full;
+ if ( !op_mask )
+ goto complete_insn;
+
+ first_byte = __builtin_ctz(op_mask);
+ op_mask >>= first_byte;
+ full >>= first_byte;
+ first_byte <<= 1;
+ op_bytes = (32 - __builtin_clz(op_mask)) << 1;
+
+ /*
+ * We may need to read (parts of) the memory operand for the
+ * purpose of merging in order to avoid splitting the write
+ * below into multiple ones.
+ */
+ if ( op_mask != full &&
+ (rc = ops->read(ea.mem.seg,
+ truncate_ea(ea.mem.off + first_byte),
+ (void *)mmvalp + first_byte, op_bytes,
+ ctxt)) != X86EMUL_OKAY )
+ goto done;
+ }
+ }
+ else
+ {
+ insn_bytes = PFX_BYTES + 3;
+ copy_VEX(opc, vex);
+ }
opc[3] = 0xc3;
- copy_VEX(opc, vex);
/* Latch MXCSR - we may need to restore it below. */
invoke_stub("stmxcsr %[mxcsr]", "",
"=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
@@ -9312,7 +9386,8 @@ x86_emulate(
if ( ea.type == OP_MEM )
{
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+ rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
+ (void *)mmvalp + first_byte, op_bytes, ctxt);
if ( rc != X86EMUL_OKAY )
{
asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );
VCVTPS2PD, sharing its main opcode with others, needs a "manual"
override of disp8scale. The simd_size change for twobyte_table[0x5a] is
benign to pre-existing code, but allows decode_disp8scale() to work as
is here.

Also correct the comment on an AVX counterpart.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: ea.type == OP_* -> ea.type != OP_*. Re-base.
v6: Re-base over changes earlier in the series.
v5: Re-base over changes earlier in the series.
v4: New.
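
For reviewers who want to see the Disp8*N effect in isolation, below is a
minimal stand-alone sketch, not emulator code: the function name
cvt_disp8scale and its parameters are made up for illustration, and it only
covers the packed forms sharing opcode 0x5a. The d8s_vl table entry yields
the scale for a full-width vector operand; the new special case in
x86_decode() halves that for a non-broadcast, non-prefixed vcvtps2pd, which
reads only half a vector.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: compute the Disp8 shift count for the packed
 * conversions sharing opcode 0f 5a.
 *   evex_lr: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit vector length
 *   pfx_66:  1 for vcvtpd2ps, 0 for vcvtps2pd
 *   evex_brs: embedded broadcast active
 */
static unsigned int cvt_disp8scale(unsigned int evex_lr, int pfx_66,
                                   int evex_brs)
{
    unsigned int scale = 4 + evex_lr;   /* d8s_vl: 16/32/64-byte vector */

    if ( evex_brs )
        return pfx_66 ? 3 : 2;          /* broadcast: scale by element size */
    if ( !pfx_66 )
        --scale;                        /* vcvtps2pd reads only half a vector */
    return scale;
}

int main(void)
{
    int8_t disp8 = 1;

    /* 512-bit vcvtps2pd (no 66 prefix): 32-byte memory operand => +32. */
    printf("vcvtps2pd zmm: %d\n", disp8 << cvt_disp8scale(2, 0, 0));
    /* 512-bit vcvtpd2ps (66 prefix): full 64-byte memory operand => +64. */
    printf("vcvtpd2ps zmm: %d\n", disp8 << cvt_disp8scale(2, 1, 0));
    return 0;
}

Built and run, this prints 32 for a 512-bit vcvtps2pd with Disp8 = 1,
whereas the vcvtpd2ps form sharing the opcode keeps the full 64-byte
scaling - which is the asymmetry the --disp8scale adjustment expresses.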