@@ -193,6 +193,8 @@ static const struct test avx512f_all[] =
INSN_PFP_NB(movu, 0f, 10),
INSN_PFP_NB(movu, 0f, 11),
INSN_FP(mul, 0f, 59),
+ INSN(pabsd, 66, 0f38, 1e, vl, d, vl),
+ INSN(pabsq, 66, 0f38, 1f, vl, q, vl),
INSN(paddd, 66, 0f, fe, vl, d, vl),
INSN(paddq, 66, 0f, d4, vl, q, vl),
INSN(pand, 66, 0f, db, vl, dq, vl),
@@ -276,6 +278,10 @@ static const struct test avx512f_all[] =
INSN(punpckldq, 66, 0f, 62, vl, d, vl),
INSN(punpcklqdq, 66, 0f, 6c, vl, q, vl),
INSN(pxor, 66, 0f, ef, vl, dq, vl),
+ INSN(rndscalepd, 66, 0f3a, 09, vl, q, vl),
+ INSN(rndscaleps, 66, 0f3a, 08, vl, d, vl),
+ INSN(rndscalesd, 66, 0f3a, 0b, el, q, el),
+ INSN(rndscaless, 66, 0f3a, 0a, el, d, el),
INSN_PFP(shuf, 0f, c6),
INSN_FP(sqrt, 0f, 51),
INSN_FP(sub, 0f, 5c),
@@ -336,6 +342,8 @@ static const struct test avx512bw_all[]
INSN(movdqu8, f2, 0f, 7f, vl, b, vl),
INSN(movdqu16, f2, 0f, 6f, vl, w, vl),
INSN(movdqu16, f2, 0f, 7f, vl, w, vl),
+ INSN(pabsb, 66, 0f38, 1c, vl, b, vl),
+ INSN(pabsw, 66, 0f38, 1d, vl, w, vl),
INSN(packssdw, 66, 0f, 6b, vl, d_nb, vl),
INSN(packsswb, 66, 0f, 63, vl, w, vl),
INSN(packusdw, 66, 0f38, 2b, vl, d_nb, vl),
@@ -211,8 +211,10 @@ static inline vec_t movlhps(vec_t x, vec
#elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
# if FLOAT_SIZE == 4
# define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
+# define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]") /* imm8 per Intel SDM: bits 1:0 = 11 round toward zero, bit 3 = suppress precision exception, scale (bits 7:4) = 0 */
# elif FLOAT_SIZE == 8
# define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
+# define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]") /* same imm8 as the single precision form: truncate, precision exception suppressed */
# endif
#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
@@ -263,6 +265,7 @@ static inline vec_t movlhps(vec_t x, vec
# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
# define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
# define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+# define trunc(x) BR(rndscaleps_, _mask, x, 0b1011, undef(), ~0)
# define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
# if VEC_SIZE == 16
# define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
@@ -316,6 +319,7 @@ static inline vec_t movlhps(vec_t x, vec
# define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
# define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
+# define trunc(x) BR(rndscalepd_, _mask, x, 0b1011, undef(), ~0)
# if VEC_SIZE == 16
# define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
# define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
@@ -548,6 +552,7 @@ static inline vec_t movlhps(vec_t x, vec
# endif
# endif
# if INT_SIZE == 4
+# define abs(x) B(pabsd, _mask, x, undef(), ~0)
# define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
# define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
# define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
@@ -558,6 +563,7 @@ static inline vec_t movlhps(vec_t x, vec
# define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
# define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
# elif INT_SIZE == 8
+# define abs(x) ((vec_t)B(pabsq, _mask, (vdi_t)(x), (vdi_t)undef(), ~0))
# define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# elif UINT_SIZE == 8
@@ -625,6 +631,7 @@ static inline vec_t movlhps(vec_t x, vec
# define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
# endif
# if INT_SIZE == 1
+# define abs(x) ((vec_t)B(pabsb, _mask, (vqi_t)(x), (vqi_t)undef(), ~0))
# define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
# define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
# define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
@@ -637,6 +644,7 @@ static inline vec_t movlhps(vec_t x, vec
# define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
# define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
# elif INT_SIZE == 2
+# define abs(x) B(pabsw, _mask, x, undef(), ~0)
# define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
# define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
# define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
@@ -948,19 +956,11 @@ static inline vec_t movlhps(vec_t x, vec
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
-# ifdef __SSE4_1__
+# if defined(__SSE4_1__) && !defined(__AVX512F__)
# if FLOAT_SIZE == 4
-# define trunc(x) ({ \
- float __attribute__((vector_size(16))) r_; \
- asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
- (vec_t){ r_[0] }; \
-})
+# define trunc(x) scalar_1op(x, "roundss $0b1011, %[in], %[out]")
# elif FLOAT_SIZE == 8
-# define trunc(x) ({ \
- double __attribute__((vector_size(16))) r_; \
- asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
- (vec_t){ r_[0] }; \
-})
+# define trunc(x) scalar_1op(x, "roundsd $0b1011, %[in], %[out]")
# endif
# endif
#endif
@@ -184,6 +184,8 @@ DECL_OCTET(half);
# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
+# define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
+# define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -245,6 +247,7 @@ OVR_INT(broadcast);
OVR_SFP(broadcast);
OVR_SFP(comi);
OVR_VFP(cvtdq2);
+OVR_INT(abs);
OVR_FP(add);
OVR_INT(add);
OVR_BW(adds);
@@ -446,7 +446,7 @@ static const struct ext0f38_table {
[0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
[0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
[0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
- [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x1c ... 0x1f] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
[0x20] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
[0x21] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
[0x22] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
@@ -531,8 +531,8 @@ static const struct ext0f3a_table {
[0x02] = { .simd_size = simd_packed_int },
[0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x06] = { .simd_size = simd_packed_fp },
- [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
- [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
+ [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
+ [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc, .d8s = d8s_dq },
[0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
[0x0e ... 0x0f] = { .simd_size = simd_packed_int },
[0x14] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 0 },
@@ -6917,6 +6917,8 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xf9): /* vpsubw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xfc): /* vpaddb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xfd): /* vpaddw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1c): /* vpabsb [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1d): /* vpabsw [xyz]mm/mem,[xyz]mm{k} */
host_and_vcpu_must_have(avx512bw);
generate_exception_if(evex.brs, EXC_UD);
elem_bytes = 1 << (b & 1);
@@ -8303,6 +8305,8 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1e): /* vpabsd [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1f): /* vpabsq [xyz]mm/mem,[xyz]mm{k} */
generate_exception_if(evex.w != (b & 1), EXC_UD);
goto avx512f_no_sae;
@@ -9331,6 +9335,17 @@ x86_emulate(
host_and_vcpu_must_have(sse4_1);
goto simd_0f3a_common;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0a): /* vrndscaless $imm8,xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0b): /* vrndscalesd $imm8,xmm/mem,xmm,xmm{k} */
+ generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD); /* scalar forms: #UD if evex.brs is set with a non-register source */
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x08): /* vrndscaleps $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x09): /* vrndscalepd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512f);
+ generate_exception_if(evex.w != (b & 1), EXC_UD); /* evex.w must equal bit 0 of b: 0 for 0x08/0x0a (ps/ss), 1 for 0x09/0x0b (pd/sd) - assumes b holds the opcode byte */
+ avx512_vlen_check(b & 2); /* argument is non-zero only for the scalar opcodes 0x0a/0x0b (same assumption on b) */
+ goto simd_imm8_zmm;
+
case X86EMUL_OPC(0x0f3a, 0x0f): /* palignr $imm8,mm/m64,mm */
case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(ssse3);
Plus their AVX512BW counterparts. Take the opportunity to also eliminate a pair of open-coded instances of scalar_1op(). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v7: Re-base. v6: Re-base over changes earlier in the series. v5: New.