
[v2,01/17] x86emul: support remaining AVX insns

Message ID 59BAB873020000780017B379@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich Sept. 14, 2017, 3:12 p.m. UTC
I.e. those that aren't equivalents of SSEn ones.

There's one necessary change to generic code: Faulting behavior of
VMASKMOVP{S,D} requires us to do partial reads/writes.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Move vpmaskmov{d,q} handling to AVX2 patch.
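
To illustrate the point (an editorial sketch, not the patch's actual
code; masked_access(), mask_bits, nelems, elem_bytes, and
access_bytes() are all made-up names): elements whose mask bit is clear
must not be accessed at all, and in particular must not fault, so the
memory access gets narrowed to the span of selected elements:

    /* Sketch: narrow a masked read/write to the selected elements. */
    static int masked_access(unsigned long addr, unsigned int mask_bits,
                             unsigned int nelems, unsigned int elem_bytes)
    {
        unsigned int first = 0, last = nelems;

        while ( first < nelems && !(mask_bits & (1u << first)) )
            ++first;
        if ( first == nelems )
            return 0;                  /* all mask bits clear: no access */
        while ( !(mask_bits & (1u << (last - 1))) )
            --last;
        /* Only the bytes of elements [first, last) may be touched. */
        return access_bytes(addr + first * elem_bytes,
                            (last - first) * elem_bytes);
    }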

Comments

George Dunlap Oct. 9, 2017, 3:36 p.m. UTC | #1
On 09/14/2017 04:12 PM, Jan Beulich wrote:
> I.e. those that aren't equivalents of SSEn ones.
> 
> There's one necessary change to generic code: Faulting behavior of
> VMASKMOVP{S,D} requires us to do partial reads/writes.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Move vpmaskmov{d,q} handling to AVX2 patch.
> 
> --- a/.gitignore
> +++ b/.gitignore
> @@ -224,7 +224,7 @@
>  tools/tests/x86_emulator/*.bin
>  tools/tests/x86_emulator/*.tmp
>  tools/tests/x86_emulator/asm
> -tools/tests/x86_emulator/avx*.h
> +tools/tests/x86_emulator/avx*.[ch]
>  tools/tests/x86_emulator/blowfish.h
>  tools/tests/x86_emulator/sse*.[ch]
>  tools/tests/x86_emulator/test_x86_emulator
> --- a/tools/tests/x86_emulator/Makefile
> +++ b/tools/tests/x86_emulator/Makefile
> @@ -11,8 +11,8 @@ all: $(TARGET)
>  run: $(TARGET)
>  	./$(TARGET)
>  
> -SIMD := sse sse2 sse4
> -TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
> +SIMD := sse sse2 sse4 avx
> +TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
>  
>  blowfish-cflags := ""
>  blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
> @@ -26,34 +26,36 @@ sse2-flts := 4 8
>  sse4-vecs := $(sse2-vecs)
>  sse4-ints := $(sse2-ints)
>  sse4-flts := $(sse2-flts)
> +avx-vecs := 16 32
> +avx-ints :=
> +avx-flts := 4 8
>  
>  # When converting SSE to AVX, have the compiler avoid XMM0 to widen
>  # coverage of the VEX.vvvv checks in the emulator. We must not do this,
>  # however, for SSE4.1 and later, as there are instructions with XMM0 as
>  # an implicit operand.
> -sse2avx-sse  := -ffixed-xmm0 -Wa,-msse2avx
> -sse2avx-sse2 := $(sse2avx-sse)
> +sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
>  sse2avx-sse4 := -Wa,-msse2avx
>  
> +# For AVX and later, have the compiler avoid XMM0 to widen coverage of
> +# the VEX.vvvv checks in the emulator.
> +non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
> +
>  define simd-defs
>  $(1)-cflags := \
>  	$(foreach vec,$($(1)-vecs), \
>  	  $(foreach int,$($(1)-ints), \
> -	    "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> -	    "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
> +	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> +	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
>  	  $(foreach flt,$($(1)-flts), \
> -	    "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
> +	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
>  	$(foreach flt,$($(1)-flts), \
> -	  "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
> +	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
>  $(1)-avx-cflags := \
>  	$(foreach vec,$($(1)-vecs), \
>  	  $(foreach int,$($(1)-ints), \
>  	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> -	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
> -	  $(foreach flt,$($(1)-flts), \
> -	    "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
> -	$(foreach flt,$($(1)-flts), \
> -	  "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
> +	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
>  endef
>  
>  $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
> --- a/tools/tests/x86_emulator/simd.c
> +++ b/tools/tests/x86_emulator/simd.c
> @@ -70,7 +70,13 @@ typedef long long __attribute__((vector_
>  #if VEC_SIZE == 8 && defined(__SSE__)
>  # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
>  #elif VEC_SIZE == 16
> -# if defined(__SSE4_1__)
> +# if defined(__AVX__) && defined(FLOAT_SIZE)
> +#  if ELEM_SIZE == 4
> +#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
> +#  elif ELEM_SIZE == 8
> +#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
> +#  endif
> +# elif defined(__SSE4_1__)
>  #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
>  # elif defined(__SSE__) && ELEM_SIZE == 4
>  #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
> @@ -81,6 +87,12 @@ typedef long long __attribute__((vector_
>  #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
>  #  endif
>  # endif
> +#elif VEC_SIZE == 32
> +# if defined(__AVX__) && ELEM_SIZE == 4
> +#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
> +# elif defined(__AVX__) && ELEM_SIZE == 8
> +#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
> +# endif
>  #endif
>  
>  #ifndef to_bool
> @@ -105,6 +117,12 @@ static inline bool _to_bool(byte_vec_t b
>  # elif FLOAT_SIZE == 8
>  #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
>  # endif
> +#elif VEC_SIZE == 32 && defined(__AVX__)
> +# if FLOAT_SIZE == 4
> +#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
> +# elif FLOAT_SIZE == 8
> +#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
> +# endif
>  #endif
>  
>  #if VEC_SIZE == FLOAT_SIZE
> @@ -116,7 +134,25 @@ static inline bool _to_bool(byte_vec_t b
>  #endif
>  
>  #if FLOAT_SIZE == 4 && defined(__SSE__)
> -# if VEC_SIZE == 16
> +# if VEC_SIZE == 32 && defined(__AVX__)
> +#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
> +#  define max(x, y) __builtin_ia32_maxps256(x, y)
> +#  define min(x, y) __builtin_ia32_minps256(x, y)
> +#  define recip(x) __builtin_ia32_rcpps256(x)
> +#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
> +#  define sqrt(x) __builtin_ia32_sqrtps256(x)
> +#  define swap(x) ({ \
> +    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
> +    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
> +})
> +#  define swap2(x) ({ \
> +    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
> +    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
> +})
> +# elif VEC_SIZE == 16
> +#  ifdef __AVX__
> +#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
> +#  endif
>  #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
>  #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
>  #  define max(x, y) __builtin_ia32_maxps(x, y)
> @@ -125,13 +161,39 @@ static inline bool _to_bool(byte_vec_t b
>  #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
>  #  define sqrt(x) __builtin_ia32_sqrtps(x)
>  #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
> +#  ifdef __AVX__
> +#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
> +#  endif
>  # elif VEC_SIZE == 4
>  #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
>  #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
>  #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
>  # endif
>  #elif FLOAT_SIZE == 8 && defined(__SSE2__)
> -# if VEC_SIZE == 16
> +# if VEC_SIZE == 32 && defined(__AVX__)
> +#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
> +#  define max(x, y) __builtin_ia32_maxpd256(x, y)
> +#  define min(x, y) __builtin_ia32_minpd256(x, y)
> +#  define recip(x) ({ \
> +    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
> +    t_ = __builtin_ia32_vextractf128_ps256( \
> +             __builtin_ia32_rcpps256( \
> +                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
> +    __builtin_ia32_cvtps2pd256(t_); \
> +})
> +#  define rsqrt(x) ({ \
> +    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
> +    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
> +    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
> +    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
> +    __builtin_ia32_cvtps2pd256(t1_); \
> +})
> +#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
> +#  define swap(x) ({ \
> +    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
> +    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
> +})
> +# elif VEC_SIZE == 16
>  #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
>  #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
>  #  define max(x, y) __builtin_ia32_maxpd(x, y)
> @@ -140,6 +202,10 @@ static inline bool _to_bool(byte_vec_t b
>  #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
>  #  define sqrt(x) __builtin_ia32_sqrtpd(x)
>  #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
> +#  ifdef __AVX__
> +#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
> +                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
> +#  endif
>  # elif VEC_SIZE == 8
>  #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
>  #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
> @@ -201,6 +267,31 @@ static inline bool _to_bool(byte_vec_t b
>  #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
>  #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
>  # endif
> +#elif VEC_SIZE == 32 && defined(__AVX__)
> +# if FLOAT_SIZE == 4
> +#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
> +#  define dup_hi(x) __builtin_ia32_movshdup256(x)
> +#  define dup_lo(x) __builtin_ia32_movsldup256(x)
> +#  define hadd(x, y) ({ \
> +        vec_t t_ = __builtin_ia32_haddps256(x, y); \
> +        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
> +})
> +#  define hsub(x, y) ({ \
> +        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
> +        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
> +})
> +# elif FLOAT_SIZE == 8
> +#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
> +#  define dup_lo(x) __builtin_ia32_movddup256(x)
> +#  define hadd(x, y) ({ \
> +        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
> +        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
> +})
> +#  define hsub(x, y) ({ \
> +        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
> +        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
> +})
> +# endif
>  #endif
>  #if VEC_SIZE == 16 && defined(__SSSE3__)
>  # if INT_SIZE == 1
> @@ -282,6 +373,31 @@ static inline bool _to_bool(byte_vec_t b
>  #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
>  # endif
>  #endif
> +#if VEC_SIZE == 32 && defined(__AVX__)
> +# if FLOAT_SIZE == 4
> +#  define dot_product(x, y) ({ \
> +    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
> +    (vec_t){t_[0] + t_[4]}; \
> +})
> +#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
> +#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
> +#  define select2(d, x, y, m) ({ \
> +    vsi_t m_ = (vsi_t)(m); \
> +    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
> +    __builtin_ia32_maskstoreps256(d, ~m_, y); \
> +})
> +#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
> +# elif FLOAT_SIZE == 8
> +#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
> +#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
> +#  define select2(d, x, y, m) ({ \
> +    vdi_t m_ = (vdi_t)(m); \
> +    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
> +    __builtin_ia32_maskstorepd256(d, ~m_, y); \
> +})
> +#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
> +# endif
> +#endif
>  #if VEC_SIZE == FLOAT_SIZE
>  # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
>  # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
> @@ -555,6 +671,15 @@ int simd_test(void)
>      if ( !to_bool(swap(src) == inv) ) return __LINE__;
>  #endif
>  
> +#ifdef swap2
> +    touch(src);
> +    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
> +#endif
> +
> +#if defined(broadcast)
> +    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
> +#endif
> +
>  #if defined(interleave_lo) && defined(interleave_hi)
>      touch(src);
>      x = interleave_lo(inv, src);
> @@ -652,6 +777,15 @@ int simd_test(void)
>  # endif
>      if ( !to_bool(z == y) ) return __LINE__;
>  #endif
> +
> +#ifdef select2
> +# ifdef UINT_SIZE
> +    select2(&z, src, inv, alt);
> +# else
> +    select2(&z, src, inv, alt > 0);
> +# endif
> +    if ( !to_bool(z == y) ) return __LINE__;
> +#endif
>  
>  #ifdef mix
>      touch(src);
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -8,9 +8,9 @@
>  #include "sse.h"
>  #include "sse2.h"
>  #include "sse4.h"
> -#include "sse-avx.h"
>  #include "sse2-avx.h"
>  #include "sse4-avx.h"
> +#include "avx.h"
>  
>  #define verbose false /* Switch to true for far more logging. */
>  
> @@ -44,7 +44,6 @@ static bool simd_check_avx(void)
>  {
>      return cpu_has_avx;
>  }
> -#define simd_check_sse_avx   simd_check_avx
>  #define simd_check_sse2_avx  simd_check_avx
>  #define simd_check_sse4_avx  simd_check_avx
>  
> @@ -122,12 +121,6 @@ static const struct {
>      SIMD(SSE4 packed u32,        sse4,      16u4),
>      SIMD(SSE4 packed s64,        sse4,      16i8),
>      SIMD(SSE4 packed u64,        sse4,      16u8),
> -    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
> -    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
> -    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
> -    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
> -    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
> -    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
>      SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
>      SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
>      SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
> @@ -136,10 +129,6 @@ static const struct {
>      SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
>      SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
>      SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
> -    SIMD(SSE4/AVX scalar single, sse4_avx,    f4),
> -    SIMD(SSE4/AVX packed single, sse4_avx,  16f4),
> -    SIMD(SSE4/AVX scalar double, sse4_avx,    f8),
> -    SIMD(SSE4/AVX packed double, sse4_avx,  16f8),
>      SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
>      SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
>      SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
> @@ -148,6 +137,12 @@ static const struct {
>      SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
>      SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
>      SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
> +    SIMD(AVX scalar single,      avx,         f4),
> +    SIMD(AVX 128bit single,      avx,       16f4),
> +    SIMD(AVX 256bit single,      avx,       32f4),
> +    SIMD(AVX scalar double,      avx,         f8),
> +    SIMD(AVX 128bit double,      avx,       16f8),
> +    SIMD(AVX 256bit double,      avx,       32f8),
>  #undef SIMD_
>  #undef SIMD
>  };
> @@ -2859,6 +2854,81 @@ int main(int argc, char **argv)
>          printf("okay\n");
>      }
>      else
> +        printf("skipped\n");
> +
> +    /*
> +     * The following "maskmov" tests are not only making sure the written data
> +     * is correct, but verify (by placing operands on the mapping boundaries)
> +     * that elements controlled by clear mask bits aren't being accessed.
> +     */
> +    printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
> +    if ( stack_exec && cpu_has_avx )
> +    {
> +        decl_insn(vmaskmovps);
> +
> +        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
> +                       "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
> +                       put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
> +                       :: "d" (NULL) );
> +
> +        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
> +        set_insn(vmaskmovps);
> +        regs.edx = (unsigned long)res + MMAP_SZ - 4;
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
> +             res[MMAP_SZ / sizeof(*res) - 1] ||
> +             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
> +                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
> +            goto fail;
> +
> +        asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
> +        memset(res, 0xdb, 32);
> +        set_insn(vmaskmovps);
> +        regs.edx = (unsigned long)(res - 3);
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
> +             res[0] || memcmp(res + 1, res + 4, 12) )
> +            goto fail;
> +
> +        printf("okay\n");
> +    }
> +    else
> +        printf("skipped\n");
> +
> +    printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
> +    if ( stack_exec && cpu_has_avx )
> +    {
> +        decl_insn(vmaskmovpd);
> +
> +        asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
> +                       "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
> +                       put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
> +                       :: "d" (NULL) );
> +
> +        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
> +        set_insn(vmaskmovpd);
> +        regs.edx = (unsigned long)res + MMAP_SZ - 8;
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
> +             res[MMAP_SZ / sizeof(*res) - 1] ||
> +             res[MMAP_SZ / sizeof(*res) - 2] ||
> +             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
> +                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
> +            goto fail;
> +
> +        asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
> +                       "vmovsd %xmm1, %xmm2, %xmm2" );
> +        memset(res, 0xdb, 32);
> +        set_insn(vmaskmovpd);
> +        regs.edx = (unsigned long)(res - 2);
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
> +             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
> +            goto fail;
> +
> +        printf("okay\n");
> +    }
> +    else
>          printf("skipped\n");
>  
>      printf("%-40s", "Testing stmxcsr (%edx)...");
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -226,6 +226,12 @@ enum simd_opsize {
>       */
>      simd_scalar_fp,
>  
> +    /*
> +     * 128 bits of integer or floating point data, with no further
> +     * formatting information.
> +     */
> +    simd_128,
> +
>      /* Operand size encoded in non-standard way. */
>      simd_other
>  };
> @@ -361,14 +367,19 @@ static const struct {
>      uint8_t vsib:1;
>  } ext0f38_table[256] = {
>      [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
> +    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
>      [0x10] = { .simd_size = simd_packed_int },
>      [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
>      [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
> +    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
> +    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
>      [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
>      [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
>      [0x28 ... 0x29] = { .simd_size = simd_packed_int },
>      [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
>      [0x2b] = { .simd_size = simd_packed_int },
> +    [0x2c ... 0x2d] = { .simd_size = simd_other },
> +    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
>      [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
>      [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
>      [0x40] = { .simd_size = simd_packed_int },
> @@ -391,11 +402,15 @@ static const struct {
>      uint8_t two_op:1;
>      uint8_t four_op:1;
>  } ext0f3a_table[256] = {
> +    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
> +    [0x06] = { .simd_size = simd_packed_fp },
>      [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
>      [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
>      [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
>      [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
>      [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
> +    [0x18] = { .simd_size = simd_128 },
> +    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
>      [0x20] = { .simd_size = simd_none },
>      [0x21] = { .simd_size = simd_other },
>      [0x22] = { .simd_size = simd_none },
> @@ -469,15 +484,18 @@ union vex {
>      buf_ + 3; \
>  })
>  
> +#define copy_VEX(ptr, vex) ({ \
> +    if ( !mode_64bit() ) \
> +        (vex).reg |= 8; \
> +    (ptr)[0 - PFX_BYTES] = 0xc4; \
> +    (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
> +    (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
> +    container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
> +})
> +
>  #define copy_REX_VEX(ptr, rex, vex) do { \
>      if ( (vex).opcx != vex_none ) \
> -    { \
> -        if ( !mode_64bit() ) \
> -            vex.reg |= 8; \
> -        (ptr)[0 - PFX_BYTES] = 0xc4; \
> -        (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
> -        (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
> -    } \
> +        copy_VEX(ptr, vex); \
>      else \
>      { \
>          if ( (vex).pfx ) \
> @@ -2941,6 +2959,10 @@ x86_decode(
>          op_bytes = 4 << (ctxt->opcode & 1);
>          break;
>  
> +    case simd_128:
> +        op_bytes = 16;
> +        break;
> +
>      default:
>          op_bytes = 0;
>          break;
> @@ -2967,6 +2989,7 @@ x86_emulate(
>      struct x86_emulate_state state;
>      int rc;
>      uint8_t b, d, *opc = NULL;
> +    unsigned int first_byte = 0;
>      bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
>  	    !is_branch_step(ctxt, ops);
>      bool sfence = false;
> @@ -7119,6 +7142,18 @@ x86_emulate(
>          fic.insn_bytes = PFX_BYTES + 3;
>          break;
>  
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
> +        generate_exception_if(!vex.l, EXC_UD);
> +        /* fall through */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
> +        generate_exception_if(ea.type != OP_MEM, EXC_UD);

Just checking my understanding here: The reason for this check is that
as of this patch, we still only support AVX instructions, not AVX2 (and
later) variants, which have non-memory-source variants?

I'm not trying to complain, but I want to reflect back some of what I'm
experiencing trying to review this series.  After having gotten somewhat
up to speed on the instructions and terminology, and the general layout
of the existing code, I understand that the basic method for adding a
new instruction of this type is:

1. Add instruction re-execution support
  a. Add decode information into the appropriate tables (in this case
ext0f38_table[] and ext0f3a_table[])
  b. Add case statements which
   - Check for the appropriate conditions
   - Set up anything that needs setting up for the instruction
re-execution (in the "if (state->simd_size)" clause).

2. Add testing

And of course 1b may involve extending functionality, such as adding
simd_128 or handling new source / destination modes or combinations.
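
To make this concrete (the sketch below is condensed from this very
patch, using the vbroadcast* additions as the example), 1a and 1b look
like:

    /* 1a: decode information, in ext0f38_table[] */
    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
    [0x1a] = { .simd_size = simd_128, .two_op = 1 },

    /* 1b: condition checks, in the big switch in x86_emulate() */
    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
        generate_exception_if(!vex.l, EXC_UD);
        /* fall through */
    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
        generate_exception_if(ea.type != OP_MEM, EXC_UD);
        /*
         * Stub setup follows here; execution then goes through the
         * common "if ( state->simd_size )" path further down.
         */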

So a proper review would involve making sure that all the above had been
done properly: That the right instruction was decoded, the right tables
set up, the right conditions checked, the right inputs made to the
re-execution code further down; and then of course making sure that
these instructions were properly added to the testing matrix.

So I look up the instructions named above (vbroadcast*) and verified
that they matched the codes listed.  The manual says the first two generate
an exception if vex.l == 0; so far so good.

But what's the ea.type != OP_MEM thing?  The manual certainly says there
are instructions that don't reference memory.  A bit of looking and
poking around, and I formulated the question above.  The test for AVX
support is made in a completely different part of the file, and no
mention of AVX2 / whatever is done here.

OK, go up and check the tables -- additions to ext0f38 at 0x18, 0x19,
and 0x1a seem to match the description.

Now what?  How do I verify that everything else on the path will work as
it should?

And I still don't have any idea how to start on verifying that these
instructions have been added to the test framework properly.

I'm only 3 instructions in, with at least 20 more to go.

There is certainly something to be said for saying that if you're going
to review a change to code you have to understand the underlying code
itself; and with a piece of code as complicated as this, there's just no
way around that being a big learning curve.

But as a practical matter, very few people are that familiar with this
code.  Even without all the other random distractions with security
issues and such, I doubt anyone who hadn't specifically been trying to
put forward the effort to help you would have made it through reviewing
this patch without deciding their time would be better spent elsewhere.

So it really seems to be that if you want someone to review this code --
particularly anyone besides Andy, but probably even with Andy -- you
have to go further into making it easier for someone able to read a
manual and read code, but not intimately familiar with the x86
instruction set, the emulator, or the testing framework, to verify that
the changes you're making are correct.

Maybe after the code freeze I'll see if I can re-work this patch (or
portions of it) into something which I think would be easier to review;
both as an exercise for myself (to make sure I understand what's going
on), and as an example of what I'm talking about.  Given the rate this
is going, there's no way I'm going to be able to give an R-b for this
patch before the freeze.

 -George
Jan Beulich Oct. 9, 2017, 4:12 p.m. UTC | #2
>>> On 09.10.17 at 17:36, <george.dunlap@citrix.com> wrote:
> On 09/14/2017 04:12 PM, Jan Beulich wrote:
>> @@ -7119,6 +7142,18 @@ x86_emulate(
>>          fic.insn_bytes = PFX_BYTES + 3;
>>          break;
>>  
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
>> +        generate_exception_if(!vex.l, EXC_UD);
>> +        /* fall through */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
>> +        generate_exception_if(ea.type != OP_MEM, EXC_UD);
> 
> Just checking my understanding here: The reason for this check is that
> as of this patch, we still only support AVX instructions, not AVX2 (and
> later) variants, which have non-memory-source variants?

That's a good point actually. Yes, for this patch alone it's fine to
handle just memory operands. But the later AVX2 patch doesn't
generalize this - I've managed to overlook this aspect. Based on
how other support has been added, it could have been done the
other way as well (using suitable conditionals), but now that this
patch has gone in, I'll have to do it in the AVX2 patch.
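
(For the record, the "suitable conditionals" variant would have looked
something like the following. This is purely illustrative and not
necessarily what the AVX2 patch will end up using, but it follows the
usual host_and_vcpu_must_have() pattern:

    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,{x,y}mm */
        if ( ea.type != OP_MEM )
            /* Register-source forms exist only as of AVX2. */
            host_and_vcpu_must_have(avx2);

i.e. raising #UD only where the CPU itself would.)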

> I'm not trying to complain, but I want to reflect back some of what I'm
> experiencing trying to review this series.  After having gotten somewhat
> up to speed on the instructions and terminology, and the general layout
> of the existing code, I understand that the basic method for adding a
> new instruction of this type is:
> 
> 1. Add instruction re-execution support
>   a. Add decode information into the appropriate tables (in this case
> ext0f38_table[] and ext0f3a_table[])
>   b. Add case statements which
>    - Check for the appropriate conditions
>    - Set up anything that needs setting up for the instruction
> re-execution (in the "if (state->simd_size)" clause).
> 
> 2. Add testing

Whatever that means. Generally I'm trying to add tests for new
code paths, or for insns using unusual operand sizes. But I'm not
going to claim that the testing being added always completely
covers all new insns.

> And of course 1b may involve extending functionality, such as adding
> simd_128 or handling new source / destination modes or combinations.
> 
> So a proper review would involve making sure that all the above had been
> done properly: That the right instruction was decoded, the right tables
> set up, the right conditions checked, the right inputs made to the
> re-execution code further down; and then of course making sure that
> these instructions were properly added to the testing matrix.
> 
> So I look up the instructions named above (vbroadcast*) and verified
> that they matched the codes listed.  The manual says the first two generate
> an exception if vex.l == 0; so far so good.
> 
> But what's the ea.type != OP_MEM thing?  The manual certainly says there
> are instructions that don't reference memory.  A bit of looking and
> poking around, and I formulated the question above.  The test for AVX
> support is made in a completely different part of the file, and no
> mention of AVX2 / whatever is done here.
> 
> OK, go up and check the tables -- additions to ext0f38 at 0x18, 0x19,
> and 0x1a seem to match the description.
> 
> Now what?  How do I verify that everything else on the path will work as
> it should?

Well, if we assume that the code paths outside the big switch
statements work, then it is really only the table addition and the
customization in the new case label which need verifying. If otoh
you suspect that the glue between existing (common) and new
(per opcode) code isn't right, then going through the common
parts with the specifics of the new instruction in mind won't be
avoidable. But isn't that how you'd review other code as well?

> And I still don't have any idea how to start on verifying that these
> instructions have been added to the test framework properly.

Is seeing the respective compiler intrinsics being used (or, where
those aren't suitable, inline asm()-s with the very instructions) not
enough? Plus, as said above, don't expect every single insn to be
tested, or even every single flavor of one insn.
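
To spell out the chain for this patch: the intrinsics in simd.c make
the compiler emit exactly the insns under test, e.g. (quoting the
patch)

    #  define max(x, y) __builtin_ia32_maxps256(x, y)
    #  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)

which compile to vmaxps / vmovmskps on 256-bit operands, while the
harness entry

    SIMD(AVX 256bit single,      avx,       32f4),

causes simd_test() to be built with -mavx -DVEC_SIZE=32 -DFLOAT_SIZE=4
and run under x86_emulate(), so every insn the compiler emitted has to
decode and execute correctly for the test to pass.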

> I'm only 3 instructions in, with at least 20 more to go.
> 
> There is certainly something to be said for saying that if you're going
> to review a change to code you have to understand the underlying code
> itself; and with a piece of code as complicated as this, there's just no
> way around that being a big learning curve.
> 
> But as a practical matter, very few people are that familiar with this
> code.  Even without all the other random distractions with security
> issues and such, I doubt anyone who hadn't specifically been trying to
> put forward the effort to help you would have made it through reviewing
> this patch without deciding their time would be better spent elsewhere.
> 
> So it really seems to be that if you want someone to review this code --
> particularly anyone besides Andy, but probably even with Andy -- you
> have to go further into making it easier for someone able to read a
> manual and read code, but not intimately familiar with the x86
> instruction set, the emulator, or the testing framework, to verify that
> the changes you're making are correct.

I can fully understand what you say, yet this doesn't help me see
how I could have helped the situation. I could have split the patch
into smaller pieces (one insn at a time, for example), but that wouldn't
have changed the amount of checking you'd have to do. It would
merely reduce the granularity at which you could send back comments
or an ack.

I specifically cannot see how I could have helped you with a more
verbose description, since everything the patch does that isn't
mentioned there is simply cloning existing code to support the new
insns, which the title names. In particular I don't think listing
individual instructions being added would be helpful. And in no
case do I think that reproducing any information the SDM has
would be useful.

> Maybe after the code freeze I'll see if I can re-work this patch (or
> portions of it) into something which I think would be easier to review;
> both as an exercise for myself (to make sure I understand what's going
> on), and as an example of what I'm talking about.  Given the rate this
> is going, there's no way I'm going to be able to give an R-b for this
> patch before the freeze.

Given this patch has gone in with Andrew's ack (which wasn't sent
on the list), perhaps it would be worthwhile instead doing any such rework
for one of the later patches (the F16C patch, adding just two insns,
may be a good candidate if you're not meaning to also split up
patches into smaller pieces, whereas e.g. the AVX2 one would be
about the same size and number of added insns as the one here is).

Jan
George Dunlap Oct. 10, 2017, 2:17 p.m. UTC | #3
On 10/09/2017 05:12 PM, Jan Beulich wrote:
>>>> On 09.10.17 at 17:36, <george.dunlap@citrix.com> wrote:
>> On 09/14/2017 04:12 PM, Jan Beulich wrote:
>>> @@ -7119,6 +7142,18 @@ x86_emulate(
>>>          fic.insn_bytes = PFX_BYTES + 3;
>>>          break;
>>>  
>>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
>>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
>>> +        generate_exception_if(!vex.l, EXC_UD);
>>> +        /* fall through */
>>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
>>> +        generate_exception_if(ea.type != OP_MEM, EXC_UD);
>>
>> Just checking my understanding here: The reason for this check is that
>> as of this patch, we still only support AVX instructions, not AVX2 (and
>> later) variants, which have non-memory-source variants?
> 
> That's a good point actually. Yes, for this patch alone it's fine to
> handle just memory operands. But the later AVX2 patch doesn't
> generalize this - I've managed to overlook this aspect. Based on
> how other support has been added, it could have been done the
> other way as well (using suitable conditionals), but now that this
> patch has gone in, I'll have to do it in the AVX2 patch.
> 
>> I'm not trying to complain, but I want to reflect back some of what I'm
>> experiencing trying to review this series.  After having gotten somewhat
>> up to speed on the instructions and terminology, and the general layout
>> of the existing code, I understand that the basic method for adding a
>> new instruction of this type is:
>>
>> 1. Add instruction re-execution support
>>   a. Add decode information into the appropriate tables (in this case
>> ext0f38_table[] and ext0f3a_table[])
>>   b. Add case statements which
>>    - Check for the appropriate conditions
>>    - Set up anything that needs setting up for the instruction
>> re-execution (in the "if (state->simd_size)" clause).
>>
>> 2. Add testing
> 
> Whatever that means. Generally I'm trying to add tests for new
> code paths, or for insns using unusual operand sizes. But I'm not
> going to claim that the testing being added always completely
> covers all new insns.
> 
>> And of course 1b may involve extending functionality, such as adding
>> simd_128 or handling new source / destination modes or combinations.
>>
>> So a proper review would involve making sure that all the above had been
>> done properly: That the right instruction was decoded, the right tables
>> set up, the right conditions checked, the right inputs made to the
>> re-execution code further down; and then of course making sure that
>> these instructions were properly added to the testing matrix.
>>
>> So I look up the instructions named above (vbroadcast*) and verified
>> that they matched the codes listed.  The manual says the first two generate
>> an exception if vex.l == 0; so far so good.
>>
>> But what's the ea.type != OP_MEM thing?  The manual certainly says there
>> are instructions that don't reference memory.  A bit of looking and
>> poking around, and I formulated the question above.  The test for AVX
>> support is made in a completely different part of the file, and no
>> mention of AVX2 / whatever is done here.
>>
>> OK, go up and check the tables -- additions to ext0f38 at 0x18, 0x19,
>> and 0x1a seem to match the description.
>>
>> Now what?  How do I verify that everything else on the path will work as
>> it should?
> 
> Well, if we assume that the code paths outside the big switch
> statements work, then it is really only the table addition and the
> customization in the new case label which need verifying. If otoh
> you suspect that the glue between existing (common) and new
> (per opcode) code isn't right, then going through the common
> parts with the specifics of the new instruction in mind won't be
> avoidable. But isn't that how you'd review other code as well?

Well, the question isn't whether I suspect there's something wrong; it's
checking to see whether the author has missed something.  In order to
know that a particular instruction will DTRT when going through the
common code, I need to have an idea not only what the instruction needs,
but what the common code is doing.

And no, this process isn't different than what a reviewer has to do for
any patch; it's just particularly difficult for this patch.

>> And I still don't have any idea how to start on verifying that these
>> instructions have been added to the test framework properly.
> 
> Is seeing the respective compiler intrinsics being used (or, where
> those aren't suitable, inline asm()-s with the very instructions) not
> enough? Plus, as said above, don't expect every single insn to be
> tested, or even every single flavor of one insn.
> 
>> I'm only 3 instructions in, with at least 20 more to go.
>>
>> There is certainly something to be said for saying that if you're going
>> to review a change to code you have to understand the underlying code
>> itself; and with a piece of code as complicated as this, there's just no
>> way around that being a big learning curve.
>>
>> But as a practical matter, very few people are that familiar with this
>> code.  Even without all the other random distractions with security
>> issues and such, I doubt anyone who hadn't specifically been trying to
>> put forward the effort to help you would have made it through reviewing
>> this patch without deciding their time would be better spent elsewhere.
>>
>> So it really seems to be that if you want someone to review this code --
>> particularly anyone besides Andy, but probably even with Andy -- you
>> have to go further into making it easier for someone able to read a
>> manual and read code, but not intimately familiar with the x86
>> instruction set, the emulator, or the testing framework, to verify that
>> the changes you're making are correct.
> 
> I can fully understand what you say, yet this doesn't help me see
> how I could have helped the situation. I could have split the patch
> into smaller pieces (one insn at a time, for example), but that wouldn't
> have changed the amount of checking you'd have to do. It would
> merely reduce the granularity at which you could send back comments
> or an ack.

No, but it would, for instance, have been easier to know which bits of
the patch went with which instruction.

> I specifically cannot see how I could have helped you with a more
> verbose description, since everything the patch does that isn't
> mentioned there is simply cloning existing code to support the new
> insns, which the title names. In particular I don't think listing
> individual instructions being added would be helpful. And in no
> case do I think that reproducing any information the SDM has
> would be useful.

Right, and I'm aware that my mail was unfortunately full of problems and
not solutions, which is why I said I'd try to re-work this patch as an
exercise for myself / example of what I mean.  (The end result might
even be that I come back and say that the patch you've posted is the
optimum from a comprehension / review perspective, although at the
moment I think that's unlikely.)  And I certainly don't think that
quoting large sections of the reference manual is useful; it is
reasonable to expect people reviewing this to have read the introduction
to the instruction format, and be able to look up the instruction name
in the Intel manual.

I won't write about things in detail now, but the basic idea comes down
to how much the reviewer has to *figure out* vs how much the reviewer is
*verifying*.  Verifying claims about what the code does before or after
the patch is a lot easier than figuring out important aspects about what
the code does before and after a patch.

And I want to emphasize: I'm not saying there was something wrong with
the patches; I'm just saying that, as they are, it would be very unlikely
that someone not already very familiar with the code would review them.
And so if you want other people to review it, you may need to put in
some extra effort to make that possible.  (And hopefully I'll have
something useful to say about what effort would be useful in a little bit.)

>> Maybe after the code freeze I'll see if I can re-work this patch (or
>> portions of it) into something which I think would be easier to review;
>> both as an exercise for myself (to make sure I understand what's going
>> on), and as an example of what I'm talking about.  Given the rate this
>> is going, there's no way I'm going to be able to give an R-b for this
>> patch before the freeze.
> 
> Given this patch has gone in with Andrew's ack (which wasn't sent
> on the list), perhaps it would be worthwhile instead doing any such rework
> for one of the later patches (the F16C patch, adding just two insns,
> may be a good candidate if you're not meaning to also split up
> patches into smaller pieces, whereas e.g. the AVX2 one would be
> about the same size and number of added insns as the one here is).

Right; I'll take a look.

 -George
diff mbox

Patch

--- a/.gitignore
+++ b/.gitignore
@@ -224,7 +224,7 @@ 
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
 tools/tests/x86_emulator/asm
-tools/tests/x86_emulator/avx*.h
+tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,8 +11,8 @@  all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4
-TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
+SIMD := sse sse2 sse4 avx
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -26,34 +26,36 @@  sse2-flts := 4 8
 sse4-vecs := $(sse2-vecs)
 sse4-ints := $(sse2-ints)
 sse4-flts := $(sse2-flts)
+avx-vecs := 16 32
+avx-ints :=
+avx-flts := 4 8
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
 # however, for SSE4.1 and later, as there are instructions with XMM0 as
 # an implicit operand.
-sse2avx-sse  := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
 sse2avx-sse4 := -Wa,-msse2avx
 
+# For AVX and later, have the compiler avoid XMM0 to widen coverage of
+# the VEX.vvvv checks in the emulator.
+non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+
 define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
 	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
-	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
-	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -70,7 +70,13 @@  typedef long long __attribute__((vector_
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
-# if defined(__SSE4_1__)
+# if defined(__AVX__) && defined(FLOAT_SIZE)
+#  if ELEM_SIZE == 4
+#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+#  elif ELEM_SIZE == 8
+#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+#  endif
+# elif defined(__SSE4_1__)
 #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
 # elif defined(__SSE__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
@@ -81,6 +87,12 @@  typedef long long __attribute__((vector_
 #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
 #  endif
 # endif
+#elif VEC_SIZE == 32
+# if defined(__AVX__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
+# elif defined(__AVX__) && ELEM_SIZE == 8
+#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
+# endif
 #endif
 
 #ifndef to_bool
@@ -105,6 +117,12 @@  static inline bool _to_bool(byte_vec_t b
 # elif FLOAT_SIZE == 8
 #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
+# endif
 #endif
 
 #if VEC_SIZE == FLOAT_SIZE
@@ -116,7 +134,25 @@  static inline bool _to_bool(byte_vec_t b
 #endif
 
 #if FLOAT_SIZE == 4 && defined(__SSE__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  define max(x, y) __builtin_ia32_maxps256(x, y)
+#  define min(x, y) __builtin_ia32_minps256(x, y)
+#  define recip(x) __builtin_ia32_rcpps256(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
+#  define sqrt(x) __builtin_ia32_sqrtps256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+#  define swap2(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
+#  ifdef __AVX__
+#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
+#  endif
 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
 #  define max(x, y) __builtin_ia32_maxps(x, y)
@@ -125,13 +161,39 @@  static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
 #  define sqrt(x) __builtin_ia32_sqrtps(x)
 #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
+#  endif
 # elif VEC_SIZE == 4
 #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
 #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
 #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
 # endif
 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  define max(x, y) __builtin_ia32_maxpd256(x, y)
+#  define min(x, y) __builtin_ia32_minpd256(x, y)
+#  define recip(x) ({ \
+    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
+    t_ = __builtin_ia32_vextractf128_ps256( \
+             __builtin_ia32_rcpps256( \
+                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
+    __builtin_ia32_cvtps2pd256(t_); \
+})
+#  define rsqrt(x) ({ \
+    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
+    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
+    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
+    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
+    __builtin_ia32_cvtps2pd256(t1_); \
+})
+#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
+    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
 #  define max(x, y) __builtin_ia32_maxpd(x, y)
@@ -140,6 +202,10 @@  static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
 #  define sqrt(x) __builtin_ia32_sqrtpd(x)
 #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
+                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
+#  endif
 # elif VEC_SIZE == 8
 #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
 #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
@@ -201,6 +267,31 @@  static inline bool _to_bool(byte_vec_t b
 #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
 #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  define dup_hi(x) __builtin_ia32_movshdup256(x)
+#  define dup_lo(x) __builtin_ia32_movsldup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  define dup_lo(x) __builtin_ia32_movddup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
 # if INT_SIZE == 1
@@ -282,6 +373,31 @@  static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
+#if VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define dot_product(x, y) ({ \
+    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
+    (vec_t){t_[0] + t_[4]}; \
+})
+#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
+    __builtin_ia32_maskstoreps256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
+# elif FLOAT_SIZE == 8
+#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
+    __builtin_ia32_maskstorepd256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
+# endif
+#endif
 #if VEC_SIZE == FLOAT_SIZE
 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
@@ -555,6 +671,15 @@  int simd_test(void)
     if ( !to_bool(swap(src) == inv) ) return __LINE__;
 #endif
 
+#ifdef swap2
+    touch(src);
+    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(broadcast)
+    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
@@ -652,6 +777,15 @@  int simd_test(void)
 # endif
     if ( !to_bool(z == y) ) return __LINE__;
 #endif
+
+#ifdef select2
+# ifdef UINT_SIZE
+    select2(&z, src, inv, alt);
+# else
+    select2(&z, src, inv, alt > 0);
+# endif
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
 
 #ifdef mix
     touch(src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,9 +8,9 @@ 
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse-avx.h"
 #include "sse2-avx.h"
 #include "sse4-avx.h"
+#include "avx.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -44,7 +44,6 @@  static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse_avx   simd_check_avx
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
@@ -122,12 +121,6 @@  static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
-    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
-    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
-    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
-    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
-    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
     SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
     SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
     SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
@@ -136,10 +129,6 @@  static const struct {
     SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
     SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
     SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX scalar single, sse4_avx,    f4),
-    SIMD(SSE4/AVX packed single, sse4_avx,  16f4),
-    SIMD(SSE4/AVX scalar double, sse4_avx,    f8),
-    SIMD(SSE4/AVX packed double, sse4_avx,  16f8),
     SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
     SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
     SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
@@ -148,6 +137,12 @@  static const struct {
     SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
     SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
     SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
+    SIMD(AVX scalar single,      avx,         f4),
+    SIMD(AVX 128bit single,      avx,       16f4),
+    SIMD(AVX 256bit single,      avx,       32f4),
+    SIMD(AVX scalar double,      avx,         f8),
+    SIMD(AVX 128bit double,      avx,       16f8),
+    SIMD(AVX 256bit double,      avx,       32f8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2859,6 +2854,81 @@  int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    /*
+     * The following "maskmov" tests are not only making sure the written data
+     * is correct, but verify (by placing operands on the mapping boundaries)
+     * that elements controlled by clear mask bits aren't being accessed.
+     */
+    printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovps);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
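+        /*
+         * Move the set mask element to the top position and zero the other
+         * three, for the write to then hit just the first element of the
+         * mapping (with the masked-off ones falling below its start).
+         */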
+        asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovpd);
+
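+        /*
+         * As above: all-zero data in %xmm1, and a mask (%xmm2) with just
+         * the low element set.
+         */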
+        asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
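+        /*
+         * Swap the mask halves: vmovddup duplicates the set low element,
+         * and vmovsd then clears the low one again, leaving just the high
+         * element set.
+         */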
+        asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
+                       "vmovsd %xmm1, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing stmxcsr (%edx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -226,6 +226,12 @@  enum simd_opsize {
      */
     simd_scalar_fp,
 
+    /*
+     * 128 bits of integer or floating point data, with no further
+     * formatting information.
+     */
+    simd_128,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -361,14 +367,19 @@  static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x2b] = { .simd_size = simd_packed_int },
+    [0x2c ... 0x2d] = { .simd_size = simd_other },
+    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -391,11 +402,15 @@  static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x18] = { .simd_size = simd_128 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -469,15 +484,18 @@  union vex {
     buf_ + 3; \
 })
 
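+/*
+ * The prefix gets written ahead of @ptr, and a pointer to the copy just made
+ * is handed back, allowing the caller to further adjust it in place. Outside
+ * of 64-bit mode only registers 0...7 can be encoded, hence bit 3 of the
+ * (inverted) vvvv field gets forced on.
+ */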
+#define copy_VEX(ptr, vex) ({ \
+    if ( !mode_64bit() ) \
+        (vex).reg |= 8; \
+    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+    (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+    container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
+})
+
 #define copy_REX_VEX(ptr, rex, vex) do { \
     if ( (vex).opcx != vex_none ) \
-    { \
-        if ( !mode_64bit() ) \
-            vex.reg |= 8; \
-        (ptr)[0 - PFX_BYTES] = 0xc4; \
-        (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
-        (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
-    } \
+        copy_VEX(ptr, vex); \
     else \
     { \
         if ( (vex).pfx ) \
@@ -2941,6 +2959,10 @@  x86_decode(
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_128:
+        op_bytes = 16;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -2967,6 +2989,7 @@  x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
+    unsigned int first_byte = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
@@ -7119,6 +7142,18 @@  x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -7152,6 +7187,10 @@  x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
         if ( vex.opcx == vex_none )
@@ -7232,6 +7271,69 @@  x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+
+        generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular we can't, for writes, avoid the memory read done
+         * in anticipation of all elements in the range eventually being
+         * written), we can (and should) still limit the memory access to
+         * the smallest possible range (suppressing it altogether if all
+         * mask bits are clear), to provide correct faulting behavior. Read
+         * the mask bits via vmovmskp{s,d} for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        if ( !(b & 1) )
+            pvex->pfx = vex_none;
+        opc[0] = 0x50; /* vmovmskp{s,d} */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3; /* ret */
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        if ( !ea.val )
+            goto complete_insn;
+
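+        /*
+         * Narrow the access range: first_byte becomes the offset of the
+         * lowest set mask element, and op_bytes the byte count from there
+         * through the highest set one. A dword mask of 0b0110, for example,
+         * yields first_byte = 4 and op_bytes = 8.
+         */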
+        op_bytes = 4 << (b & 1);
+        first_byte = __builtin_ctz(ea.val);
+        ea.val >>= first_byte;
+        first_byte *= op_bytes;
+        op_bytes *= 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous: with the low clear bits already
+         * shifted out above, ea.val & (ea.val + 1) is non-zero exactly when
+         * a clear bit sits below the highest set one.
+         */
+        if ( ea.val & (ea.val + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
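+        /*
+         * The stub gets invoked from the common SIMD handling further down,
+         * with the memory operand accesses replaced by ones through mmvalp
+         * (supplied in %rAX).
+         */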
+
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
@@ -7427,6 +7529,16 @@  x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x04): /* vpermilps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,{x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
@@ -7780,7 +7892,9 @@  x86_emulate(
             switch ( d & SrcMask )
             {
             case SrcMem:
-                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                rc = ops->read(ea.mem.seg, ea.mem.off + first_byte,
+                               (void *)mmvalp + first_byte, op_bytes,
+                               ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 /* fall through */
@@ -7798,7 +7912,21 @@  x86_emulate(
             if ( (d & DstMask) == DstMem )
             {
                 fail_if(!ops->write); /* Check before running the stub. */
-                ASSERT(d & Mov);
+                if ( (d & SrcMask) == SrcMem )
+                    d |= Mov; /* Force memory write to occur below. */
+
+                switch ( ctxt->opcode )
+                {
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+                    /* These have merge semantics; force write to occur. */
+                    d |= Mov;
+                    break;
+                default:
+                    ASSERT(d & Mov);
+                    break;
+                }
+
                 dst.type = OP_MEM;
                 dst.bytes = op_bytes;
                 dst.mem = ea.mem;
@@ -7846,8 +7974,9 @@  x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(dst.mem.seg, dst.mem.off,
-                            !state->simd_size ? &dst.val : (void *)mmvalp,
+            rc = ops->write(dst.mem.seg, dst.mem.off + first_byte,
+                            !state->simd_size ? &dst.val
+                                              : (void *)mmvalp + first_byte,
                             dst.bytes, ctxt);
             if ( sfence )
                 asm volatile ( "sfence" ::: "memory" );