diff mbox series

[10/11] x86emul: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns

Message ID 2f99e91d-6a91-f860-45d0-9c8b67c9b2b8@suse.com (mailing list archive)
State New, archived
Headers show
Series x86: support AVX512-FP16 | expand

Commit Message

Jan Beulich June 15, 2022, 10:31 a.m. UTC
Signed-off-by: Jan Beulich <jbeulich@suse.com>

Comments

Andrew Cooper Aug. 10, 2022, 7:09 p.m. UTC | #1
On 15/06/2022 11:31, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/decode.c
> +++ b/xen/arch/x86/x86_emulate/decode.c
> @@ -1489,12 +1489,25 @@ int x86emul_decode(struct x86_emulate_st
>                      s->simd_size = simd_scalar_vexw;
>                  break;
>  
> +            case 0x2a: /* vcvtsi2sh */
> +                break;
> +
> +            case 0x2c: case 0x2d: /* vcvt{,t}sh2si */
> +                if ( s->evex.pfx == vex_f3 )
> +                    s->fp16 = true;
> +                break;
> +
>              case 0x2e: case 0x2f: /* v{,u}comish */
>                  if ( !s->evex.pfx )
>                      s->fp16 = true;
>                  s->simd_size = simd_none;
>                  break;
>  
> +            case 0x5b: /* vcvt{d,q}q2ph, vcvt{,t}ph2dq */
> +                if ( s->evex.pfx && s->evex.pfx != vex_f2 )
> +                    s->fp16 = true;
> +                break;
> +
>              case 0x6e: /* vmovw r/m16, xmm */
>                  d = (d & ~SrcMask) | SrcMem16;
>                  /* fall through */
> @@ -1504,6 +1517,17 @@ int x86emul_decode(struct x86_emulate_st
>                  s->simd_size = simd_none;
>                  break;
>  
> +            case 0x78: case 0x79: /* vcvt{,t}ph2u{d,q}q, vcvt{,t}sh2usi */
> +                if ( s->evex.pfx != vex_f2 )
> +                    s->fp16 = true;
> +                break;
> +
> +            case 0x7a: /* vcvttph2qq, vcvtu{d,q}q2ph */
> +            case 0x7b: /* vcvtph2qq, vcvtusi2sh */
> +                if ( s->evex.pfx == vex_66 )
> +                    s->fp16 = true;
> +                break;
> +
>              case 0x7c: /* vcvttph2{,u}w */
>              case 0x7d: /* vcvtph2{,u}w / vcvt{,u}w2ph */
>                  d = DstReg | SrcMem | TwoOp;
> @@ -1515,10 +1539,34 @@ int x86emul_decode(struct x86_emulate_st
>  
>              switch ( b )
>              {
> +            case 0x78:
> +            case 0x79:
> +                /* vcvt{,t}ph2u{d,q}q need special casing */
> +                if ( s->evex.pfx <= vex_66 )
> +                {
> +                    if ( !s->evex.brs )
> +                        disp8scale -= 1 + (s->evex.pfx == vex_66);
> +                    break;
> +                }
> +                /* vcvt{,t}sh2usi needs special casing: fall through */
> +            case 0x2c: case 0x2d: /* vcvt{,t}sh2si need special casing */
> +                disp8scale = 1;
> +                break;
> +
>              case 0x5a: /* vcvtph2pd needs special casing */
>                  if ( !s->evex.pfx && !s->evex.brs )
>                      disp8scale -= 2;
>                  break;
> +
> +            case 0x5b: /* vcvt{,t}ph2dq need special casing */
> +                if ( s->evex.pfx && !s->evex.brs )
> +                    --disp8scale;
> +                break;
> +
> +            case 0x7a: case 0x7b: /* vcvt{,t}ph2qq need special casing */
> +                if ( s->evex.pfx == vex_66 && !s->evex.brs )
> +                    disp8scale = s->evex.brs ? 1 : 2 + s->evex.lr;
> +                break;
>              }
>  
>              break;

Perhaps here, in terms of overriding...

~Andrew
Jan Beulich Aug. 11, 2022, 6:37 a.m. UTC | #2
On 10.08.2022 21:09, Andrew Cooper wrote:
> On 15/06/2022 11:31, Jan Beulich wrote:
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/x86_emulate/decode.c
>> +++ b/xen/arch/x86/x86_emulate/decode.c
>> @@ -1489,12 +1489,25 @@ int x86emul_decode(struct x86_emulate_st
>>                      s->simd_size = simd_scalar_vexw;
>>                  break;
>>  
>> +            case 0x2a: /* vcvtsi2sh */
>> +                break;
>> +
>> +            case 0x2c: case 0x2d: /* vcvt{,t}sh2si */
>> +                if ( s->evex.pfx == vex_f3 )
>> +                    s->fp16 = true;
>> +                break;
>> +
>>              case 0x2e: case 0x2f: /* v{,u}comish */
>>                  if ( !s->evex.pfx )
>>                      s->fp16 = true;
>>                  s->simd_size = simd_none;
>>                  break;
>>  
>> +            case 0x5b: /* vcvt{d,q}q2ph, vcvt{,t}ph2dq */
>> +                if ( s->evex.pfx && s->evex.pfx != vex_f2 )
>> +                    s->fp16 = true;
>> +                break;
>> +
>>              case 0x6e: /* vmovw r/m16, xmm */
>>                  d = (d & ~SrcMask) | SrcMem16;
>>                  /* fall through */
>> @@ -1504,6 +1517,17 @@ int x86emul_decode(struct x86_emulate_st
>>                  s->simd_size = simd_none;
>>                  break;
>>  
>> +            case 0x78: case 0x79: /* vcvt{,t}ph2u{d,q}q, vcvt{,t}sh2usi */
>> +                if ( s->evex.pfx != vex_f2 )
>> +                    s->fp16 = true;
>> +                break;
>> +
>> +            case 0x7a: /* vcvttph2qq, vcvtu{d,q}q2ph */
>> +            case 0x7b: /* vcvtph2qq, vcvtusi2sh */
>> +                if ( s->evex.pfx == vex_66 )
>> +                    s->fp16 = true;
>> +                break;
>> +
>>              case 0x7c: /* vcvttph2{,u}w */
>>              case 0x7d: /* vcvtph2{,u}w / vcvt{,u}w2ph */
>>                  d = DstReg | SrcMem | TwoOp;
>> @@ -1515,10 +1539,34 @@ int x86emul_decode(struct x86_emulate_st
>>  
>>              switch ( b )
>>              {
>> +            case 0x78:
>> +            case 0x79:
>> +                /* vcvt{,t}ph2u{d,q}q need special casing */
>> +                if ( s->evex.pfx <= vex_66 )
>> +                {
>> +                    if ( !s->evex.brs )
>> +                        disp8scale -= 1 + (s->evex.pfx == vex_66);
>> +                    break;
>> +                }
>> +                /* vcvt{,t}sh2usi needs special casing: fall through */
>> +            case 0x2c: case 0x2d: /* vcvt{,t}sh2si need special casing */
>> +                disp8scale = 1;
>> +                break;
>> +
>>              case 0x5a: /* vcvtph2pd needs special casing */
>>                  if ( !s->evex.pfx && !s->evex.brs )
>>                      disp8scale -= 2;
>>                  break;
>> +
>> +            case 0x5b: /* vcvt{,t}ph2dq need special casing */
>> +                if ( s->evex.pfx && !s->evex.brs )
>> +                    --disp8scale;
>> +                break;
>> +
>> +            case 0x7a: case 0x7b: /* vcvt{,t}ph2qq need special casing */
>> +                if ( s->evex.pfx == vex_66 && !s->evex.brs )
>> +                    disp8scale = s->evex.brs ? 1 : 2 + s->evex.lr;
>> +                break;
>>              }
>>  
>>              break;
> 
> Perhaps here, in terms of overriding...

From all I can tell, the s->fp16 setting would better remain here, as
it's (embedded) prefix dependent. As are the disp8scale adjustments (if
you go look, you'll find similar pre-existing code a little higher up
in the file.)

Jan
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -612,18 +612,36 @@  static const struct test avx512_fp16_all
     INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
     INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
     INSN(comish,          , map5, 2f,    el, fp16, el),
+    INSN(cvtdq2ph,        , map5, 5b,    vl,    d, vl),
     INSN(cvtpd2ph,      66, map5, 5a,    vl,    q, vl),
+    INSN(cvtph2dq,      66, map5, 5b,  vl_2, fp16, vl),
     INSN(cvtph2pd,        , map5, 5a,  vl_4, fp16, vl),
     INSN(cvtph2psx,     66, map6, 13,  vl_2, fp16, vl),
+    INSN(cvtph2qq,      66, map5, 7b,  vl_4, fp16, vl),
+    INSN(cvtph2udq,       , map5, 79,  vl_2, fp16, vl),
+    INSN(cvtph2uqq,     66, map5, 79,  vl_4, fp16, vl),
     INSN(cvtph2uw,        , map5, 7d,    vl, fp16, vl),
     INSN(cvtph2w,       66, map5, 7d,    vl, fp16, vl),
     INSN(cvtps2phx,     66, map5, 1d,    vl,    d, vl),
+    INSN(cvtqq2ph,        , map5, 5b,    vl,    q, vl),
     INSN(cvtsd2sh,      f2, map5, 5a,    el,    q, el),
     INSN(cvtsh2sd,      f3, map5, 5a,    el, fp16, el),
+    INSN(cvtsh2si,      f3, map5, 2d,    el, fp16, el),
     INSN(cvtsh2ss,        , map6, 13,    el, fp16, el),
+    INSN(cvtsh2usi,     f3, map5, 79,    el, fp16, el),
+    INSN(cvtsi2sh,      f3, map5, 2a,    el, dq64, el),
     INSN(cvtss2sh,        , map5, 1d,    el,    d, el),
+    INSN(cvttph2dq,     f3, map5, 5b,  vl_2, fp16, vl),
+    INSN(cvttph2qq,     66, map5, 7a,  vl_4, fp16, vl),
+    INSN(cvttph2udq,      , map5, 78,  vl_2, fp16, vl),
+    INSN(cvttph2uqq,    66, map5, 78,  vl_4, fp16, vl),
     INSN(cvttph2uw,       , map5, 7c,    vl, fp16, vl),
     INSN(cvttph2w,      66, map5, 7c,    vl, fp16, vl),
+    INSN(cvttsh2si,     f3, map5, 2c,    el, fp16, el),
+    INSN(cvttsh2usi,    f3, map5, 78,    el, fp16, el),
+    INSN(cvtudq2ph,     f2, map5, 7a,    vl,    d, vl),
+    INSN(cvtuqq2ph,     f2, map5, 7a,    vl,    q, vl),
+    INSN(cvtusi2sh,     f3, map5, 7b,    el, dq64, el),
     INSN(cvtuw2ph,      f2, map5, 7d,    vl, fp16, vl),
     INSN(cvtw2ph,       f3, map5, 7d,    vl, fp16, vl),
     INSN(divph,           , map5, 5e,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2033,6 +2033,9 @@  static const struct evex {
     { { 0x11 }, 2, T, W, pfx_f3, W0, LIG }, /* vmovsh */
     { { 0x1d }, 2, T, R, pfx_66, W0, Ln }, /* vcvtps2phx */
     { { 0x1d }, 2, T, R, pfx_no, W0, LIG }, /* vcvtss2sh */
+    { { 0x2a }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsi2sh */
+    { { 0x2c }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvttsh2si */
+    { { 0x2d }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsh2si */
     { { 0x2e }, 2, T, R, pfx_no, W0, LIG }, /* vucomish */
     { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */
     { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */
@@ -2045,6 +2048,10 @@  static const struct evex {
     { { 0x5a }, 2, T, R, pfx_66, W1, Ln }, /* vcvtpd2ph */
     { { 0x5a }, 2, T, R, pfx_f3, W0, LIG }, /* vcvtsh2sd */
     { { 0x5a }, 2, T, R, pfx_f2, W1, LIG }, /* vcvtsd2sh */
+    { { 0x5b }, 2, T, R, pfx_no, W0, Ln }, /* vcvtdq2ph */
+    { { 0x5b }, 2, T, R, pfx_no, W1, Ln }, /* vcvtqq2ph */
+    { { 0x5b }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2dq */
+    { { 0x5b }, 2, T, R, pfx_f3, W0, Ln }, /* vcvttph2dq */
     { { 0x5c }, 2, T, R, pfx_no, W0, Ln }, /* vsubph */
     { { 0x5c }, 2, T, R, pfx_f3, W0, LIG }, /* vsubsh */
     { { 0x5d }, 2, T, R, pfx_no, W0, Ln }, /* vminph */
@@ -2054,6 +2061,17 @@  static const struct evex {
     { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */
     { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
     { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
+    { { 0x78 }, 2, T, R, pfx_no, W0, Ln }, /* vcvttph2udq */
+    { { 0x78 }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2uqq */
+    { { 0x78 }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvttsh2usi */
+    { { 0x79 }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2udq */
+    { { 0x79 }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2uqq */
+    { { 0x79 }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsh2usi */
+    { { 0x7a }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2qq */
+    { { 0x7a }, 2, T, R, pfx_f2, W0, Ln }, /* vcvtudq2ph */
+    { { 0x7a }, 2, T, R, pfx_f2, W1, Ln }, /* vcvtuqq2ph */
+    { { 0x7b }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2qq */
+    { { 0x7b }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtusi2sh */
     { { 0x7c }, 2, T, R, pfx_no, W0, Ln }, /* vcvttph2uw */
     { { 0x7c }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2w */
     { { 0x7d }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2uw */
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -1489,12 +1489,25 @@  int x86emul_decode(struct x86_emulate_st
                     s->simd_size = simd_scalar_vexw;
                 break;
 
+            case 0x2a: /* vcvtsi2sh */
+                break;
+
+            case 0x2c: case 0x2d: /* vcvt{,t}sh2si */
+                if ( s->evex.pfx == vex_f3 )
+                    s->fp16 = true;
+                break;
+
             case 0x2e: case 0x2f: /* v{,u}comish */
                 if ( !s->evex.pfx )
                     s->fp16 = true;
                 s->simd_size = simd_none;
                 break;
 
+            case 0x5b: /* vcvt{d,q}q2ph, vcvt{,t}ph2dq */
+                if ( s->evex.pfx && s->evex.pfx != vex_f2 )
+                    s->fp16 = true;
+                break;
+
             case 0x6e: /* vmovw r/m16, xmm */
                 d = (d & ~SrcMask) | SrcMem16;
                 /* fall through */
@@ -1504,6 +1517,17 @@  int x86emul_decode(struct x86_emulate_st
                 s->simd_size = simd_none;
                 break;
 
+            case 0x78: case 0x79: /* vcvt{,t}ph2u{d,q}q, vcvt{,t}sh2usi */
+                if ( s->evex.pfx != vex_f2 )
+                    s->fp16 = true;
+                break;
+
+            case 0x7a: /* vcvttph2qq, vcvtu{d,q}q2ph */
+            case 0x7b: /* vcvtph2qq, vcvtusi2sh */
+                if ( s->evex.pfx == vex_66 )
+                    s->fp16 = true;
+                break;
+
             case 0x7c: /* vcvttph2{,u}w */
             case 0x7d: /* vcvtph2{,u}w / vcvt{,u}w2ph */
                 d = DstReg | SrcMem | TwoOp;
@@ -1515,10 +1539,34 @@  int x86emul_decode(struct x86_emulate_st
 
             switch ( b )
             {
+            case 0x78:
+            case 0x79:
+                /* vcvt{,t}ph2u{d,q}q need special casing */
+                if ( s->evex.pfx <= vex_66 )
+                {
+                    if ( !s->evex.brs )
+                        disp8scale -= 1 + (s->evex.pfx == vex_66);
+                    break;
+                }
+                /* vcvt{,t}sh2usi needs special casing: fall through */
+            case 0x2c: case 0x2d: /* vcvt{,t}sh2si need special casing */
+                disp8scale = 1;
+                break;
+
             case 0x5a: /* vcvtph2pd needs special casing */
                 if ( !s->evex.pfx && !s->evex.brs )
                     disp8scale -= 2;
                 break;
+
+            case 0x5b: /* vcvt{,t}ph2dq need special casing */
+                if ( s->evex.pfx && !s->evex.brs )
+                    --disp8scale;
+                break;
+
+            case 0x7a: case 0x7b: /* vcvt{,t}ph2qq need special casing */
+                if ( s->evex.pfx == vex_66 && !s->evex.brs )
+                    disp8scale = s->evex.brs ? 1 : 2 + s->evex.lr;
+                break;
             }
 
             break;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3581,6 +3581,12 @@  x86_emulate(
         state->simd_size = simd_none;
         goto simd_0f_rm;
 
+#ifndef X86EMUL_NO_SIMD
+
+    case X86EMUL_OPC_EVEX_F3(5, 0x2a):      /* vcvtsi2sh r/m,xmm,xmm */
+    case X86EMUL_OPC_EVEX_F3(5, 0x7b):      /* vcvtusi2sh r/m,xmm,xmm */
+        host_and_vcpu_must_have(avx512_fp16);
+        /* fall through */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2a): /* vcvtsi2s{s,d} r/m,xmm,xmm */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x7b): /* vcvtusi2s{s,d} r/m,xmm,xmm */
         generate_exception_if(evex.opmsk || (ea.type != OP_REG && evex.brs),
@@ -3659,7 +3665,9 @@  x86_emulate(
             opc[1] = 0x01;
 
             rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
-                           vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4, ctxt);
+                           vex.pfx & VEX_PREFIX_DOUBLE_MASK
+                           ? 8 : 2 << !state->fp16,
+                           ctxt);
             if ( rc != X86EMUL_OKAY )
                 goto done;
         }
@@ -3689,6 +3697,12 @@  x86_emulate(
         state->simd_size = simd_none;
         break;
 
+    case X86EMUL_OPC_EVEX_F3(5, 0x2c):      /* vcvttsh2si xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x2d):      /* vcvtsh2si xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x78):      /* vcvttsh2usi xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x79):      /* vcvtsh2usi xmm/mem,reg */
+        host_and_vcpu_must_have(avx512_fp16);
+        /* fall through */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */
@@ -3760,8 +3774,6 @@  x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
-#ifndef X86EMUL_NO_SIMD
-
     case X86EMUL_OPC_EVEX(5, 0x2e): /* vucomish xmm/m16,xmm */
     case X86EMUL_OPC_EVEX(5, 0x2f): /* vcomish xmm/m16,xmm */
         host_and_vcpu_must_have(avx512_fp16);
@@ -7789,6 +7801,38 @@  x86_emulate(
                          2 * evex.w);
         goto avx512f_all_fp;
 
+    case X86EMUL_OPC_EVEX   (5, 0x5b): /* vcvtdq2ph [xyz]mm/mem,[xy]mm{k} */
+                                       /* vcvtqq2ph [xyz]mm/mem,xmm{k} */
+    case X86EMUL_OPC_EVEX_F2(5, 0x7a): /* vcvtudq2ph [xyz]mm/mem,[xy]mm{k} */
+                                       /* vcvtuqq2ph [xyz]mm/mem,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(5, 0x5b): /* vcvtph2dq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(5, 0x5b): /* vcvttph2dq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX   (5, 0x78): /* vcvttph2udq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX   (5, 0x79): /* vcvtph2udq [xy]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 8 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(5, 0x78): /* vcvttph2uqq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x79): /* vcvtph2uqq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7a): /* vcvttph2qq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7b): /* vcvtph2qq xmm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 4 << (evex.w + evex.lr);
+        goto simd_zmm;
+
     case X86EMUL_OPC_EVEX   (5, 0x7c): /* vcvttph2uw [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(5, 0x7c): /* vcvttph2w [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX   (5, 0x7d): /* vcvtph2uw [xyz]mm/mem,[xyz]mm{k} */