
[20/37] target/i386: reimplement 0x0f 0x60-0x6f, add AVX

Message ID 20220911230418.340941-21-pbonzini@redhat.com (mailing list archive)
State New, archived
Series target/i386: new decoder + AVX implementation

Commit Message

Paolo Bonzini Sept. 11, 2022, 11:04 p.m. UTC
These are both MMX and SSE/AVX instructions, except for vmovdqu.  In both
cases the inputs and the output are in s->ptr{0,1,2}, so the only difference
between MMX, SSE, and AVX is which helper to call.

PCMPGT, MOVD and MOVQ are implemented using gvec.

The amount of macro magic for generating functions is kept to a minimum.
In particular, the gvec cases are easy enough and have no duplication within
each function, so they are spelled out one by one.
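
For illustration, the dispatch boils down to a prefix/VEX.L check around the
same three-pointer helper call (simplified from gen_binary_int_sse() below):

    if (!(s->prefix & PREFIX_DATA)) {
        mmx(cpu_env, s->ptr0, s->ptr1, s->ptr2);   /* no 66 prefix: MMX form */
    } else if (!s->vex_l) {
        xmm(cpu_env, s->ptr0, s->ptr1, s->ptr2);   /* 66 prefix or VEX.128 */
    } else {
        ymm(cpu_env, s->ptr0, s->ptr1, s->ptr2);   /* VEX.256 */
    }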

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.c.inc |  35 ++++++++
 target/i386/tcg/emit.c.inc       | 148 +++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      |   3 +-
 3 files changed, 185 insertions(+), 1 deletion(-)

Comments

Richard Henderson Sept. 12, 2022, 11:41 a.m. UTC | #1
On 9/12/22 00:04, Paolo Bonzini wrote:
> +/*
> + * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
> + * 66 = vp* Vx, Hx, Wx
> + *
> + * These are really the same encoding, because 1) V is the same as P when VEX.V
> + * is not present 2) P and Q are the same as H and W apart from MM/XMM
> + */
> +static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
> +                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)

No need to inline.

> +{
> +    assert (!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
> +
> +    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
> +        /* VEX encoding is not applicable to MMX instructions.  */
> +        gen_illegal_opcode(s);
> +        return;
> +    }
> +    if (!(s->prefix & PREFIX_DATA)) {
> +        mmx(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> +    } else if (!s->vex_l) {
> +        xmm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> +    } else {
> +        ymm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> +    }

And a reminder from earlier patches that generating the pointers here would be better, as 
well as zeroing the high ymm bits for vex xmm insns.
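
For the xmm case that could be, for example, an explicit zero of the upper half
of the destination after the 128-bit operation (only a sketch, reusing the
destination offset as elsewhere in this series):

    /* clear bytes 16..31 of the ymm destination */
    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset + 16, 16, 16, 0);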

> +static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    MemOp ot = decode->op[2].ot;
> +    int vec_len = sse_vec_len(s, decode);
> +    int lo_ofs = decode->op[0].offset
> +        - xmm_offset(decode->op[0].ot)
> +        + xmm_offset(ot);
> +
> +    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
> +
> +    switch (ot) {
> +    case MO_32:
> +#ifdef TARGET_X86_64
> +        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
> +        tcg_gen_st_i32(s->tmp3_i32, cpu_env, lo_ofs);
> +        break;

Use tcg_gen_st32_tl and omit the trunc.
Alternatively, zero extend in T1 and fall through...
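
i.e. something along the lines of (sketch only):

      case MO_32:
          tcg_gen_st32_tl(s->T1, cpu_env, lo_ofs);
          break;

which stores the low 32 bits of T1 directly, without the detour through tmp3_i32.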

> +    case MO_64:
> +#endif
> +        tcg_gen_st_tl(s->T1, cpu_env, lo_ofs);

This could also be

     tcg_gen_gvec_dup_i64(MO_64, offset, 8, sse_vec_max_len, s->T1);

to do the store and clear in one call.



r~
Richard Henderson Sept. 12, 2022, 1:01 p.m. UTC | #2
On 9/12/22 00:04, Paolo Bonzini wrote:
> +static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
> +{
> +    if (s->prefix & PREFIX_REPNZ) {
> +        entry->gen = NULL;

Are these lines really required with the p_00_66_f3 spec on the group entry?

> +    } else if (s->prefix & PREFIX_REPZ) {
> +        /* movdqu */
> +        entry->gen = gen_MOVDQ;
> +        entry->vex_class = 4;
> +        entry->vex_special = X86_VEX_SSEUnaligned;
> +    } else {
> +        /* MMX movq, movdqa */
> +        entry->gen = gen_MOVDQ;
> +        entry->vex_class = 1;
> +        entry->special = X86_SPECIAL_MMX;
> +    }

Also, you're overriding vex_class for both valid entries, so why does the group specify 
vex5?  Would it be clearer to use X86_OP_ENTRY3 within this function and copy from static 
const data instead of overriding individual fields?


r~
Paolo Bonzini Sept. 13, 2022, 10:56 a.m. UTC | #3
On Mon, Sep 12, 2022 at 1:41 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 9/12/22 00:04, Paolo Bonzini wrote:
> > +/*
> > + * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
> > + * 66 = vp* Vx, Hx, Wx
> > + *
> > + * These are really the same encoding, because 1) V is the same as P when VEX.V
> > + * is not present 2) P and Q are the same as H and W apart from MM/XMM
> > + */
> > +static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
> > +                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
>
> No need to inline.

Yes and no, the compiler should indeed be able to figure it out, but
both the assert() and the calls are meant to be optimized out by
inlining. So this kind of function would even be an always_inline
candidate.

> > +{
> > +    assert (!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
> > +
> > +    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
> > +        /* VEX encoding is not applicable to MMX instructions.  */
> > +        gen_illegal_opcode(s);
> > +        return;
> > +    }
> > +    if (!(s->prefix & PREFIX_DATA)) {
> > +        mmx(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> > +    } else if (!s->vex_l) {
> > +        xmm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> > +    } else {
> > +        ymm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
> > +    }
>
> And a reminder from earlier patches that generating the pointers here would be better, as
> well as zeroing the high ymm bits for vex xmm insns.

I'm not sure about that, because there are quite a few cases handled
by more complex gen_* functions, which are helper-based (so not simple
calls to gvec functions where you have maxsz/oprsz) and are not
handled by the common gen_*_sse. For example gen_CVTPI2Px,
gen_MASKMOV, gen_PSRLDQ_i, gen_SSE4a_I, gen_VCVTSI2Sx, ...  All of
these would have to add extra code to set the pointers and to clear
the high ymm bits.

For gen_load, however, I can delay the generation using something like

static inline TCGv_ptr get_ptr0(DisasContext *s)
{
    if (s->ptr0) {
        return s->ptr0;
    }
    s->ptr0 = tcg_temp_new_ptr();
    tcg_gen_add(s->ptr0, cpu_env, ...);
    return s->ptr0;
}
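
(get_ptr1()/get_ptr2() would follow the same pattern, so e.g. gen_binary_int_sse
would call mmx(cpu_env, get_ptr0(s), get_ptr1(s), get_ptr2(s)) and the pointers
are only materialized when a helper actually needs them.)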

Most of the changes to this series are mechanical, so if you dislike
relying on DCE then why not.

As for gen_writeback, keeping it eliminates duplicated code
and keeps the phases of disas_insn_new separated, so I prefer it
slightly. For now I'd rather leave it as is; with the above get_ptr0()
function that creates s->ptr0 lazily, perhaps gen_writeback() could do
it only if s->ptr0 is set (suggesting that a helper was used), while
gvec helpers would use the oprsz<maxsz feature. There's something to
be said for keeping the initial implementation simple of course,
especially since it's already slightly better than the code produced
by the existing decoder.

> > +    switch (ot) {
> > +    case MO_32:
> > +#ifdef TARGET_X86_64
> > +        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
> > +        tcg_gen_st_i32(s->tmp3_i32, cpu_env, lo_ofs);
> > +        break;
>
> This could also be
>
>      tcg_gen_gvec_dup_i64(MO_64, offset, 8, sse_vec_max_len, s->T1);

Yeah, it can be something like

    case MO_32:
        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
        tcg_gen_gvec_dup_i32(MO_32, decode->op[0].offset, 4, vec_len,
s->tmp3_i32);
        break;
#ifdef TARGET_X86_64
    case MO_64:
        tcg_gen_gvec_dup_i64(MO_64, decode->op[0].offset, 8, vec_len, s->T1);
        break;
#endif

and in this case of course it's not possible to use st32_tl.

Paolo
Richard Henderson Sept. 13, 2022, 11:35 a.m. UTC | #4
On 9/13/22 11:56, Paolo Bonzini wrote:
> On Mon, Sep 12, 2022 at 1:41 PM Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> On 9/12/22 00:04, Paolo Bonzini wrote:
>>> +/*
>>> + * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
>>> + * 66 = vp* Vx, Hx, Wx
>>> + *
>>> + * These are really the same encoding, because 1) V is the same as P when VEX.V
>>> + * is not present 2) P and Q are the same as H and W apart from MM/XMM
>>> + */
>>> +static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
>>> +                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
>>
>> No need to inline.
> 
> Yes and no, the compiler should indeed be able to figure it out, but
> both the assert() and the calls are meant to be optimized out by
> inlining. So this kind of function would even be an always_inline
> candidate.

Yes, I get that; I just prefer by default to allow the compiler to figure it out. 
Obviously there are parts of the code base where we use always_inline and more, but this 
part is never going to be performance critical.

Over-use of inline generally leads to -Werror failures from clang, for the unused-function case.

> I'm not sure about that, because there are quite a few cases handled
> by more complex gen_* functions, which are helper-based (so not simple
> calls to gvec functions where you have maxsz/oprsz) and are not
> handled by the common gen_*_sse. For example gen_CVTPI2Px,
> gen_MASKMOV, gen_PSRLDQ_i, gen_SSE4a_I, gen_VCVTSI2Sx, ...  All of
> these would have to add extra code to set the pointers and to clear
> the high ymm bits.

Fair.

> For gen_load, however, i can delay the generation using something like
> 
> static inline TCGv_ptr get_ptr0(DisasContext *s)
> {
>      if (s->ptr0) {
>          return s->ptr0;
>      }
>      s->ptr0 = tcg_temp_new_ptr();
>      tcg_gen_add(s->ptr0, cpu_env, ...);
>      return s->ptr0;
> }

Sure.

> As for gen_writeback, keeping it eliminates duplicated code
> and keeps the phases of disas_insn_new separated, so I prefer it
> slightly. For now I'd rather leave it as is; with the above get_ptr0()
> function that creates s->ptr0 lazily, perhaps gen_writeback() could do
> it only if s->ptr0 is set (suggesting that a helper was used), while
> gvec helpers would use the oprsz<maxsz feature. There's something to
> be said for keeping the initial implementation simple of course,
> especially since it's already slightly better than the code produced
> by the existing decoder.

Also fair.  Let's ignore the max argument for now, and address it in a subsequent phase, 
where we also convert more operations to gvec.

>> This could also be
>>
>>       tcg_gen_gvec_dup_i64(MO_64, offset, 8, sse_vec_max_len, s->T1);
> 
> Yeah, it can be something like
> 
>      case MO_32:
>          tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
>          tcg_gen_gvec_dup_i32(MO_32, decode->op[0].offset, 4, vec_len,
> s->tmp3_i32);
>          break;


Actually, this doesn't work, because the minimum vector size is 8.
This will hit the assert in check_size_align().

I've just realized that we can't just extend i32 to i64, as I was suggesting, because that 
will fall foul of big-endian hosts (L(0) is at the top half of Q(0), so the 32-bit value 
would end up in the wrong half of the quadword).  So best to keep your zero + store.


r~

Patch

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index b31daecb90..f20587c096 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -142,6 +142,23 @@  static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
     entry->gen = group17_gen[op];
 }
 
+static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        entry->gen = NULL;
+    } else if (s->prefix & PREFIX_REPZ) {
+        /* movdqu */
+        entry->gen = gen_MOVDQ;
+        entry->vex_class = 4;
+        entry->vex_special = X86_VEX_SSEUnaligned;
+    } else {
+        /* MMX movq, movdqa */
+        entry->gen = gen_MOVDQ;
+        entry->vex_class = 1;
+        entry->special = X86_SPECIAL_MMX;
+    }
+}
+
 static const X86OpEntry opcodes_0F38_00toEF[240] = {
 };
 
@@ -227,8 +244,26 @@  static void decode_0F3A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
 }
 
 static const X86OpEntry opcodes_0F[256] = {
+    [0x60] = X86_OP_ENTRY3(PUNPCKLBW,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x61] = X86_OP_ENTRY3(PUNPCKLWD,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x62] = X86_OP_ENTRY3(PUNPCKLDQ,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x63] = X86_OP_ENTRY3(PACKSSWB,   V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x64] = X86_OP_ENTRY3(PCMPGTB,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x65] = X86_OP_ENTRY3(PCMPGTW,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x66] = X86_OP_ENTRY3(PCMPGTD,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x67] = X86_OP_ENTRY3(PACKUSWB,   V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+
     [0x38] = X86_OP_GROUP0(0F38),
     [0x3a] = X86_OP_GROUP0(0F3A),
+
+    [0x68] = X86_OP_ENTRY3(PUNPCKHBW,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x69] = X86_OP_ENTRY3(PUNPCKHWD,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x6a] = X86_OP_ENTRY3(PUNPCKHDQ,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x6b] = X86_OP_ENTRY3(PACKSSDW,   V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x6c] = X86_OP_ENTRY3(PUNPCKLQDQ, V,x, H,x, W,x,  vex4 p_66 avx2_256),
+    [0x6d] = X86_OP_ENTRY3(PUNPCKHQDQ, V,x, H,x, W,x,  vex4 p_66 avx2_256),
+    [0x6e] = X86_OP_ENTRY3(MOVD_to,    V,x, None,None, E,y, vex5 mmx p_00_66),  /* wrong dest Vy on SDM! */
+    [0x6f] = X86_OP_GROUP3(0F6F,       V,x, None,None, W,x, vex5 mmx p_00_66_f3),
 };
 
 static void do_decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 36b963a0d3..3f89d3cf50 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -212,6 +212,97 @@  static void gen_writeback(DisasContext *s, X86DecodedOp *op)
     }
 }
 
+static inline int sse_vec_len(DisasContext *s, X86DecodedInsn *decode)
+{
+    if (decode->e.special == X86_SPECIAL_MMX &&
+        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
+        return 8;
+    }
+    return s->vex_l ? 32 : 16;
+}
+
+static void gen_store_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, int src_ofs)
+{
+    MemOp ot = decode->op[0].ot;
+    int vec_len = sse_vec_len(s, decode);
+
+    if (!decode->op[0].has_ea) {
+        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len);
+        return;
+    }
+
+    switch (ot) {
+    case MO_64:
+        gen_stq_env_A0(s, src_ofs);
+        break;
+    case MO_128:
+        gen_sto_env_A0(s, src_ofs);
+        break;
+    case MO_256:
+        gen_sty_env_A0(s, src_ofs);
+        break;
+    default:
+        abort();
+    }
+}
+
+/*
+ * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
+ * 66 = vp* Vx, Hx, Wx
+ *
+ * These are really the same encoding, because 1) V is the same as P when VEX.V
+ * is not present 2) P and Q are the same as H and W apart from MM/XMM
+ */
+static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
+{
+    assert (!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
+
+    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
+        /* VEX encoding is not applicable to MMX instructions.  */
+        gen_illegal_opcode(s);
+        return;
+    }
+    if (!(s->prefix & PREFIX_DATA)) {
+        mmx(cpu_env, s->ptr0, s->ptr1, s->ptr2);
+    } else if (!s->vex_l) {
+        xmm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
+    } else {
+        ymm(cpu_env, s->ptr0, s->ptr1, s->ptr2);
+    }
+}
+
+#define BINARY_INT_MMX(uname, lname)                                               \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_binary_int_sse(s, env, decode,                                             \
+                          gen_helper_##lname##_mmx,                                \
+                          gen_helper_##lname##_xmm,                                \
+                          gen_helper_##lname##_ymm);                               \
+}
+BINARY_INT_MMX(PUNPCKLBW,  punpcklbw)
+BINARY_INT_MMX(PUNPCKLWD,  punpcklwd)
+BINARY_INT_MMX(PUNPCKLDQ,  punpckldq)
+BINARY_INT_MMX(PACKSSWB,   packsswb)
+BINARY_INT_MMX(PACKUSWB,   packuswb)
+BINARY_INT_MMX(PUNPCKHBW,  punpckhbw)
+BINARY_INT_MMX(PUNPCKHWD,  punpckhwd)
+BINARY_INT_MMX(PUNPCKHDQ,  punpckhdq)
+BINARY_INT_MMX(PACKSSDW,   packssdw)
+
+/* Instructions with no MMX equivalent.  */
+#define BINARY_INT_SSE(uname, lname)                                               \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_binary_int_sse(s, env, decode,                                             \
+                          NULL,                                                    \
+                          gen_helper_##lname##_xmm,                                \
+                          gen_helper_##lname##_ymm);                               \
+}
+
+BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
+BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
+
 static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
 {
     TCGv carry_in = NULL;
@@ -382,6 +473,36 @@  static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     }
 }
 
+static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    int vec_len = sse_vec_len(s, decode);
+    int lo_ofs = decode->op[0].offset
+        - xmm_offset(decode->op[0].ot)
+        + xmm_offset(ot);
+
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+
+    switch (ot) {
+    case MO_32:
+#ifdef TARGET_X86_64
+        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
+        tcg_gen_st_i32(s->tmp3_i32, cpu_env, lo_ofs);
+        break;
+    case MO_64:
+#endif
+        tcg_gen_st_tl(s->T1, cpu_env, lo_ofs);
+        break;
+    default:
+        abort();
+    }
+}
+
+static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_store_sse(s, env, decode, decode->op[2].offset);
+}
+
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -405,6 +526,33 @@  static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 
 }
 
+static void gen_PCMPGTB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_GT, MO_8,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
+static void gen_PCMPGTW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_GT, MO_16,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
+static void gen_PCMPGTD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_GT, MO_32,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
 static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e147a95c5f..cf18e12d38 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -23,6 +23,7 @@ 
 #include "disas/disas.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "exec/cpu_ldst.h"
 #include "exec/translator.h"
 
@@ -4665,7 +4666,7 @@  static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
 #ifndef CONFIG_USER_ONLY
         use_new &= b <= limit;
 #endif
-        if (use_new && 0) {
+        if (use_new && (b >= 0x160 && b <= 0x16f)) {
             return disas_insn_new(s, cpu, b + 0x100);
         }
         break;