Message ID | 20220426125028.18844-3-lucas.araujo@eldorado.org.br (mailing list archive)
---|---
State | New, archived
Series | VSX MMA Implementation
On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
> +%xx_at          23:3 !function=times_4
> +@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb

Hmm.  Depends, I suppose, on whether you want acc[0-7] or vsr[0-28].

> +/*
> + * Packed VSX Integer GER Flags
> + * 00 - no accumulation no saturation
> + * 01 - accumulate but no saturation
> + * 10 - no accumulation but with saturation
> + * 11 - accumulate with saturation
> + */
> +static inline bool get_sat(uint32_t flags)
> +{
> +    return flags & 0x2;
> +}
> +
> +static inline bool get_acc(uint32_t flags)
> +{
> +    return flags & 0x1;
> +}

Better to have separate helpers for these?  They'd be immediate operands to
the function replacing XVIGER (see below) and thus optimize well.

> +#define GET_VsrN(a, i)   (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
> +#define GET_VsrB(a, i)   a->VsrB(i)
> +#define GET_VsrH(a, i)   a->VsrH(i)
> +
> +#define GET_VsrSN(a, i)  (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
> +#define GET_VsrSB(a, i)  a->VsrSB(i)
> +#define GET_VsrSH(a, i)  a->VsrSH(i)

These can be made into functions of the form

    typedef int32_t xviger_extract(ppc_vsr_t *a, int i);

> +#define XVIGER(NAME, RANK, EL)                                            \
> +    void NAME(CPUPPCState *env, uint32_t a_r, uint32_t b_r,               \
> +              uint32_t at_r, uint32_t mask, uint32_t packed_flags)        \
> +    {                                                                     \
> +        ppc_vsr_t *a = cpu_vsr_ptr(env, a_r), *b = cpu_vsr_ptr(env, b_r), *at; \
> +        bool sat = get_sat(packed_flags), acc = get_acc(packed_flags);    \
> +        uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),     \
> +                ymsk = ger_get_ymsk(mask);                                \
> +        uint8_t pmsk_bit, xmsk_bit, ymsk_bit;                             \
> +        int64_t psum;                                                     \
> +        int32_t va, vb;                                                   \
> +        int i, j, k;                                                      \
> +        for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {      \
> +            at = cpu_vsr_ptr(env, at_r + i);                              \
> +            for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {  \
> +                if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {             \
> +                    psum = 0;                                             \
> +                    for (k = 0, pmsk_bit = 1 << (RANK - 1); k < RANK;     \
> +                         k++, pmsk_bit >>= 1) {                           \
> +                        if (pmsk_bit & pmsk) {                            \
> +                            va = (int32_t)GET_VsrS##EL(a, RANK * i + k);  \
> +                            vb = (int32_t) ((RANK == 4) ?                 \
> +                                    GET_Vsr##EL(b, RANK * j + k) :        \
> +                                    GET_VsrS##EL(b, RANK * j + k));       \
> +                            psum += va * vb;                              \
> +                        }                                                 \
> +                    }                                                     \
> +                    if (acc) {                                            \
> +                        psum += at->VsrSW(j);                             \
> +                    }                                                     \
> +                    if (sat && psum > INT32_MAX) {                        \
> +                        set_vscr_sat(env);                                \
> +                        at->VsrSW(j) = INT32_MAX;                         \
> +                    } else if (sat && psum < INT32_MIN) {                 \
> +                        set_vscr_sat(env);                                \
> +                        at->VsrSW(j) = INT32_MIN;                         \
> +                    } else {                                              \
> +                        at->VsrSW(j) = (int32_t) psum;                    \
> +                    }                                                     \
> +                } else {                                                  \
> +                    at->VsrSW(j) = 0;                                     \
> +                }                                                         \
> +            }                                                             \
> +        }                                                                 \
> +    }

... which means that this monster can be a function instead of a
non-debuggable macro.

> diff --git a/target/ppc/internal.h b/target/ppc/internal.h
> index 8094e0b033..a994d98238 100644
> --- a/target/ppc/internal.h
> +++ b/target/ppc/internal.h
> @@ -291,4 +291,32 @@ G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
>                                              uintptr_t retaddr);
>  #endif
>
> +/*
> + * Auxiliary functions to pack/unpack masks for GER instructions.
> + *
> + * Packed format:
> + * Bits 0-3: xmsk
> + * Bits 4-7: ymsk
> + * Bits 8-15: pmsk
> + */
> +static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
> +{
> +    return packed_masks & 0xF;
> +}
> +
> +static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
> +{
> +    return (packed_masks >> 4) & 0xF;
> +}
> +
> +static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
> +{
> +    return (packed_masks >> 8) & 0xFF;
> +}
> +
> +static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
> +{
> +    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
> +}

Use hw/registerfields.h.  C.f. PREDDESC in target/arm/internals.h.

> +static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
> +                       void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
> +                                      TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    uint32_t mask;
> +    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
> +    REQUIRE_VSX(ctx);
> +    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
> +        gen_invalid(ctx);
> +        return true;
> +    }
> +
> +    mask = 0xFFFFFFFF;
> +    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
> +           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
> +           tcg_constant_i32(op));
> +    return true;
> +}

Why are you passing register numbers instead of pointers, like everywhere else?


r~
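For reference, the hw/registerfields.h approach pointed at above replaces the
open-coded shifts and masks with named bit fields. A minimal sketch, assuming
QEMU's FIELD/FIELD_DP32/FIELD_EX32 macros; the GER_MSK name is illustrative,
not taken from the series:

#include "hw/registerfields.h"

/* Same layout as the patch's comment: xmsk in bits 0-3, ymsk in 4-7, pmsk in 8-15. */
FIELD(GER_MSK, XMSK, 0, 4)
FIELD(GER_MSK, YMSK, 4, 4)
FIELD(GER_MSK, PMSK, 8, 8)

static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
{
    int msk = FIELD_DP32(0, GER_MSK, XMSK, xmsk);
    msk = FIELD_DP32(msk, GER_MSK, YMSK, ymsk);
    return FIELD_DP32(msk, GER_MSK, PMSK, pmsk);
}

static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
{
    return FIELD_EX32(packed_masks, GER_MSK, PMSK);
}

The ger_get_xmsk/ger_get_ymsk accessors would follow the same FIELD_EX32
pattern, so the bit layout is stated once instead of being repeated in every
helper.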
On 26/04/2022 20:40, Richard Henderson wrote:
>
> On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
>> +%xx_at          23:3 !function=times_4
>> +@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb
>
> Hmm.  Depends, I suppose, on whether you want acc[0-7] or vsr[0-28].

I mostly used the VSR functions here, but since I'll change patch 1 to your
suggestion (which will require creating acc_full_offset), I'll make a few
changes to create some functions for the accumulator.

>> +/*
>> + * Packed VSX Integer GER Flags
>> + * 00 - no accumulation no saturation
>> + * 01 - accumulate but no saturation
>> + * 10 - no accumulation but with saturation
>> + * 11 - accumulate with saturation
>> + */
>> +static inline bool get_sat(uint32_t flags)
>> +{
>> +    return flags & 0x2;
>> +}
>> +
>> +static inline bool get_acc(uint32_t flags)
>> +{
>> +    return flags & 0x1;
>> +}
>
> Better to have separate helpers for these?  They'd be immediate
> operands to the function replacing XVIGER (see below) and thus
> optimize well.

Do you mean different functions, or a function that receives packed_flags
along with the callback functions?

>> +#define GET_VsrN(a, i)   (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
>> +#define GET_VsrB(a, i)   a->VsrB(i)
>> +#define GET_VsrH(a, i)   a->VsrH(i)
>> +
>> +#define GET_VsrSN(a, i)  (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
>> +#define GET_VsrSB(a, i)  a->VsrSB(i)
>> +#define GET_VsrSH(a, i)  a->VsrSH(i)
>
> These can be made into functions of the form
>
>     typedef int32_t xviger_extract(ppc_vsr_t *a, int i);

In this case it'd be necessary to receive 2 xviger_extract functions, since
XVI8GER4* multiplies one value as signed and the other as unsigned (the other
integer GER instructions treat both as signed).

An alternative would be to isolate the innermost loop into a different
function, like:

typedef int64_t do_ger(int32_t a, int32_t b, int32_t at, int32_t pmsk);

static int64_t ger_rank4(int32_t a, int32_t b, int32_t at, int32_t mask)
{
    int64_t psum = 0, i;
    for (i = 0; i < 4; i++, mask >>= 1) {
        if (mask & 1) {
            psum += (sextract32(a, i * 8, 8)) * (extract32(b, i * 8, 8));
        }
    }
    return psum;
}

That way we could avoid having 'rank' as a parameter. What do you think?

>> diff --git a/target/ppc/internal.h b/target/ppc/internal.h
>> index 8094e0b033..a994d98238 100644
>> --- a/target/ppc/internal.h
>> +++ b/target/ppc/internal.h
>> @@ -291,4 +291,32 @@ G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
>>                                              uintptr_t retaddr);
>>  #endif
>>
>> +/*
>> + * Auxiliary functions to pack/unpack masks for GER instructions.
>> + *
>> + * Packed format:
>> + * Bits 0-3: xmsk
>> + * Bits 4-7: ymsk
>> + * Bits 8-15: pmsk
>> + */
>> +static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
>> +{
>> +    return packed_masks & 0xF;
>> +}
>> +
>> +static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
>> +{
>> +    return (packed_masks >> 4) & 0xF;
>> +}
>> +
>> +static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
>> +{
>> +    return (packed_masks >> 8) & 0xFF;
>> +}
>> +
>> +static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
>> +{
>> +    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
>> +}
>
> Use hw/registerfields.h.  C.f. PREDDESC in target/arm/internals.h.

Ok, will do.

>> +static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
>> +                       void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
>> +                                      TCGv_i32, TCGv_i32, TCGv_i32))
>> +{
>> +    uint32_t mask;
>> +    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
>> +    REQUIRE_VSX(ctx);
>> +    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
>> +        gen_invalid(ctx);
>> +        return true;
>> +    }
>> +
>> +    mask = 0xFFFFFFFF;
>> +    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
>> +           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
>> +           tcg_constant_i32(op));
>> +    return true;
>> +}
>
> Why are you passing register numbers instead of pointers, like
> everywhere else?

Because here we are not working with only one register per register number:
the ACC uses 4, and XVF64GER* needs to use XA and XA+1. While the VSRs are an
array, so I could do ppc_vsr_ptr + 1, I thought it better not to access memory
I was not given a pointer to, so I passed XA and can request
cpu_vsr_ptr(env, xa) and cpu_vsr_ptr(env, xa + 1).

>
> r~
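For concreteness, the two-extractor variant mentioned at the top of this mail
could look roughly like the following; extract_sb and extract_ub are
illustrative names, not from the series:

typedef int32_t xviger_extract(ppc_vsr_t *a, int i);

static int32_t extract_sb(ppc_vsr_t *a, int i)
{
    return a->VsrSB(i);   /* XA operand: signed bytes */
}

static int32_t extract_ub(ppc_vsr_t *a, int i)
{
    return a->VsrB(i);    /* XB operand: unsigned bytes, as XVI8GER4* requires */
}

The common loop would then take one extractor per operand, e.g. it would be
called with (extract_sb, extract_ub) for XVI8GER4* and with the signed
halfword extractor twice for XVI16GER2*.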
On 4/27/22 13:24, Lucas Mateus Martins Araujo e Castro wrote:
> On 26/04/2022 20:40, Richard Henderson wrote:
>> On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
>>> +%xx_at          23:3 !function=times_4
>>> +@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb
>>
>> Hmm.  Depends, I suppose, on whether you want acc[0-7] or vsr[0-28].
> I mostly used the VSR functions here, but since I'll change patch 1 to your
> suggestion (which will require creating acc_full_offset), I'll make a few
> changes to create some functions for the accumulator.
>>
>>> +/*
>>> + * Packed VSX Integer GER Flags
>>> + * 00 - no accumulation no saturation
>>> + * 01 - accumulate but no saturation
>>> + * 10 - no accumulation but with saturation
>>> + * 11 - accumulate with saturation
>>> + */
>>> +static inline bool get_sat(uint32_t flags)
>>> +{
>>> +    return flags & 0x2;
>>> +}
>>> +
>>> +static inline bool get_acc(uint32_t flags)
>>> +{
>>> +    return flags & 0x1;
>>> +}
>>
>> Better to have separate helpers for these?  They'd be immediate operands
>> to the function replacing XVIGER (see below) and thus optimize well.
> Do you mean different functions, or a function that receives packed_flags
> along with the callback functions?

I mean separate helper entry points, which use a common function that receives
these as separate boolean arguments, along with the callbacks.  Use
QEMU_FLATTEN on the helper entry points to ensure that everything is inlined
and the constant args are optimized.

> In this case it'd be necessary to receive 2 xviger_extract functions, since
> XVI8GER4* multiplies one value as signed and the other as unsigned (the
> other integer GER instructions treat both as signed).

Certainly.

> An alternative would be to isolate the innermost loop into a different
> function, like:
>
> typedef int64_t do_ger(int32_t a, int32_t b, int32_t at, int32_t pmsk);
>
> static int64_t ger_rank4(int32_t a, int32_t b, int32_t at, int32_t mask)
> {
>     int64_t psum = 0, i;
>     for (i = 0; i < 4; i++, mask >>= 1) {
>         if (mask & 1) {
>             psum += (sextract32(a, i * 8, 8)) * (extract32(b, i * 8, 8));
>         }
>     }
>     return psum;
> }
>
> That way we could avoid having 'rank' as a parameter. What do you think?

Reasonable.  I certainly like extracting uint32_t from the vector generically
and not having to pass that on further.

>> Why are you passing register numbers instead of pointers, like everywhere
>> else?
> Because here we are not working with only one register per register number:
> the ACC uses 4, and XVF64GER* needs to use XA and XA+1. While the VSRs are
> an array, so I could do ppc_vsr_ptr + 1, I thought it better not to access
> memory I was not given a pointer to, so I passed XA and can request
> cpu_vsr_ptr(env, xa) and cpu_vsr_ptr(env, xa + 1).

I think using cpu_vsr_ptr is the mistake.  It might be clarifying to define a
ppc_acc_t, if only as a typedef of ppc_vsr_t.  The acc_full_offset function
will compute the offset for this pointer and, importantly, will be the place
to modify if and when the architecture changes to allow or require separate
storage for the ACC registers.


r~
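Putting those two points together, the shape being described is roughly the
sketch below. It keeps the patch's register-number interface for brevity,
assumes a ger_rank2 analogous to the ger_rank4 above, simplifies the callback
signature, and is not the final committed code:

typedef int64_t do_ger(uint32_t a, uint32_t b, uint32_t mask);

static void xviger(CPUPPCState *env, uint32_t a_r, uint32_t b_r, uint32_t at_r,
                   uint32_t mask, bool sat, bool acc, do_ger ger)
{
    ppc_vsr_t *a = cpu_vsr_ptr(env, a_r), *b = cpu_vsr_ptr(env, b_r);
    uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),
            ymsk = ger_get_ymsk(mask);
    int i, j;

    for (i = 0; i < 4; i++) {
        ppc_vsr_t *at = cpu_vsr_ptr(env, at_r + i);
        for (j = 0; j < 4; j++) {
            if ((xmsk & (8 >> i)) && (ymsk & (8 >> j))) {
                /* One 32-bit lane of each operand holds one full dot product. */
                int64_t psum = ger(a->VsrW(i), b->VsrW(j), pmsk);
                if (acc) {
                    psum += at->VsrSW(j);
                }
                if (sat && psum > INT32_MAX) {
                    set_vscr_sat(env);
                    psum = INT32_MAX;
                } else if (sat && psum < INT32_MIN) {
                    set_vscr_sat(env);
                    psum = INT32_MIN;
                }
                at->VsrSW(j) = (int32_t)psum;
            } else {
                at->VsrSW(j) = 0;
            }
        }
    }
}

QEMU_FLATTEN void helper_XVI16GER2S(CPUPPCState *env, uint32_t a, uint32_t b,
                                    uint32_t at, uint32_t mask)
{
    xviger(env, a, b, at, mask, true, false, ger_rank2);
}

Because QEMU_FLATTEN forces xviger (and the callback) to be inlined into each
entry point, sat and acc become compile-time constants there and the untaken
branches fold away, which is the optimization referred to above.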
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c2b6c987c0..ee55c6cfa2 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2688,6 +2688,11 @@ static inline uint64_t *cpu_vsrl_ptr(CPUPPCState *env, int i)
     return (uint64_t *)((uintptr_t)env + vsr64_offset(i, false));
 }
 
+static inline ppc_vsr_t *cpu_vsr_ptr(CPUPPCState *env, int i)
+{
+    return (ppc_vsr_t *)((uintptr_t)env + vsr_full_offset(i));
+}
+
 static inline long avr64_offset(int i, bool high)
 {
     return vsr64_offset(i + 32, high);
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index aa6773c4a5..06553517de 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -537,6 +537,9 @@ DEF_HELPER_5(XXBLENDVB, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVH, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVW, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVD, void, vsr, vsr, vsr, vsr, i32)
+DEF_HELPER_6(XVI4GER8, void, env, i32, i32, i32, i32, i32)
+DEF_HELPER_6(XVI8GER4, void, env, i32, i32, i32, i32, i32)
+DEF_HELPER_6(XVI16GER2, void, env, i32, i32, i32, i32, i32)
 
 DEF_HELPER_2(efscfsi, i32, env, i32)
 DEF_HELPER_2(efscfui, i32, env, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7a76bedfa6..653f50db93 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -170,6 +170,9 @@
 &XX3            xt xa xb
 @XX3            ...... ..... ..... ..... ........ ...           &XX3 xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
+%xx_at          23:3 !function=times_4
+@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb
+
 &XX3_dm         xt xa xb dm
 @XX3_dm         ...... ..... ..... ..... . dm:2 ..... ...       &XX3_dm xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
@@ -719,3 +722,15 @@ RFEBB           010011-------------- .     0010010010 -         @XL_s
 XXMFACC         011111 ... -- 00000 ----- 0010110001 -          @X_a
 XXMTACC         011111 ... -- 00001 ----- 0010110001 -          @X_a
 XXSETACCZ       011111 ... -- 00011 ----- 0010110001 -          @X_a
+
+## Vector GER instruction
+
+XVI4GER8        111011 ... -- ..... ..... 00100011 ..-          @XX3_at xa=%xx_xa
+XVI4GER8PP      111011 ... -- ..... ..... 00100010 ..-          @XX3_at xa=%xx_xa
+XVI8GER4        111011 ... -- ..... ..... 00000011 ..-          @XX3_at xa=%xx_xa
+XVI8GER4PP      111011 ... -- ..... ..... 00000010 ..-          @XX3_at xa=%xx_xa
+XVI16GER2       111011 ... -- ..... ..... 01001011 ..-          @XX3_at xa=%xx_xa
+XVI16GER2PP     111011 ... -- ..... ..... 01101011 ..-          @XX3_at xa=%xx_xa
+XVI8GER4SPP     111011 ... -- ..... ..... 01100011 ..-          @XX3_at xa=%xx_xa
+XVI16GER2S      111011 ... -- ..... ..... 00101011 ..-          @XX3_at xa=%xx_xa
+XVI16GER2SPP    111011 ... -- ..... ..... 00101010 ..-          @XX3_at xa=%xx_xa
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8c1674510b..bd2f1a7c2a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -782,6 +782,91 @@ VCT(uxs, cvtsduw, u32)
 VCT(sxs, cvtsdsw, s32)
 #undef VCT
 
+/*
+ * Packed VSX Integer GER Flags
+ * 00 - no accumulation no saturation
+ * 01 - accumulate but no saturation
+ * 10 - no accumulation but with saturation
+ * 11 - accumulate with saturation
+ */
+static inline bool get_sat(uint32_t flags)
+{
+    return flags & 0x2;
+}
+
+static inline bool get_acc(uint32_t flags)
+{
+    return flags & 0x1;
+}
+
+#define GET_VsrN(a, i)   (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
+#define GET_VsrB(a, i)   a->VsrB(i)
+#define GET_VsrH(a, i)   a->VsrH(i)
+
+#define GET_VsrSN(a, i)  (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
+#define GET_VsrSB(a, i)  a->VsrSB(i)
+#define GET_VsrSH(a, i)  a->VsrSH(i)
+
+#define XVIGER(NAME, RANK, EL)                                            \
+    void NAME(CPUPPCState *env, uint32_t a_r, uint32_t b_r,               \
+              uint32_t at_r, uint32_t mask, uint32_t packed_flags)        \
+    {                                                                     \
+        ppc_vsr_t *a = cpu_vsr_ptr(env, a_r), *b = cpu_vsr_ptr(env, b_r), *at; \
+        bool sat = get_sat(packed_flags), acc = get_acc(packed_flags);    \
+        uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),     \
+                ymsk = ger_get_ymsk(mask);                                \
+        uint8_t pmsk_bit, xmsk_bit, ymsk_bit;                             \
+        int64_t psum;                                                     \
+        int32_t va, vb;                                                   \
+        int i, j, k;                                                      \
+        for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {      \
+            at = cpu_vsr_ptr(env, at_r + i);                              \
+            for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {  \
+                if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {             \
+                    psum = 0;                                             \
+                    for (k = 0, pmsk_bit = 1 << (RANK - 1); k < RANK;     \
+                         k++, pmsk_bit >>= 1) {                           \
+                        if (pmsk_bit & pmsk) {                            \
+                            va = (int32_t)GET_VsrS##EL(a, RANK * i + k);  \
+                            vb = (int32_t) ((RANK == 4) ?                 \
+                                    GET_Vsr##EL(b, RANK * j + k) :        \
+                                    GET_VsrS##EL(b, RANK * j + k));       \
+                            psum += va * vb;                              \
+                        }                                                 \
+                    }                                                     \
+                    if (acc) {                                            \
+                        psum += at->VsrSW(j);                             \
+                    }                                                     \
+                    if (sat && psum > INT32_MAX) {                        \
+                        set_vscr_sat(env);                                \
+                        at->VsrSW(j) = INT32_MAX;                         \
+                    } else if (sat && psum < INT32_MIN) {                 \
+                        set_vscr_sat(env);                                \
+                        at->VsrSW(j) = INT32_MIN;                         \
+                    } else {                                              \
+                        at->VsrSW(j) = (int32_t) psum;                    \
+                    }                                                     \
+                } else {                                                  \
+                    at->VsrSW(j) = 0;                                     \
+                }                                                         \
+            }                                                             \
+        }                                                                 \
+    }
+
+XVIGER(helper_XVI4GER8, 8, N)
+XVIGER(helper_XVI8GER4, 4, B)
+XVIGER(helper_XVI16GER2, 2, H)
+
+#undef GER_MULT
+#undef XVIGER_NAME
+#undef XVIGER
+#undef GET_VsrN
+#undef GET_VsrB
+#undef GET_VsrH
+#undef GET_VsrSN
+#undef GET_VsrSB
+#undef GET_VsrSH
+
 target_ulong helper_vclzlsbb(ppc_avr_t *r)
 {
     target_ulong count = 0;
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 8094e0b033..a994d98238 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -291,4 +291,32 @@ G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                             uintptr_t retaddr);
 #endif
 
+/*
+ * Auxiliary functions to pack/unpack masks for GER instructions.
+ *
+ * Packed format:
+ * Bits 0-3: xmsk
+ * Bits 4-7: ymsk
+ * Bits 8-15: pmsk
+ */
+static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
+{
+    return packed_masks & 0xF;
+}
+
+static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
+{
+    return (packed_masks >> 4) & 0xF;
+}
+
+static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
+{
+    return (packed_masks >> 8) & 0xFF;
+}
+
+static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
+{
+    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
+}
+
 #endif /* PPC_INTERNAL_H */
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index 919b889c40..1eb68c7081 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2823,6 +2823,56 @@ static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a)
     return true;
 }
 
+/*
+ * Packed VSX Integer GER Flags
+ * 00 - no accumulation no saturation
+ * 01 - accumulate but no saturation
+ * 10 - no accumulation but with saturation
+ * 11 - accumulate with saturation
+ */
+static uint32_t pack_flags_xvi(int acc, int sat)
+{
+    return (sat << 1) | acc;
+}
+
+static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
+                       void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
+                                      TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    uint32_t mask;
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VSX(ctx);
+    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
+        gen_invalid(ctx);
+        return true;
+    }
+
+    mask = 0xFFFFFFFF;
+    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
+           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
+           tcg_constant_i32(op));
+    return true;
+}
+
+/* Used to keep line length < 80 */
+#define GER_NOP pack_flags_xvi(0, 0)
+#define GER_PP  pack_flags_xvi(1, 0)
+#define GER_SAT pack_flags_xvi(0, 1)
+#define GER_SPP pack_flags_xvi(1, 1)
+TRANS(XVI4GER8, do_ger_XX3, GER_NOP, gen_helper_XVI4GER8)
+TRANS(XVI4GER8PP, do_ger_XX3, GER_PP, gen_helper_XVI4GER8)
+TRANS(XVI8GER4, do_ger_XX3, GER_NOP, gen_helper_XVI8GER4)
+TRANS(XVI8GER4PP, do_ger_XX3, GER_PP, gen_helper_XVI8GER4)
+TRANS(XVI8GER4SPP, do_ger_XX3, GER_SPP, gen_helper_XVI8GER4)
+TRANS(XVI16GER2, do_ger_XX3, GER_NOP, gen_helper_XVI16GER2)
+TRANS(XVI16GER2PP, do_ger_XX3, GER_PP, gen_helper_XVI16GER2)
+TRANS(XVI16GER2S, do_ger_XX3, GER_SAT, gen_helper_XVI16GER2)
+TRANS(XVI16GER2SPP, do_ger_XX3, GER_SPP, gen_helper_XVI16GER2)
+#undef GER_NOP
+#undef GER_PP
+#undef GER_SAT
+#undef GER_SPP
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
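Following the closing review comment, the likely follow-up is sketched below,
under the assumption that an accumulator remains an alias of four consecutive
VSRs as in ISA 3.1; the names echo Richard's suggestion and are not part of
this patch:

/* An accumulator currently aliases four consecutive VSRs. */
typedef ppc_vsr_t ppc_acc_t;

static inline long acc_full_offset(int i)
{
    /* acc[i] overlays vsr[4 * i] .. vsr[4 * i + 3] */
    return vsr_full_offset(i * 4);
}

static inline ppc_acc_t *cpu_acc_ptr(CPUPPCState *env, int i)
{
    return (ppc_acc_t *)((uintptr_t)env + acc_full_offset(i));
}

With helpers taking a ppc_acc_t * keyed by an accumulator number 0-7 rather
than a VSR number (tying back to the acc[0-7] vs vsr[0-28] question at the top
of the thread), acc_full_offset becomes the single place to change if the
architecture ever gives the ACC registers separate storage.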