diff mbox series

[v6,15/36] tcg: Add guest load/store primitives for TCGv_i128

Message ID 20230130214844.1158612-16-richard.henderson@linaro.org (mailing list archive)
State New, archived
Headers show
Series tcg: Support for Int128 with helpers | expand

Commit Message

Richard Henderson Jan. 30, 2023, 9:48 p.m. UTC
These are not yet considering atomicity of the 16-byte value;
this is a direct replacement for the current target code which
uses a pair of 8-byte operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h |  10 +++
 include/tcg/tcg-op.h    |   2 +
 accel/tcg/cputlb.c      | 112 +++++++++++++++++++++++++++++++++
 accel/tcg/user-exec.c   |  66 ++++++++++++++++++++
 tcg/tcg-op.c            | 134 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 324 insertions(+)

Comments

Alex Bennée Feb. 1, 2023, 9:52 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> These are not yet considering atomicity of the 16-byte value;
> this is a direct replacement for the current target code which
> uses a pair of 8-byte operations.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/exec/cpu_ldst.h |  10 +++
>  include/tcg/tcg-op.h    |   2 +
>  accel/tcg/cputlb.c      | 112 +++++++++++++++++++++++++++++++++
>  accel/tcg/user-exec.c   |  66 ++++++++++++++++++++
>  tcg/tcg-op.c            | 134 ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 324 insertions(+)
>
> diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
> index d0c7c0d5fe..09b55cc0ee 100644
> --- a/include/exec/cpu_ldst.h
> +++ b/include/exec/cpu_ldst.h
> @@ -220,6 +220,11 @@ uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr,
>  uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr,
>                          MemOpIdx oi, uintptr_t ra);
>  
> +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra);
> +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra);
> +
>  void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val,
>                   MemOpIdx oi, uintptr_t ra);
>  void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
> @@ -235,6 +240,11 @@ void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
>  void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
>                      MemOpIdx oi, uintptr_t ra);
>  
> +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra);
> +void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra);
> +
>  uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
>                                   uint32_t cmpv, uint32_t newv,
>                                   MemOpIdx oi, uintptr_t retaddr);
> diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
> index c4276767d1..e5f5b63c37 100644
> --- a/include/tcg/tcg-op.h
> +++ b/include/tcg/tcg-op.h
> @@ -845,6 +845,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, MemOp);
> +void tcg_gen_qemu_ld_i128(TCGv_i128, TCGv, TCGArg, MemOp);
> +void tcg_gen_qemu_st_i128(TCGv_i128, TCGv, TCGArg, MemOp);
>  
>  static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
>  {
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 4e040a1cb9..e3604ad313 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -2187,6 +2187,64 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
>      return cpu_load_helper(env, addr, oi, ra, helper_le_ldq_mmu);
>  }
>  
> +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +    uint64_t h, l;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));
> +    a_bits = get_alignment_bits(mop);
> +
> +    /* Handle CPU specific unaligned behaviour */
> +    if (addr & ((1 << a_bits) - 1)) {
> +        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD,
> +                             mmu_idx, ra);
> +    }
> +
> +    /* Construct an unaligned 64-bit replacement MemOpIdx. */
> +    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
> +    new_oi = make_memop_idx(mop, mmu_idx);
> +
> +    h = helper_be_ldq_mmu(env, addr, new_oi, ra);
> +    l = helper_be_ldq_mmu(env, addr + 8, new_oi, ra);
> +
> +    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
> +    return int128_make128(l, h);
> +}
> +
> +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +    uint64_t h, l;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128));

Why not use validate_memop for this like elsewhere in cputlb?

<snip>
>  
> +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));

ditto for the others

> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index cb83d2375d..33ef325f6e 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -3109,6 +3109,140 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
>      }
>  }
>

I'm confused because the TCG ops in this patch are still using i64  and
the atomic use hasn't come in yet. Worth splitting the patch?

Anyway:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>


> +static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig)
> +{
> +    MemOp mop_1 = orig, mop_2;
> +
> +    tcg_debug_assert((orig & MO_SIZE) == MO_128);
> +    tcg_debug_assert((orig & MO_SIGN) == 0);
> +
> +    /* Use a memory ordering implemented by the host. */
> +    if (!TCG_TARGET_HAS_MEMORY_BSWAP && (orig & MO_BSWAP)) {
> +        mop_1 &= ~MO_BSWAP;
> +    }
> +
> +    /* Reduce the size to 64-bit. */
> +    mop_1 = (mop_1 & ~MO_SIZE) | MO_64;
> +
> +    /* Retain the alignment constraints of the original. */
> +    switch (orig & MO_AMASK) {
> +    case MO_UNALN:
> +    case MO_ALIGN_2:
> +    case MO_ALIGN_4:
> +        mop_2 = mop_1;
> +        break;
> +    case MO_ALIGN_8:
> +        /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */
> +        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
> +        mop_2 = mop_1;
> +        break;
> +    case MO_ALIGN:
> +        /* Second has 8-byte alignment; first has 16-byte alignment. */
> +        mop_2 = mop_1;
> +        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16;
> +        break;
> +    case MO_ALIGN_16:
> +    case MO_ALIGN_32:
> +    case MO_ALIGN_64:
> +        /* Second has 8-byte alignment; first retains original. */
> +        mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +    ret[0] = mop_1;
> +    ret[1] = mop_2;
> +}
> +
> +void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
> +{
> +    MemOp mop[2];
> +    TCGv addr_p8;
> +    TCGv_i64 x, y;
> +
> +    canonicalize_memop_i128_as_i64(mop, memop);
> +
> +    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
> +    addr = plugin_prep_mem_callbacks(addr);
> +
> +    /* TODO: respect atomicity of the operation. */
> +    /* TODO: allow the tcg backend to see the whole operation. */
> +
> +    /*
> +     * Since there are no global TCGv_i128, there is no visible state
> +     * changed if the second load faults.  Load directly into the two
> +     * subwords.
> +     */
> +    if ((memop & MO_BSWAP) == MO_LE) {
> +        x = TCGV128_LOW(val);
> +        y = TCGV128_HIGH(val);
> +    } else {
> +        x = TCGV128_HIGH(val);
> +        y = TCGV128_LOW(val);
> +    }
> +
> +    gen_ldst_i64(INDEX_op_qemu_ld_i64, x, addr, mop[0], idx);
> +
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        tcg_gen_bswap64_i64(x, x);
> +    }
> +
> +    addr_p8 = tcg_temp_new();
> +    tcg_gen_addi_tl(addr_p8, addr, 8);
> +    gen_ldst_i64(INDEX_op_qemu_ld_i64, y, addr_p8, mop[1], idx);
> +    tcg_temp_free(addr_p8);
> +
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        tcg_gen_bswap64_i64(y, y);
> +    }
> +
> +    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
> +                             QEMU_PLUGIN_MEM_R);
> +}
> +
> +void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
> +{
> +    MemOp mop[2];
> +    TCGv addr_p8;
> +    TCGv_i64 x, y;
> +
> +    canonicalize_memop_i128_as_i64(mop, memop);
> +
> +    tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
> +    addr = plugin_prep_mem_callbacks(addr);
> +
> +    /* TODO: respect atomicity of the operation. */
> +    /* TODO: allow the tcg backend to see the whole operation. */
> +
> +    if ((memop & MO_BSWAP) == MO_LE) {
> +        x = TCGV128_LOW(val);
> +        y = TCGV128_HIGH(val);
> +    } else {
> +        x = TCGV128_HIGH(val);
> +        y = TCGV128_LOW(val);
> +    }
> +
> +    addr_p8 = tcg_temp_new();
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        TCGv_i64 t = tcg_temp_new_i64();
> +
> +        tcg_gen_bswap64_i64(t, x);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr, mop[0], idx);
> +        tcg_gen_bswap64_i64(t, y);
> +        tcg_gen_addi_tl(addr_p8, addr, 8);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr_p8, mop[1], idx);
> +        tcg_temp_free_i64(t);
> +    } else {
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, x, addr, mop[0], idx);
> +        tcg_gen_addi_tl(addr_p8, addr, 8);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, y, addr_p8, mop[1], idx);
> +    }
> +    tcg_temp_free(addr_p8);
> +
> +    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
> +                             QEMU_PLUGIN_MEM_W);
> +}
> +
>  static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc)
>  {
>      switch (opc & MO_SSIZE) {
Richard Henderson Feb. 1, 2023, 7:03 p.m. UTC | #2
On 1/31/23 23:52, Alex Bennée wrote:
>> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128));
> 
> Why not use validate_memop for this like elsewhere in cputlb?

validate_memop is going away; this patch was pulled forward from that.

> I'm confused because the TCG ops in this patch are still using i64  and
> the atomic use hasn't come in yet. Worth splitting the patch?

I'm confused about what sort of split you're suggesting.
The non-atomic use comes in with the very next patch in this series...


r~
Philippe Mathieu-Daudé Feb. 3, 2023, 2:20 p.m. UTC | #3
On 1/2/23 20:03, Richard Henderson wrote:
> On 1/31/23 23:52, Alex Bennée wrote:

>> Worth splitting the patch?

> I'm confused about what sort of split you're suggesting.

Maybe a preliminary patch with the LD/ST helpers:

  include/exec/cpu_ldst.h |  10 +++
  accel/tcg/cputlb.c      | 112 +++++++++++++++++++++++++++++++++
  accel/tcg/user-exec.c   |  66 ++++++++++++++++++++

Then the translator functions:

  include/tcg/tcg-op.h    |   2 +
  tcg/tcg-op.c            | 134 ++++++++++++++++++++++++++++++++++++++++

?
diff mbox series

Patch

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index d0c7c0d5fe..09b55cc0ee 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -220,6 +220,11 @@  uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr,
 uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr,
                         MemOpIdx oi, uintptr_t ra);
 
+Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra);
+Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra);
+
 void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val,
                  MemOpIdx oi, uintptr_t ra);
 void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
@@ -235,6 +240,11 @@  void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
 void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
                     MemOpIdx oi, uintptr_t ra);
 
+void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra);
+void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra);
+
 uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
                                  uint32_t cmpv, uint32_t newv,
                                  MemOpIdx oi, uintptr_t retaddr);
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index c4276767d1..e5f5b63c37 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -845,6 +845,8 @@  void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, MemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, MemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, MemOp);
 void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, MemOp);
+void tcg_gen_qemu_ld_i128(TCGv_i128, TCGv, TCGArg, MemOp);
+void tcg_gen_qemu_st_i128(TCGv_i128, TCGv, TCGArg, MemOp);
 
 static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
 {
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 4e040a1cb9..e3604ad313 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2187,6 +2187,64 @@  uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
     return cpu_load_helper(env, addr, oi, ra, helper_le_ldq_mmu);
 }
 
+Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int mmu_idx = get_mmuidx(oi);
+    MemOpIdx new_oi;
+    unsigned a_bits;
+    uint64_t h, l;
+
+    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));
+    a_bits = get_alignment_bits(mop);
+
+    /* Handle CPU specific unaligned behaviour */
+    if (addr & ((1 << a_bits) - 1)) {
+        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD,
+                             mmu_idx, ra);
+    }
+
+    /* Construct an unaligned 64-bit replacement MemOpIdx. */
+    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
+    new_oi = make_memop_idx(mop, mmu_idx);
+
+    h = helper_be_ldq_mmu(env, addr, new_oi, ra);
+    l = helper_be_ldq_mmu(env, addr + 8, new_oi, ra);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return int128_make128(l, h);
+}
+
+Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int mmu_idx = get_mmuidx(oi);
+    MemOpIdx new_oi;
+    unsigned a_bits;
+    uint64_t h, l;
+
+    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128));
+    a_bits = get_alignment_bits(mop);
+
+    /* Handle CPU specific unaligned behaviour */
+    if (addr & ((1 << a_bits) - 1)) {
+        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD,
+                             mmu_idx, ra);
+    }
+
+    /* Construct an unaligned 64-bit replacement MemOpIdx. */
+    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
+    new_oi = make_memop_idx(mop, mmu_idx);
+
+    l = helper_le_ldq_mmu(env, addr, new_oi, ra);
+    h = helper_le_ldq_mmu(env, addr + 8, new_oi, ra);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+    return int128_make128(l, h);
+}
+
 /*
  * Store Helpers
  */
@@ -2541,6 +2599,60 @@  void cpu_stq_le_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
     cpu_store_helper(env, addr, val, oi, retaddr, helper_le_stq_mmu);
 }
 
+void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int mmu_idx = get_mmuidx(oi);
+    MemOpIdx new_oi;
+    unsigned a_bits;
+
+    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));
+    a_bits = get_alignment_bits(mop);
+
+    /* Handle CPU specific unaligned behaviour */
+    if (addr & ((1 << a_bits) - 1)) {
+        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE,
+                             mmu_idx, ra);
+    }
+
+    /* Construct an unaligned 64-bit replacement MemOpIdx. */
+    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
+    new_oi = make_memop_idx(mop, mmu_idx);
+
+    helper_be_stq_mmu(env, addr, int128_gethi(val), new_oi, ra);
+    helper_be_stq_mmu(env, addr + 8, int128_getlo(val), new_oi, ra);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
+                     MemOpIdx oi, uintptr_t ra)
+{
+    MemOp mop = get_memop(oi);
+    int mmu_idx = get_mmuidx(oi);
+    MemOpIdx new_oi;
+    unsigned a_bits;
+
+    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128));
+    a_bits = get_alignment_bits(mop);
+
+    /* Handle CPU specific unaligned behaviour */
+    if (addr & ((1 << a_bits) - 1)) {
+        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE,
+                             mmu_idx, ra);
+    }
+
+    /* Construct an unaligned 64-bit replacement MemOpIdx. */
+    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
+    new_oi = make_memop_idx(mop, mmu_idx);
+
+    helper_le_stq_mmu(env, addr, int128_getlo(val), new_oi, ra);
+    helper_le_stq_mmu(env, addr + 8, int128_gethi(val), new_oi, ra);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
 #include "ldst_common.c.inc"
 
 /*
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index a8eb63ab96..ae67d84638 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -1031,6 +1031,42 @@  uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
     return ret;
 }
 
+Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    Int128 ret;
+
+    validate_memop(oi, MO_128 | MO_BE);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+    memcpy(&ret, haddr, 16);
+    clear_helper_retaddr();
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+
+    if (!HOST_BIG_ENDIAN) {
+        ret = bswap128(ret);
+    }
+    return ret;
+}
+
+Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
+                       MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    Int128 ret;
+
+    validate_memop(oi, MO_128 | MO_LE);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+    memcpy(&ret, haddr, 16);
+    clear_helper_retaddr();
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
+
+    if (HOST_BIG_ENDIAN) {
+        ret = bswap128(ret);
+    }
+    return ret;
+}
+
 void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
                  MemOpIdx oi, uintptr_t ra)
 {
@@ -1115,6 +1151,36 @@  void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
 }
 
+void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr,
+                     Int128 val, MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+
+    validate_memop(oi, MO_128 | MO_BE);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
+    if (!HOST_BIG_ENDIAN) {
+        val = bswap128(val);
+    }
+    memcpy(haddr, &val, 16);
+    clear_helper_retaddr();
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
+void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr,
+                     Int128 val, MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+
+    validate_memop(oi, MO_128 | MO_LE);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE);
+    if (HOST_BIG_ENDIAN) {
+        val = bswap128(val);
+    }
+    memcpy(haddr, &val, 16);
+    clear_helper_retaddr();
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
+}
+
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr)
 {
     uint32_t ret;
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index cb83d2375d..33ef325f6e 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3109,6 +3109,140 @@  void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
     }
 }
 
+static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig)
+{
+    MemOp mop_1 = orig, mop_2;
+
+    tcg_debug_assert((orig & MO_SIZE) == MO_128);
+    tcg_debug_assert((orig & MO_SIGN) == 0);
+
+    /* Use a memory ordering implemented by the host. */
+    if (!TCG_TARGET_HAS_MEMORY_BSWAP && (orig & MO_BSWAP)) {
+        mop_1 &= ~MO_BSWAP;
+    }
+
+    /* Reduce the size to 64-bit. */
+    mop_1 = (mop_1 & ~MO_SIZE) | MO_64;
+
+    /* Retain the alignment constraints of the original. */
+    switch (orig & MO_AMASK) {
+    case MO_UNALN:
+    case MO_ALIGN_2:
+    case MO_ALIGN_4:
+        mop_2 = mop_1;
+        break;
+    case MO_ALIGN_8:
+        /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */
+        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
+        mop_2 = mop_1;
+        break;
+    case MO_ALIGN:
+        /* Second has 8-byte alignment; first has 16-byte alignment. */
+        mop_2 = mop_1;
+        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16;
+        break;
+    case MO_ALIGN_16:
+    case MO_ALIGN_32:
+    case MO_ALIGN_64:
+        /* Second has 8-byte alignment; first retains original. */
+        mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ret[0] = mop_1;
+    ret[1] = mop_2;
+}
+
+void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
+{
+    MemOp mop[2];
+    TCGv addr_p8;
+    TCGv_i64 x, y;
+
+    canonicalize_memop_i128_as_i64(mop, memop);
+
+    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    addr = plugin_prep_mem_callbacks(addr);
+
+    /* TODO: respect atomicity of the operation. */
+    /* TODO: allow the tcg backend to see the whole operation. */
+
+    /*
+     * Since there are no global TCGv_i128, there is no visible state
+     * changed if the second load faults.  Load directly into the two
+     * subwords.
+     */
+    if ((memop & MO_BSWAP) == MO_LE) {
+        x = TCGV128_LOW(val);
+        y = TCGV128_HIGH(val);
+    } else {
+        x = TCGV128_HIGH(val);
+        y = TCGV128_LOW(val);
+    }
+
+    gen_ldst_i64(INDEX_op_qemu_ld_i64, x, addr, mop[0], idx);
+
+    if ((mop[0] ^ memop) & MO_BSWAP) {
+        tcg_gen_bswap64_i64(x, x);
+    }
+
+    addr_p8 = tcg_temp_new();
+    tcg_gen_addi_tl(addr_p8, addr, 8);
+    gen_ldst_i64(INDEX_op_qemu_ld_i64, y, addr_p8, mop[1], idx);
+    tcg_temp_free(addr_p8);
+
+    if ((mop[0] ^ memop) & MO_BSWAP) {
+        tcg_gen_bswap64_i64(y, y);
+    }
+
+    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
+                             QEMU_PLUGIN_MEM_R);
+}
+
+void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
+{
+    MemOp mop[2];
+    TCGv addr_p8;
+    TCGv_i64 x, y;
+
+    canonicalize_memop_i128_as_i64(mop, memop);
+
+    tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
+    addr = plugin_prep_mem_callbacks(addr);
+
+    /* TODO: respect atomicity of the operation. */
+    /* TODO: allow the tcg backend to see the whole operation. */
+
+    if ((memop & MO_BSWAP) == MO_LE) {
+        x = TCGV128_LOW(val);
+        y = TCGV128_HIGH(val);
+    } else {
+        x = TCGV128_HIGH(val);
+        y = TCGV128_LOW(val);
+    }
+
+    addr_p8 = tcg_temp_new();
+    if ((mop[0] ^ memop) & MO_BSWAP) {
+        TCGv_i64 t = tcg_temp_new_i64();
+
+        tcg_gen_bswap64_i64(t, x);
+        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr, mop[0], idx);
+        tcg_gen_bswap64_i64(t, y);
+        tcg_gen_addi_tl(addr_p8, addr, 8);
+        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr_p8, mop[1], idx);
+        tcg_temp_free_i64(t);
+    } else {
+        gen_ldst_i64(INDEX_op_qemu_st_i64, x, addr, mop[0], idx);
+        tcg_gen_addi_tl(addr_p8, addr, 8);
+        gen_ldst_i64(INDEX_op_qemu_st_i64, y, addr_p8, mop[1], idx);
+    }
+    tcg_temp_free(addr_p8);
+
+    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
+                             QEMU_PLUGIN_MEM_W);
+}
+
 static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc)
 {
     switch (opc & MO_SSIZE) {