diff mbox series

[1/2] target/arm: Use x86 intrinsics to implement PMULL.P64

Message ID 20230601123332.3297404-2-ardb@kernel.org (mailing list archive)
State New, archived
Headers show
Series Implement PMULL using host intrinsics | expand

Commit Message

Ard Biesheuvel June 1, 2023, 12:33 p.m. UTC
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 host/include/i386/host/cpuinfo.h |  1 +
 target/arm/tcg/vec_helper.c      | 26 +++++++++++++++++++-
 util/cpuinfo-i386.c              |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

Comments

Peter Maydell June 1, 2023, 1 p.m. UTC | #1
On Thu, 1 Jun 2023 at 13:33, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  host/include/i386/host/cpuinfo.h |  1 +
>  target/arm/tcg/vec_helper.c      | 26 +++++++++++++++++++-
>  util/cpuinfo-i386.c              |  1 +
>  3 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
> index 073d0a426f31487d..cf4ced844760d28f 100644
> --- a/host/include/i386/host/cpuinfo.h
> +++ b/host/include/i386/host/cpuinfo.h
> @@ -27,6 +27,7 @@
>  #define CPUINFO_ATOMIC_VMOVDQA  (1u << 16)
>  #define CPUINFO_ATOMIC_VMOVDQU  (1u << 17)
>  #define CPUINFO_AES             (1u << 18)
> +#define CPUINFO_PMULL           (1u << 19)
>
>  /* Initialized with a constructor. */
>  extern unsigned cpuinfo;
> diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
> index f59d3b26eacf08f8..fb422627588439b3 100644
> --- a/target/arm/tcg/vec_helper.c
> +++ b/target/arm/tcg/vec_helper.c
> @@ -25,6 +25,14 @@
>  #include "qemu/int128.h"
>  #include "vec_internal.h"
>
> +#ifdef __x86_64__
> +#include "host/cpuinfo.h"
> +#include <wmmintrin.h>
> +#define TARGET_PMULL  __attribute__((__target__("pclmul")))
> +#else
> +#define TARGET_PMULL
> +#endif
> +
>  /*
>   * Data for expanding active predicate bits to bytes, for byte elements.
>   *
> @@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
>   * Because of the lanes are not accessed in strict columns,
>   * this probably cannot be turned into a generic helper.
>   */
> -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
> +void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
>  {
>      intptr_t i, j, opr_sz = simd_oprsz(desc);
>      intptr_t hi = simd_data(desc);
>      uint64_t *d = vd, *n = vn, *m = vm;
>
> +#ifdef __x86_64__
> +    if (cpuinfo & CPUINFO_PMULL) {
> +       switch (hi) {
> +       case 0:
> +               *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0);
> +               break;
> +       case 1:
> +               *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11);
> +               break;
> +       default:
> +               g_assert_not_reached();
> +       }
> +        return;
> +    }
> +#endif

I think this needs to cope with input vectors that are more than
128 bits wide. You probably also still need the clear_tail()
call to clear any high bits of the register.

thanks
-- PMM
Ard Biesheuvel June 1, 2023, 3:28 p.m. UTC | #2
On Thu, 1 Jun 2023 at 15:01, Peter Maydell <peter.maydell@linaro.org> wrote:
>
> On Thu, 1 Jun 2023 at 13:33, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  host/include/i386/host/cpuinfo.h |  1 +
> >  target/arm/tcg/vec_helper.c      | 26 +++++++++++++++++++-
> >  util/cpuinfo-i386.c              |  1 +
> >  3 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
> > index 073d0a426f31487d..cf4ced844760d28f 100644
> > --- a/host/include/i386/host/cpuinfo.h
> > +++ b/host/include/i386/host/cpuinfo.h
> > @@ -27,6 +27,7 @@
> >  #define CPUINFO_ATOMIC_VMOVDQA  (1u << 16)
> >  #define CPUINFO_ATOMIC_VMOVDQU  (1u << 17)
> >  #define CPUINFO_AES             (1u << 18)
> > +#define CPUINFO_PMULL           (1u << 19)
> >
> >  /* Initialized with a constructor. */
> >  extern unsigned cpuinfo;
> > diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
> > index f59d3b26eacf08f8..fb422627588439b3 100644
> > --- a/target/arm/tcg/vec_helper.c
> > +++ b/target/arm/tcg/vec_helper.c
> > @@ -25,6 +25,14 @@
> >  #include "qemu/int128.h"
> >  #include "vec_internal.h"
> >
> > +#ifdef __x86_64__
> > +#include "host/cpuinfo.h"
> > +#include <wmmintrin.h>
> > +#define TARGET_PMULL  __attribute__((__target__("pclmul")))
> > +#else
> > +#define TARGET_PMULL
> > +#endif
> > +
> >  /*
> >   * Data for expanding active predicate bits to bytes, for byte elements.
> >   *
> > @@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
> >   * Because of the lanes are not accessed in strict columns,
> >   * this probably cannot be turned into a generic helper.
> >   */
> > -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
> > +void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
> >  {
> >      intptr_t i, j, opr_sz = simd_oprsz(desc);
> >      intptr_t hi = simd_data(desc);
> >      uint64_t *d = vd, *n = vn, *m = vm;
> >
> > +#ifdef __x86_64__
> > +    if (cpuinfo & CPUINFO_PMULL) {
> > +       switch (hi) {
> > +       case 0:
> > +               *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0);
> > +               break;
> > +       case 1:
> > +               *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11);
> > +               break;
> > +       default:
> > +               g_assert_not_reached();
> > +       }
> > +        return;
> > +    }
> > +#endif
>
> I think this needs to cope with input vectors that are more than
> 128 bits wide. You probably also still need the clear_tail()
> call to clear any high bits of the register.
>

Ah yes, I missed that completely.
diff mbox series

Patch

diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index 073d0a426f31487d..cf4ced844760d28f 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -27,6 +27,7 @@ 
 #define CPUINFO_ATOMIC_VMOVDQA  (1u << 16)
 #define CPUINFO_ATOMIC_VMOVDQU  (1u << 17)
 #define CPUINFO_AES             (1u << 18)
+#define CPUINFO_PMULL           (1u << 19)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index f59d3b26eacf08f8..fb422627588439b3 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -25,6 +25,14 @@ 
 #include "qemu/int128.h"
 #include "vec_internal.h"
 
+#ifdef __x86_64__
+#include "host/cpuinfo.h"
+#include <wmmintrin.h>
+#define TARGET_PMULL  __attribute__((__target__("pclmul")))
+#else
+#define TARGET_PMULL
+#endif
+
 /*
  * Data for expanding active predicate bits to bytes, for byte elements.
  *
@@ -2010,12 +2018,28 @@  void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
  * Because of the lanes are not accessed in strict columns,
  * this probably cannot be turned into a generic helper.
  */
-void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
+void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     intptr_t hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
 
+#ifdef __x86_64__
+    if (cpuinfo & CPUINFO_PMULL) {
+	switch (hi) {
+	case 0:
+		*(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0);
+		break;
+	case 1:
+		*(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11);
+		break;
+	default:
+		g_assert_not_reached();
+	}
+        return;
+    }
+#endif
+
     for (i = 0; i < opr_sz / 8; i += 2) {
         uint64_t nn = n[i + hi];
         uint64_t mm = m[i + hi];
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 3043f066c0182dc8..8930e13451201a64 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -40,6 +40,7 @@  unsigned __attribute__((constructor)) cpuinfo_init(void)
         info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
         info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
         info |= (c & bit_AES ? CPUINFO_AES : 0);
+        info |= (c & bit_PCLMULQDQ ? CPUINFO_PMULL : 0);
 
         /* For AVX features, we must check available and usable. */
         if ((c & bit_AVX) && (c & bit_OSXSAVE)) {