diff mbox series

crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

Message ID 20200827173058.94519-1-ubizjak@gmail.com (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show
Series crypto/x86: Use XORL r32,32 in curve25519-x86_64.c | expand

Commit Message

Uros Bizjak Aug. 27, 2020, 5:30 p.m. UTC
x86_64 zero extends 32bit operations, so for 64bit operands,
XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
a REX prefix byte when legacy registers are used.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
---
 arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
 1 file changed, 34 insertions(+), 34 deletions(-)

Comments

Ard Biesheuvel Sept. 1, 2020, 3:39 p.m. UTC | #1
(+ Jason)

On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <ubizjak@gmail.com> wrote:
>
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>
>         asm volatile(
>                 /* Clear registers to propagate the carry bit */
> -               "  xor %%r8, %%r8;"
> -               "  xor %%r9, %%r9;"
> -               "  xor %%r10, %%r10;"
> -               "  xor %%r11, %%r11;"
> -               "  xor %1, %1;"
> +               "  xor %%r8d, %%r8d;"
> +               "  xor %%r9d, %%r9d;"
> +               "  xor %%r10d, %%r10d;"
> +               "  xor %%r11d, %%r11d;"
> +               "  xor %k1, %k1;"
>
>                 /* Begin addition chain */
>                 "  addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
>                 "  cmovc %0, %%rax;"
>
>                 /* Step 2: Add carry*38 to the original sum */
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  add %%rax, %%r8;"
>                 "  adcx %%rcx, %%r9;"
>                 "  movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 0(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 8(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 16(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 24(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 0(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 8(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 16(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 24(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 32(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 40(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 48(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 56(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 96(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 64(%1), %%r8;"
>                 "  mulxq 104(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>
>                 /* Step 1: Compute all partial products */
>                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>         asm volatile(
>                 /* Step 1: Compute all partial products */
>                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>
>                 /* Step 1: Compute all partial products */
>                 "  movq 32(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 56(%1), %%rdx;"                                      /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 96(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 64(%1), %%r8;"
>                 "  mulxq 104(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> --
> 2.26.2
>
Jason A. Donenfeld Sept. 1, 2020, 6:13 p.m. UTC | #2
Hi Uros,

Thanks for this patch. This seems correct to me, since indeed those
clear the top bits anyway, and having smaller code seems good. But
first -- I'm wondering -- have you stuck this into Vale/Hacl to see if
it still checks out there? I'm CC'ing Karthik/Aymeric/Chris/Jonathan
who might be interested in taking a look at that. Seems like it might
be easy to just special case this in Xor64 for the case where the
operands are the same. Also, have you seen any measurable differences
when benching this? I can stick it into kbench9000 to see if you
haven't looked yet.

Jason

On Tue, Sep 1, 2020 at 5:46 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> (+ Jason)
>
> On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > x86_64 zero extends 32bit operations, so for 64bit operands,
> > XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> > a REX prefix byte when legacy registers are used.
> >
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > ---
> >  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> >  1 file changed, 34 insertions(+), 34 deletions(-)
> >
> > diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> > index 8acbb6584a37..a9edb6f8a0ba 100644
> > --- a/arch/x86/crypto/curve25519-x86_64.c
> > +++ b/arch/x86/crypto/curve25519-x86_64.c
> > @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
> >
> >         asm volatile(
> >                 /* Clear registers to propagate the carry bit */
> > -               "  xor %%r8, %%r8;"
> > -               "  xor %%r9, %%r9;"
> > -               "  xor %%r10, %%r10;"
> > -               "  xor %%r11, %%r11;"
> > -               "  xor %1, %1;"
> > +               "  xor %%r8d, %%r8d;"
> > +               "  xor %%r9d, %%r9d;"
> > +               "  xor %%r10d, %%r10d;"
> > +               "  xor %%r11d, %%r11d;"
> > +               "  xor %k1, %k1;"
> >
> >                 /* Begin addition chain */
> >                 "  addq 0(%3), %0;"
> > @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
> >                 "  cmovc %0, %%rax;"
> >
> >                 /* Step 2: Add carry*38 to the original sum */
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  add %%rax, %%r8;"
> >                 "  adcx %%rcx, %%r9;"
> >                 "  movq %%r9, 8(%1);"
> > @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 0(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 8(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 16(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 24(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> > @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 0(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 8(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 16(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 24(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> > @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 32(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 40(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 48(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 56(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> > @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 96(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 64(%1), %%r8;"
> >                 "  mulxq 104(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> > @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >         asm volatile(
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> > @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 32(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 56(%1), %%rdx;"                                      /* f[3] */
> > @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 96(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 64(%1), %%r8;"
> >                 "  mulxq 104(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > --
> > 2.26.2
> >
Jason A. Donenfeld Sept. 1, 2020, 7:12 p.m. UTC | #3
On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> operands are the same. Also, have you seen any measurable differences
> when benching this? I can stick it into kbench9000 to see if you
> haven't looked yet.

On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
difference with this, at all, no matter how much I median or mean or
reduce noise by disabling interrupts.

One thing that sticks out is that all the replacements of r8-r15 by
their %r8d-r15d counterparts still have the REX prefix, as is
necessary to access those registers. The only ones worth changing,
then, are the legacy registers, and on a whole, this amounts to only
48 bytes of difference.
Uros Bizjak Sept. 2, 2020, 5:50 a.m. UTC | #4
On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > operands are the same. Also, have you seen any measurable differences
> > when benching this? I can stick it into kbench9000 to see if you
> > haven't looked yet.
>
> On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> difference with this, at all, no matter how much I median or mean or
> reduce noise by disabling interrupts.
>
> One thing that sticks out is that all the replacements of r8-r15 by
> their %r8d-r15d counterparts still have the REX prefix, as is
> necessary to access those registers. The only ones worth changing,
> then, are the legacy registers, and on a whole, this amounts to only
> 48 bytes of difference.

The patch implements one of x86 target specific optimizations,
performed by gcc. The optimization results in code size savings of one
byte, where REX prefix is omitted with legacy registers, but otherwise
should have no measurable runtime effect. Since gcc applies this
optimization universally to all integer registers, I took the same
approach and implemented the same change to legacy and REX registers.
As measured above, 48 bytes saved is a good result for such a trivial
optimization.

Uros.
Peter Zijlstra Sept. 2, 2020, 9:17 a.m. UTC | #5
On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > operands are the same. Also, have you seen any measurable differences
> > > when benching this? I can stick it into kbench9000 to see if you
> > > haven't looked yet.
> >
> > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > difference with this, at all, no matter how much I median or mean or
> > reduce noise by disabling interrupts.
> >
> > One thing that sticks out is that all the replacements of r8-r15 by
> > their %r8d-r15d counterparts still have the REX prefix, as is
> > necessary to access those registers. The only ones worth changing,
> > then, are the legacy registers, and on a whole, this amounts to only
> > 48 bytes of difference.
> 
> The patch implements one of x86 target specific optimizations,
> performed by gcc. The optimization results in code size savings of one
> byte, where REX prefix is omitted with legacy registers, but otherwise
> should have no measurable runtime effect. Since gcc applies this
> optimization universally to all integer registers, I took the same
> approach and implemented the same change to legacy and REX registers.
> As measured above, 48 bytes saved is a good result for such a trivial
> optimization.

Could we instead implement this optimization in GAS ? Then we can leave
the code as-is.
Uros Bizjak Sept. 2, 2020, 11:36 a.m. UTC | #6
On Wed, Sep 2, 2020 at 11:17 AM <peterz@infradead.org> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I don't think that e.g. "xorq %rax,%rax" should universally be
translated to "xorl %eax,%eax" in the assembler. Perhaps the author
expected exactly 3 bytes (to align the code or similar), and the
assembler would change the length to 2 bytes behind his back, breaking
the expectations.

Uros.
Jason A. Donenfeld Sept. 2, 2020, 2:58 p.m. UTC | #7
On Wed, Sep 2, 2020 at 11:44 AM <peterz@infradead.org> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I rather like that idea. Though I wonder if some would balk at it for
smelling a bit like the MIPS assembler with its optimization pass...
Jason A. Donenfeld Sept. 2, 2020, 3 p.m. UTC | #8
On Wed, Sep 2, 2020 at 1:42 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Sep 2, 2020 at 11:17 AM <peterz@infradead.org> wrote:
> >
> > On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > >
> > > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > > operands are the same. Also, have you seen any measurable differences
> > > > > when benching this? I can stick it into kbench9000 to see if you
> > > > > haven't looked yet.
> > > >
> > > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > > difference with this, at all, no matter how much I median or mean or
> > > > reduce noise by disabling interrupts.
> > > >
> > > > One thing that sticks out is that all the replacements of r8-r15 by
> > > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > > necessary to access those registers. The only ones worth changing,
> > > > then, are the legacy registers, and on a whole, this amounts to only
> > > > 48 bytes of difference.
> > >
> > > The patch implements one of x86 target specific optimizations,
> > > performed by gcc. The optimization results in code size savings of one
> > > byte, where REX prefix is omitted with legacy registers, but otherwise
> > > should have no measurable runtime effect. Since gcc applies this
> > > optimization universally to all integer registers, I took the same
> > > approach and implemented the same change to legacy and REX registers.
> > > As measured above, 48 bytes saved is a good result for such a trivial
> > > optimization.
> >
> > Could we instead implement this optimization in GAS ? Then we can leave
> > the code as-is.
>
> I don't think that e.g. "xorq %rax,%rax" should universally be
> translated to "xorl %eax,%eax" in the assembler. Perhaps the author
> expected exactly 3 bytes (to align the code or similar), and the
> assembler would change the length to 2 bytes behind his back, breaking
> the expectations.

Are you sure that's something that GAS actually provides now? Seems
like a lot of mnemonics have ambiguous/injective encodings, and this
wouldn't make things any different. Most authors use .byte or .align
when they care about the actual bytes, no?
Jason A. Donenfeld Sept. 7, 2020, 1:14 p.m. UTC | #9
Hi Uros, Herbert,

On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>  
>  	asm volatile(
>  		/* Clear registers to propagate the carry bit */
> -		"  xor %%r8, %%r8;"
> -		"  xor %%r9, %%r9;"
> -		"  xor %%r10, %%r10;"
> -		"  xor %%r11, %%r11;"
> -		"  xor %1, %1;"
> +		"  xor %%r8d, %%r8d;"
> +		"  xor %%r9d, %%r9d;"
> +		"  xor %%r10d, %%r10d;"
> +		"  xor %%r11d, %%r11d;"
> +		"  xor %k1, %k1;"
>  
>  		/* Begin addition chain */
>  		"  addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
>  		"  cmovc %0, %%rax;"
>  
>  		/* Step 2: Add carry*38 to the original sum */
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  add %%rax, %%r8;"
>  		"  adcx %%rcx, %%r9;"
>  		"  movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 32(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 40(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 48(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 56(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 96(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 64(%1), %%r8;"
>  		"  mulxq 104(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  
>  		/* Step 1: Compute all partial products */
>  		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  	asm volatile(
>  		/* Step 1: Compute all partial products */
>  		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  
>  		/* Step 1: Compute all partial products */
>  		"  movq 32(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 56(%1), %%rdx;"                                      /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 96(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 64(%1), %%r8;"
>  		"  mulxq 104(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -- 
> 2.26.2
> 

Looks like this is going into HACL in the end, so:

Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>

for cryptodev-2.6.git, rather than crypto-2.6.git

Thanks,
Jason
Herbert Xu Sept. 11, 2020, 6:56 a.m. UTC | #10
On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)

Patch applied.  Thanks.
diff mbox series

Patch

diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 8acbb6584a37..a9edb6f8a0ba 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@  static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 
 	asm volatile(
 		/* Clear registers to propagate the carry bit */
-		"  xor %%r8, %%r8;"
-		"  xor %%r9, %%r9;"
-		"  xor %%r10, %%r10;"
-		"  xor %%r11, %%r11;"
-		"  xor %1, %1;"
+		"  xor %%r8d, %%r8d;"
+		"  xor %%r9d, %%r9d;"
+		"  xor %%r10d, %%r10d;"
+		"  xor %%r11d, %%r11d;"
+		"  xor %k1, %k1;"
 
 		/* Begin addition chain */
 		"  addq 0(%3), %0;"
@@ -93,7 +93,7 @@  static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		"  cmovc %0, %%rax;"
 
 		/* Step 2: Add carry*38 to the original sum */
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  add %%rax, %%r8;"
 		"  adcx %%rcx, %%r9;"
 		"  movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@  static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
@@ -200,7 +200,7 @@  static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@  static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
@@ -277,29 +277,29 @@  static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 32(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 40(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 48(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 56(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@  static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@  static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 96(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 64(%1), %%r8;"
 		"  mulxq 104(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@  static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 
 		/* Step 1: Compute all partial products */
 		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 24(%1), %%rdx;"                                      /* f[3] */
@@ -526,7 +526,7 @@  static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@  static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 	asm volatile(
 		/* Step 1: Compute all partial products */
 		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 24(%1), %%rdx;"                                      /* f[3] */
@@ -617,7 +617,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 
 		/* Step 1: Compute all partial products */
 		"  movq 32(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 56(%1), %%rdx;"                                      /* f[3] */
@@ -657,7 +657,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 96(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 64(%1), %%r8;"
 		"  mulxq 104(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"