
x86/kernel/hyper-v: xmm fast hypercall as guest

Message ID 75cb269dc51961574c4bbb8bbb2473f31819bcb6.1533250572.git.isaku.yamahata@gmail.com (mailing list archive)
State New, archived
Series x86/kernel/hyper-v: xmm fast hypercall as guest

Commit Message

Isaku Yamahata Aug. 2, 2018, 10:56 p.m. UTC
Hyper-V hypercalls support a fast hypercall and an XMM fast hypercall
variant, where arguments are exchanged through general-purpose/XMM
registers instead of memory.
This patch implements both and makes use of them.
With this patch, pci-hyperv.c, hv_apic.c (__send_ipi_mask), and
hyperv/mmu.c will use the (XMM) fast hypercall.

Signed-off-by: Isaku Yamahata <isaku.yamahata@gmail.com>
---
 arch/x86/hyperv/hv_apic.c           |   3 +-
 arch/x86/hyperv/mmu.c               |   4 +-
 arch/x86/include/asm/hyperv-tlfs.h  |   1 +
 arch/x86/include/asm/mshyperv.h     | 184 ++++++++++++++++++++++++++++++++++--
 drivers/hv/hv.c                     |   3 +-
 drivers/pci/controller/pci-hyperv.c |   5 +-
 6 files changed, 186 insertions(+), 14 deletions(-)

Comments

Vitaly Kuznetsov Aug. 10, 2018, 8:33 a.m. UTC | #1
Isaku Yamahata <isaku.yamahata@gmail.com> writes:

> Hyper-V hypercalls support a fast hypercall and an XMM fast hypercall
> variant, where arguments are exchanged through general-purpose/XMM
> registers instead of memory.
> This patch implements both and makes use of them.
> With this patch, pci-hyperv.c, hv_apic.c (__send_ipi_mask), and
> hyperv/mmu.c will use the (XMM) fast hypercall.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@gmail.com>
> ---
>  arch/x86/hyperv/hv_apic.c           |   3 +-
>  arch/x86/hyperv/mmu.c               |   4 +-
>  arch/x86/include/asm/hyperv-tlfs.h  |   1 +
>  arch/x86/include/asm/mshyperv.h     | 184 ++++++++++++++++++++++++++++++++++--
>  drivers/hv/hv.c                     |   3 +-
>  drivers/pci/controller/pci-hyperv.c |   5 +-
>  6 files changed, 186 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
> index 402338365651..c0cb8180b560 100644
> --- a/arch/x86/hyperv/hv_apic.c
> +++ b/arch/x86/hyperv/hv_apic.c
> @@ -173,7 +173,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
>  		__set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask);
>  	}
>
> -	ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL);
> +	ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, sizeof(*ipi_arg),
> +			      NULL, 0);
>
>  ipi_mask_done:
>  	local_irq_restore(flags);
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> index de27615c51ea..21bf609d01db 100644
> --- a/arch/x86/hyperv/mmu.c
> +++ b/arch/x86/hyperv/mmu.c
> @@ -112,11 +112,11 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
>  	if (info->end == TLB_FLUSH_ALL) {
>  		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
>  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> -					 flush, NULL);
> +					 flush, sizeof(*flush), NULL, 0);
>  	} else if (info->end &&
>  		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
>  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> -					 flush, NULL);
> +					 flush, sizeof(*flush), NULL, 0);
>  	} else {
>  		gva_n = fill_gva_list(flush->gva_list, 0,
>  				      info->start, info->end);
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index b8c89265baf0..267112b24a0c 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -123,6 +123,7 @@
>   * registers is available
>   */
>  #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4)
> +#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE		(1 << 15)
>  /* Support for a virtual guest idle state is available */
>  #define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5)
>  /* Guest crash data handler available */
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index 5a7375ed5f7c..12f7d5dbd993 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -8,6 +8,7 @@
>  #include <asm/io.h>
>  #include <asm/hyperv-tlfs.h>
>  #include <asm/nospec-branch.h>
> +#include <asm/fpu/api.h>
>
>  #define VP_INVAL	U32_MAX
>
> @@ -125,16 +126,13 @@ extern struct clocksource *hyperv_cs;
>  extern void *hv_hypercall_pg;
>  extern void  __percpu  **hyperv_pcpu_input_arg;
>
> -static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> +static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)
>  {
>  	u64 input_address = input ? virt_to_phys(input) : 0;
>  	u64 output_address = output ? virt_to_phys(output) : 0;
>  	u64 hv_status;
>
>  #ifdef CONFIG_X86_64
> -	if (!hv_hypercall_pg)
> -		return U64_MAX;
> -
>  	__asm__ __volatile__("mov %4, %%r8\n"
>  			     CALL_NOSPEC
>  			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> @@ -148,9 +146,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
>  	u32 output_address_hi = upper_32_bits(output_address);
>  	u32 output_address_lo = lower_32_bits(output_address);
>
> -	if (!hv_hypercall_pg)
> -		return U64_MAX;
> -
>  	__asm__ __volatile__(CALL_NOSPEC
>  			     : "=A" (hv_status),
>  			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
> @@ -194,6 +189,43 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
>  		return hv_status;
>  }
>
> +/* Fast hypercall with 16 bytes of input and no output */
> +static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
> +{
> +	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
> +
> +#ifdef CONFIG_X86_64
> +	{
> +		__asm__ __volatile__("mov %4, %%r8\n"
> +				     CALL_NOSPEC
> +				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> +				       "+c" (control), "+d" (input1)
> +				     : "r" (input2),
> +				       THUNK_TARGET(hv_hypercall_pg)
> +				     : "cc", "r8", "r9", "r10", "r11");
> +	}
> +#else
> +	{
> +		u32 input1_hi = upper_32_bits(input1);
> +		u32 input1_lo = lower_32_bits(input1);
> +		u32 input2_hi = upper_32_bits(input2);
> +		u32 input2_lo = lower_32_bits(input2);
> +
> +		__asm__ __volatile__ (CALL_NOSPEC
> +				      : "=A"(hv_status),
> +					"+c"(input1_lo),
> +					ASM_CALL_CONSTRAINT
> +				      : "A" (control),
> +					"b" (input1_hi),
> +					"D" (input2_hi),
> +					"S" (input2_lo),
> +					THUNK_TARGET(hv_hypercall_pg)
> +				      : "cc", "edi", "esi");
> +	}
> +#endif
> +	return hv_status;
> +}
> +
>  /*
>   * Rep hypercalls. Callers of this functions are supposed to ensure that
>   * rep_count and varhead_size comply with Hyper-V hypercall definition.
> @@ -209,7 +241,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
>  	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
>
>  	do {
> -		status = hv_do_hypercall(control, input, output);
> +		status = __hv_do_hypercall(control, input, output);
>  		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
>  			return status;
>
> @@ -226,6 +258,142 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
>  	return status;
>  }
>
> +#define HV_XMM_BYTE_MAX	112
> +
> +/* ibytes = fixed header size + var header size + data size in bytes */
> +static inline u64 hv_do_xmm_fast_hypercall(
> +	u64 control, void *input, size_t ibytes, void *output, size_t obytes)
> +{
> +	u64 status;
> +	u64 input0;
> +	u64 input1;
> +	u64 tmp[12] __aligned(16);
> +
> +	BUG_ON(ibytes <= 16);
> +	BUG_ON(roundup(ibytes, 16) + obytes > HV_XMM_BYTE_MAX);
> +
> +	control |= HV_HYPERCALL_FAST_BIT;
> +
> +	/* it's assumed that there are at least two inputs (ibytes > 16) */
> +	input0 = ((u64 *)input)[0];
> +	input1 = ((u64 *)input)[1];
> +	/*
> +	 * If the input were aligned and long enough, we could avoid this
> +	 * copy and load it into the XMM registers directly.  That is the
> +	 * case when hyperv_pcpu_input_arg is used.
> +	 */
> +	memcpy(tmp, (u8 *)input + 16, ibytes - 16);
> +	memset((u8 *)tmp + (ibytes - 16), 0, sizeof(tmp) - (ibytes - 16));
> +
> +	kernel_fpu_begin();
> +	if (ibytes > 2 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (tmp[0]));
> +	if (ibytes > 4 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (tmp[2]));
> +	if (ibytes > 6 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (tmp[4]));
> +	if (ibytes > 8 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (tmp[6]));
> +	if (ibytes > 10 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (tmp[8]));
> +	if (ibytes > 12 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (tmp[10]));
> +

I'm afraid this won't speed things up. XMM hypercalls are faster because
the hypervisor doesn't need to read data from the guest's memory and
write it back there. Here, we actually do more work.

Another thing is kernel_fpu_begin(). In particular, in case the FPU was
initialized we'll have to save its state and restore it later. This is
also expensive. I would even suggest we check
current->thread.fpu.initialized and do a regular hypercall in case it
is set.
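
A minimal sketch of that check (not code from the thread; untested, and
it assumes the 4.18-era struct fpu layout, where thread.fpu.initialized
flags live user FPU state; the helper name is made up):

/*
 * Untested sketch: prefer the memory-based hypercall while the current
 * task's FPU state is live, so the XMM fast path's kernel_fpu_begin()
 * never has to xsave/xrstor user state.  struct fpu is reachable via
 * <asm/processor.h>, current via <linux/sched.h>.
 */
static inline bool hv_xmm_fast_is_cheap(void)
{
	return !current->thread.fpu.initialized;
}

hv_do_hypercall() would then take the XMM path only when
hv_xmm_fast_is_cheap() returns true and fall back to __hv_do_hypercall()
otherwise.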

Did you try to benchmark your solution? Use e.g. the IPI benchmark with
PV IPIs enabled: https://lkml.org/lkml/2017/12/11/364
(this will always have the FPU uninitialized; we will also need something
else, like PV TLB shootdown from a process that uses the FPU).

> [snip: remainder of the patch quoted without further comments; see the full patch below]
Isaku Yamahata Aug. 13, 2018, 5:52 p.m. UTC | #2
On Fri, Aug 10, 2018 at 10:33:00AM +0200,
Vitaly Kuznetsov <vkuznets@redhat.com> wrote:

> I'm afraid this won't speed things up. XMM hypercalls are faster because
> the hypervisor doesn't need to read data from the guest's memory and
> write it back there. Here, we actually do more work.
> 
> Another thing is kernel_fpu_begin(). In particular, in case the FPU was
> initialized we'll have to save its state and restore it later. This is
> also expensive. I would even suggest we check
> current->thread.fpu.initialized and do a regular hypercall in case it
> is set.
> 
> Did you try to benchmark your solution? Use e.g. the IPI benchmark with
> PV IPIs enabled: https://lkml.org/lkml/2017/12/11/364
> (this will always have the FPU uninitialized; we will also need something
> else, like PV TLB shootdown from a process that uses the FPU).

Hi Vitaly. I haven't benchmarked it yet.
As you mentioned, the patch needs to be polished to do less,
especially to avoid xsave/xrstor.
Before going for such complication, I wanted to post the patch early
and benchmark it.
I should have marked it as RFC or v1.

I've been pulled onto other work for now; I'll get back to benchmarking
it and refining it unless you (or someone else) do it before me.
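
One possible shape of that polishing, as a purely illustrative sketch
(assumptions: eager FPU so touching XMM cannot trap, preemption kept
disabled by the caller, and made-up names; the 8-byte "m" operands
mirror the movdqa operand style of the patch below):

/*
 * Sketch only: save and restore just the six clobbered XMM registers
 * instead of a full kernel_fpu_begin()/kernel_fpu_end(), avoiding
 * xsave/xrstor entirely.
 */
struct hv_xmm_state {
	u64 reg[12] __aligned(16);	/* XMM0-XMM5, 16 bytes each */
};

static inline void hv_save_xmm(struct hv_xmm_state *s)
{
	__asm__ __volatile__("movdqa %%xmm0, %0\n\t"
			     "movdqa %%xmm1, %1\n\t"
			     "movdqa %%xmm2, %2\n\t"
			     "movdqa %%xmm3, %3\n\t"
			     "movdqa %%xmm4, %4\n\t"
			     "movdqa %%xmm5, %5"
			     : "=m" (s->reg[0]), "=m" (s->reg[2]),
			       "=m" (s->reg[4]), "=m" (s->reg[6]),
			       "=m" (s->reg[8]), "=m" (s->reg[10]));
}

static inline void hv_restore_xmm(const struct hv_xmm_state *s)
{
	__asm__ __volatile__("movdqa %0, %%xmm0\n\t"
			     "movdqa %1, %%xmm1\n\t"
			     "movdqa %2, %%xmm2\n\t"
			     "movdqa %3, %%xmm3\n\t"
			     "movdqa %4, %%xmm4\n\t"
			     "movdqa %5, %%xmm5"
			     : /* no outputs */
			     : "m" (s->reg[0]), "m" (s->reg[2]),
			       "m" (s->reg[4]), "m" (s->reg[6]),
			       "m" (s->reg[8]), "m" (s->reg[10]));
}

hv_do_xmm_fast_hypercall() would call hv_save_xmm() where it now calls
kernel_fpu_begin(), and hv_restore_xmm() in place of kernel_fpu_end(),
after any output has been copied out of the XMM registers.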

Patch

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 402338365651..c0cb8180b560 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -173,7 +173,8 @@  static bool __send_ipi_mask(const struct cpumask *mask, int vector)
 		__set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask);
 	}
 
-	ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL);
+	ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, sizeof(*ipi_arg),
+			      NULL, 0);
 
 ipi_mask_done:
 	local_irq_restore(flags);
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index de27615c51ea..21bf609d01db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -112,11 +112,11 @@  static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 	if (info->end == TLB_FLUSH_ALL) {
 		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
 		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
-					 flush, NULL);
+					 flush, sizeof(*flush), NULL, 0);
 	} else if (info->end &&
 		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
 		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
-					 flush, NULL);
+					 flush, sizeof(*flush), NULL, 0);
 	} else {
 		gva_n = fill_gva_list(flush->gva_list, 0,
 				      info->start, info->end);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index b8c89265baf0..267112b24a0c 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -123,6 +123,7 @@ 
  * registers is available
  */
 #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4)
+#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE		(1 << 15)
 /* Support for a virtual guest idle state is available */
 #define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5)
 /* Guest crash data handler available */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 5a7375ed5f7c..12f7d5dbd993 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -8,6 +8,7 @@ 
 #include <asm/io.h>
 #include <asm/hyperv-tlfs.h>
 #include <asm/nospec-branch.h>
+#include <asm/fpu/api.h>
 
 #define VP_INVAL	U32_MAX
 
@@ -125,16 +126,13 @@  extern struct clocksource *hyperv_cs;
 extern void *hv_hypercall_pg;
 extern void  __percpu  **hyperv_pcpu_input_arg;
 
-static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
+static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)
 {
 	u64 input_address = input ? virt_to_phys(input) : 0;
 	u64 output_address = output ? virt_to_phys(output) : 0;
 	u64 hv_status;
 
 #ifdef CONFIG_X86_64
-	if (!hv_hypercall_pg)
-		return U64_MAX;
-
 	__asm__ __volatile__("mov %4, %%r8\n"
 			     CALL_NOSPEC
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
@@ -148,9 +146,6 @@  static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	u32 output_address_hi = upper_32_bits(output_address);
 	u32 output_address_lo = lower_32_bits(output_address);
 
-	if (!hv_hypercall_pg)
-		return U64_MAX;
-
 	__asm__ __volatile__(CALL_NOSPEC
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
@@ -194,6 +189,43 @@  static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		return hv_status;
 }
 
+/* Fast hypercall with 16 bytes of input and no output */
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+#ifdef CONFIG_X86_64
+	{
+		__asm__ __volatile__("mov %4, %%r8\n"
+				     CALL_NOSPEC
+				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+				       "+c" (control), "+d" (input1)
+				     : "r" (input2),
+				       THUNK_TARGET(hv_hypercall_pg)
+				     : "cc", "r8", "r9", "r10", "r11");
+	}
+#else
+	{
+		u32 input1_hi = upper_32_bits(input1);
+		u32 input1_lo = lower_32_bits(input1);
+		u32 input2_hi = upper_32_bits(input2);
+		u32 input2_lo = lower_32_bits(input2);
+
+		__asm__ __volatile__ (CALL_NOSPEC
+				      : "=A"(hv_status),
+					"+c"(input1_lo),
+					ASM_CALL_CONSTRAINT
+				      : "A" (control),
+					"b" (input1_hi),
+					"D" (input2_hi),
+					"S" (input2_lo),
+					THUNK_TARGET(hv_hypercall_pg)
+				      : "cc", "edi", "esi");
+	}
+#endif
+	return hv_status;
+}
+
 /*
  * Rep hypercalls. Callers of this functions are supposed to ensure that
  * rep_count and varhead_size comply with Hyper-V hypercall definition.
@@ -209,7 +241,7 @@  static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
 
 	do {
-		status = hv_do_hypercall(control, input, output);
+		status = __hv_do_hypercall(control, input, output);
 		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
 			return status;
 
@@ -226,6 +258,142 @@  static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 	return status;
 }
 
+#define HV_XMM_BYTE_MAX	112
+
+/* ibytes = fixed header size + var header size + data size in bytes */
+static inline u64 hv_do_xmm_fast_hypercall(
+	u64 control, void *input, size_t ibytes, void *output, size_t obytes)
+{
+	u64 status;
+	u64 input0;
+	u64 input1;
+	u64 tmp[12] __aligned(16);
+
+	BUG_ON(ibytes <= 16);
+	BUG_ON(roundup(ibytes, 16) + obytes > HV_XMM_BYTE_MAX);
+
+	control |= HV_HYPERCALL_FAST_BIT;
+
+	/* it's assumed that there are at least two inputs (ibytes > 16) */
+	input0 = ((u64 *)input)[0];
+	input1 = ((u64 *)input)[1];
+	/*
+	 * If the input were aligned and long enough, we could avoid this
+	 * copy and load it into the XMM registers directly.  That is the
+	 * case when hyperv_pcpu_input_arg is used.
+	 */
+	memcpy(tmp, (u8 *)input + 16, ibytes - 16);
+	memset((u8 *)tmp + (ibytes - 16), 0, sizeof(tmp) - (ibytes - 16));
+
+	kernel_fpu_begin();
+	if (ibytes > 2 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (tmp[0]));
+	if (ibytes > 4 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (tmp[2]));
+	if (ibytes > 6 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (tmp[4]));
+	if (ibytes > 8 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (tmp[6]));
+	if (ibytes > 10 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (tmp[8]));
+	if (ibytes > 12 * 8)
+		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (tmp[10]));
+
+#ifdef CONFIG_X86_64
+	__asm__ __volatile__("mov %4, %%r8\n"
+			     CALL_NOSPEC
+			     : "=a" (status), ASM_CALL_CONSTRAINT,
+			       "+c" (control), "+d" (input0)
+			     : "r" (input1),
+			       THUNK_TARGET(hv_hypercall_pg)
+			     : "cc", "r8", "r9", "r10", "r11");
+#else
+	{
+		u32 input0_hi = upper_32_bits(input0);
+		u32 input0_lo = lower_32_bits(input0);
+		u32 input1_hi = upper_32_bits(input1);
+		u32 input1_lo = lower_32_bits(input1);
+
+		__asm__ __volatile__ (CALL_NOSPEC
+				      : "=A"(status),
+					"+c"(input0_lo),
+					ASM_CALL_CONSTRAINT
+				      :	"A" (control),
+					"b" (input0_hi),
+					"D"(input1_hi),
+					"S"(input1_lo),
+					THUNK_TARGET(hv_hypercall_pg)
+				      : "cc", "edi", "esi");
+	}
+#endif
+	if (output) {
+		size_t i_end = roundup(ibytes, 16);
+		size_t o_end = i_end + obytes;
+
+		if (i_end <= 2 * 8 && 2 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm0, %0" : "=m" (tmp[0]));
+		if (i_end <= 4 * 8 && 4 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm1, %0" : "=m" (tmp[2]));
+		if (i_end <= 6 * 8 && 6 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm2, %0" : "=m" (tmp[4]));
+		if (i_end <= 8 * 8 && 8 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm3, %0" : "=m" (tmp[6]));
+		if (i_end <= 10 * 8 && 10 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm4, %0" : "=m" (tmp[8]));
+		if (i_end <= 12 * 8 && 12 * 8 < o_end)
+			__asm__ __volatile__(
+				"movdqa %%xmm5, %0" : "=m" (tmp[10]));
+		memcpy(output, (u8 *)tmp + i_end - 16, obytes);
+	}
+	kernel_fpu_end();
+	return status;
+}
+
+static inline u64 hv_do_hypercall(
+	u64 control, void *input, size_t ibytes, void *output, size_t obytes)
+{
+	if (unlikely(!hv_hypercall_pg))
+		return U64_MAX;
+
+	/* fast hypercall */
+	if (output == NULL && ibytes <= 16) {
+		u64 tmp[2];
+
+		if (ibytes <= 8) {
+			tmp[0] = 0;
+			memcpy(tmp, input, ibytes);
+			return hv_do_fast_hypercall8((u16)control, tmp[0]);
+		}
+
+		tmp[1] = 0;
+		memcpy(tmp, input, ibytes);
+		return hv_do_fast_hypercall16((u16)control, tmp[0], tmp[1]);
+	}
+
+	/* xmm fast hypercall */
+	if (static_cpu_has(X86_FEATURE_XMM) &&
+	    ms_hyperv.features & HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE &&
+	    roundup(ibytes, 16) + obytes <= HV_XMM_BYTE_MAX) {
+		if (output) {
+			if (ms_hyperv.features &
+			    HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE)
+				return hv_do_xmm_fast_hypercall(
+					control, input, ibytes, output, obytes);
+		} else {
+			WARN_ON(obytes > 0);
+			return hv_do_xmm_fast_hypercall(
+				control, input, ibytes, output, obytes);
+		}
+	}
+
+	return __hv_do_hypercall(control, input, output);
+}
+
 /*
  * Hypervisor's notion of virtual processor ID is different from
  * Linux' notion of CPU ID. This information can only be retrieved
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 658dc765753b..0c7b7a88fdcd 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -92,7 +92,8 @@  int hv_post_message(union hv_connection_id connection_id,
 	aligned_msg->payload_size = payload_size;
 	memcpy((void *)aligned_msg->payload, payload, payload_size);
 
-	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
+	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg,
+				 sizeof(*aligned_msg), NULL, 0);
 
 	/* Preemption must remain disabled until after the hypercall
 	 * so some other thread can't get scheduled onto this cpu and
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index f6325f1a89e8..7c229009daba 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -979,8 +979,9 @@  static void hv_irq_unmask(struct irq_data *data)
 		}
 	}
 
-	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
-			      params, NULL);
+	res = hv_do_hypercall(
+		HVCALL_RETARGET_INTERRUPT | (var_size << 17),
+		params, sizeof(*params) + var_size * 8, NULL, 0);
 
 exit_unlock:
 	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);