
[5/7] KVM: x86: Implement KVM_HYPERV_SET_TLB_FLUSH_INHIBIT

Message ID 20241004140810.34231-6-nikwip@amazon.de (mailing list archive)
State New, archived
Series KVM: x86: Introduce new ioctl KVM_HYPERV_SET_TLB_FLUSH_INHIBIT

Commit Message

Nikolas Wipper Oct. 4, 2024, 2:08 p.m. UTC
Implement KVM_HYPERV_SET_TLB_FLUSH_INHIBIT for x86. Apart from setting/
clearing the internal TLB flush inhibit flag, this ioctl also wakes up
vCPUs that are suspended and waiting on this vCPU.

When the flag is set, a vCPU trying to flush the inhibited vCPU's TLB with
a Hyper-V hyper-call is suspended until the bit is cleared again. The
hyper-call finishes internally, but the instruction isn't skipped, and
execution doesn't return to the guest. This behaviour and the suspended
state itself are specified in Microsoft's "Hypervisor Top Level Functional
Specification" (TLFS).

To maintain thread safety during suspension and unsuspension, the ioctl
uses the KVM SRCU. After flipping the flush inhibit flag, it synchronizes
SRCU to ensure that all in-flight TLB flush attempts that saw the old state
finish before the ioctl exits. If the flag was changed to inhibit new TLB
flushes, this guarantees that no TLB flushes happen once the ioctl exits.
If the flag was changed to no longer inhibit TLB flushes, the
synchronization ensures that all suspended vCPUs finished entering the
suspended state properly, so they can be unsuspended again.

Once TLB flushes are no longer inhibited, all suspended vCPUs are
unsuspended.
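
A minimal userspace sketch of driving the ioctl (illustrative only; the
uAPI struct is introduced earlier in this series, and the 'flags' field
and exact types shown here are assumptions):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Inhibit (or release) Hyper-V TLB flushes targeting this vCPU. */
	static int set_tlb_flush_inhibit(int vcpu_fd, uint8_t inhibit)
	{
		struct kvm_hyperv_tlb_flush_inhibit set = {
			.flags = 0,		/* reserved, assumed zero */
			.inhibit = inhibit,	/* 1 = inhibit, 0 = release */
		};

		return ioctl(vcpu_fd, KVM_HYPERV_SET_TLB_FLUSH_INHIBIT, &set);
	}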

Signed-off-by: Nikolas Wipper <nikwip@amazon.de>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/hyperv.c           | 22 ++++++++++++++++++++
 arch/x86/kvm/x86.c              | 37 +++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)

Comments

Vitaly Kuznetsov Oct. 10, 2024, 8:57 a.m. UTC | #1
Nikolas Wipper <nikwip@amazon.de> writes:

> Implement KVM_HYPERV_SET_TLB_FLUSH_INHIBIT for x86. Apart from setting/
> clearing the internal TLB flush inhibit flag, this ioctl also wakes up
> vCPUs that are suspended and waiting on this vCPU.
>
> When the flag is set, a vCPU trying to flush the inhibited vCPU's TLB with
> a Hyper-V hyper-call is suspended until the bit is cleared again. The
> hyper-call finishes internally, but the instruction isn't skipped, and
> execution doesn't return to the guest. This behaviour and the suspended
> state itself are specified in Microsoft's "Hypervisor Top Level Functional
> Specification" (TLFS).
>
> To maintain thread safety during suspension and unsuspension, the ioctl
> uses the KVM SRCU. After flipping the flush inhibit flag, it synchronizes
> SRCU to ensure that all in-flight TLB flush attempts that saw the old state
> finish before the ioctl exits. If the flag was changed to inhibit new TLB
> flushes, this guarantees that no TLB flushes happen once the ioctl exits.
> If the flag was changed to no longer inhibit TLB flushes, the
> synchronization ensures that all suspended vCPUs finished entering the
> suspended state properly, so they can be unsuspended again.
>
> Once TLB flushes are no longer inhibited, all suspended vCPUs are
> unsuspended.
>
> Signed-off-by: Nikolas Wipper <nikwip@amazon.de>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/hyperv.c           | 22 ++++++++++++++++++++
>  arch/x86/kvm/x86.c              | 37 +++++++++++++++++++++++++++++++++
>  3 files changed, 61 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7571ac578884..ab3a9beb61a2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -698,6 +698,8 @@ struct kvm_vcpu_hv {
>  
>  	bool suspended;
>  	int waiting_on;
> +
> +	int tlb_flush_inhibit;

This is basically boolean, right? And we only make it 'int' to be able
to store 'u8' from the ioctl? This doesn't look very clean. Do you
envision anything but '1'/'0' in 'inhibit'? If not, maybe we can just
make it a flag (and e.g. extend 'flags' to be u32/u64)? This way we can
convert 'tlb_flush_inhibit' to a normal bool.
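
E.g. something along these lines (a sketch only, the flag name and field
widths are hypothetical):

	struct kvm_hyperv_tlb_flush_inhibit {
		__u32 flags;			/* widened from the current layout */
	#define KVM_HV_TLB_FLUSH_INHIBIT	(1 << 0)
		__u32 padding;
	};

	/* ... and in struct kvm_vcpu_hv: */
	bool tlb_flush_inhibit;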

>  };
>  
>  struct kvm_hypervisor_cpuid {
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index e68fbc0c7fc1..40ea8340838f 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -2137,6 +2137,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
>  
>  		kvm_for_each_vcpu(i, v, kvm) {
> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
> +				goto ret_suspend;
> +
>  			__set_bit(i, vcpu_mask);
>  		}
>  	} else if (!is_guest_mode(vcpu)) {
> @@ -2148,6 +2151,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  				__clear_bit(i, vcpu_mask);
>  				continue;
>  			}
> +
> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
> +				goto ret_suspend;
>  		}
>  	} else {
>  		struct kvm_vcpu_hv *hv_v;
> @@ -2175,6 +2181,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  						    sparse_banks))
>  				continue;
>  
> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
> +				goto ret_suspend;
> +

These READ_ONCEs make me think I misunderstand something here, please
bear with me :-).

Like we're trying to protect against 'tlb_flush_inhibit' being read
somewhere in the beginning of the function and want to generate real
memory accesses. But what happens if tlb_flush_inhibit changes right
_after_ we checked it here and _before_ we actually do
kvm_make_vcpus_request_mask()? Wouldn't it be a problem? In case it
would, I think we need to reverse the order: do
kvm_make_vcpus_request_mask() anyway and after it go through vcpu_mask
checking whether any of the affected vCPUs has 'tlb_flush_inhibit' and
if it does, suspend the caller.
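
Roughly (an untested sketch, reusing the names from this patch):

	kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);

	kvm_for_each_vcpu(i, v, kvm) {
		if (!test_bit(i, vcpu_mask))
			continue;
		/* Request is already queued, but the caller still suspends. */
		if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
			goto ret_suspend;
	}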

>  			__set_bit(i, vcpu_mask);
>  		}
>  	}
> @@ -2193,6 +2202,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
>  	return (u64)HV_STATUS_SUCCESS |
>  		((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
> +ret_suspend:
> +	kvm_hv_vcpu_suspend_tlb_flush(vcpu, v->vcpu_id);
> +	return -EBUSY;
>  }
>  
>  static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
> @@ -2380,6 +2392,13 @@ static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
>  	u32 tlb_lock_count = 0;
>  	int ret;
>  
> +	/*
> +	 * Reached when the hyper-call resulted in a suspension of the vCPU.
> +	 * The instruction will be re-tried once the vCPU is unsuspended.
> +	 */
> +	if (kvm_hv_vcpu_suspended(vcpu))
> +		return 1;
> +
>  	if (hv_result_success(result) && is_guest_mode(vcpu) &&
>  	    kvm_hv_is_tlb_flush_hcall(vcpu) &&
>  	    kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
> @@ -2919,6 +2938,9 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>  
>  void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)
>  {
> +	RCU_LOCKDEP_WARN(!srcu_read_lock_held(&vcpu->kvm->srcu),
> +			 "Suspicious Hyper-V TLB flush inhibit usage\n");
> +
>  	/* waiting_on's store should happen before suspended's */
>  	WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
>  	WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 18d0a300e79a..1f925e32a927 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4642,6 +4642,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_HYPERV_CPUID:
>  	case KVM_CAP_HYPERV_ENFORCE_CPUID:
>  	case KVM_CAP_SYS_HYPERV_CPUID:
> +	case KVM_CAP_HYPERV_TLB_FLUSH_INHIBIT:
>  #endif
>  	case KVM_CAP_PCI_SEGMENT:
>  	case KVM_CAP_DEBUGREGS:
> @@ -5853,6 +5854,31 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  	}
>  }
>  
> +static int kvm_vcpu_ioctl_set_tlb_flush_inhibit(struct kvm_vcpu *vcpu,
> +						struct kvm_hyperv_tlb_flush_inhibit *set)
> +{
> +	if (set->inhibit == READ_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit))
> +		return 0;
> +
> +	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, set->inhibit);

As you said before, vCPU ioctls are serialized and no one else sets
tlb_flush_inhibit, so do I understand correctly that
READ_ONCE()/WRITE_ONCE() are redundant here?

> +
> +	/*
> +	 * synchronize_srcu() ensures that:
> +	 * - On inhibit, all concurrent TLB flushes finished before this ioctl
> +	 *   exits.
> +	 * - On uninhibit, there are no longer vCPUs being suspended due to this
> +	 *   inhibit.
> +	 * This function can't race with itself, because vCPU IOCTLs are
> +	 * serialized, so if the inhibit bit is already the desired value this
> +	 * synchronization has already happened.
> +	 */
> +	synchronize_srcu(&vcpu->kvm->srcu);
> +	if (!set->inhibit)
> +		kvm_hv_vcpu_unsuspend_tlb_flush(vcpu);
> +
> +	return 0;
> +}
> +
>  long kvm_arch_vcpu_ioctl(struct file *filp,
>  			 unsigned int ioctl, unsigned long arg)
>  {
> @@ -6306,6 +6332,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  	case KVM_SET_DEVICE_ATTR:
>  		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
>  		break;
> +#ifdef CONFIG_KVM_HYPERV
> +	case KVM_HYPERV_SET_TLB_FLUSH_INHIBIT: {
> +		struct kvm_hyperv_tlb_flush_inhibit set;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&set, argp, sizeof(set)))
> +			goto out;
> +		r = kvm_vcpu_ioctl_set_tlb_flush_inhibit(vcpu, &set);
> +		break;
> +	}
> +#endif
>  	default:
>  		r = -EINVAL;
>  	}
Nikolas Wipper Oct. 14, 2024, 6:02 p.m. UTC | #2
On 10.10.24 10:57, Vitaly Kuznetsov wrote:
> Nikolas Wipper <nikwip@amazon.de> writes:
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 7571ac578884..ab3a9beb61a2 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -698,6 +698,8 @@ struct kvm_vcpu_hv {
>>
>>  	bool suspended;
>>  	int waiting_on;
>> +
>> +	int tlb_flush_inhibit;
>
> This is basically boolean, right? And we only make it 'int' to be able
> to store 'u8' from the ioctl? This doesn't look very clean. Do you
> envision anything but '1'/'0' in 'inhibit'? If not, maybe we can just
> make it a flag (and e.g. extend 'flags' to be u32/u64)? This way we can
> convert 'tlb_flush_inhibit' to a normal bool.
>

Yes, inhibit would always be binary, so incorporating it into the flags
sounds reasonable. Even with the current API, this could just be a bool
(tlb_flush_inhibit = inhibit == 1;)

>>  };
>>
>>  struct kvm_hypervisor_cpuid {
>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> index e68fbc0c7fc1..40ea8340838f 100644
>> --- a/arch/x86/kvm/hyperv.c
>> +++ b/arch/x86/kvm/hyperv.c
>> @@ -2137,6 +2137,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
>>
>>  		kvm_for_each_vcpu(i, v, kvm) {
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>> +
>>  			__set_bit(i, vcpu_mask);
>>  		}
>>  	} else if (!is_guest_mode(vcpu)) {
>> @@ -2148,6 +2151,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  				__clear_bit(i, vcpu_mask);
>>  				continue;
>>  			}
>> +
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>>  		}
>>  	} else {
>>  		struct kvm_vcpu_hv *hv_v;
>> @@ -2175,6 +2181,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  						    sparse_banks))
>>  				continue;
>>
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>> +
>
> These READ_ONCEs make me think I misunderstand something here, please
> bear with me :-).
>
> Like we're trying to protect against 'tlb_flush_inhibit' being read
> somewhere in the beginning of the function and want to generate real
> memory accesses. But what happens if tlb_flush_inhibit changes right
> _after_ we checked it here and _before_ we actually do
> kvm_make_vcpus_request_mask()? Wouldn't it be a problem? In case it
> would, I think we need to reverse the order: do
> kvm_make_vcpus_request_mask() anyway and after it go through vcpu_mask
> checking whether any of the affected vCPUs has 'tlb_flush_inhibit' and
> if it does, suspend the caller.
>

The case you're describing is prevented through SRCU synchronisation in
the ioctl. The hypercall actually holds an SRCU read-side critical section
during the whole of its execution, so if tlb_flush_inhibit changes after
we read it, the ioctl will wait for the flushes to complete:

vCPU 0                   | vCPU 1
-------------------------+------------------------
                         | hypercall enter
                         | srcu_read_lock()
ioctl enter              |
                         | tlb_flush_inhibit read
tlb_flush_inhibit write  |
synchronize_srcu() start |
                         | TLB flush reqs send
                         | srcu_read_unlock()
synchronize_srcu() end   |
ioctl exit               |
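
In simplified code, the pairing looks like this ('hv' is shorthand for
vcpu->arch.hyperv, and the helper names are made up):

	/* vCPU 1, hypercall path: runs entirely inside kvm->srcu. */
	idx = srcu_read_lock(&kvm->srcu);
	if (READ_ONCE(hv->tlb_flush_inhibit))
		suspend_caller();		/* placeholder */
	else
		send_tlb_flush_requests();	/* placeholder */
	srcu_read_unlock(&kvm->srcu, idx);

	/* vCPU 0, ioctl path: */
	WRITE_ONCE(hv->tlb_flush_inhibit, set->inhibit);
	synchronize_srcu(&kvm->srcu);	/* waits for every reader that saw the old value */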

>>  			__set_bit(i, vcpu_mask);
>>  		}
>>  	}
>> @@ -2193,6 +2202,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  	/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
>>  	return (u64)HV_STATUS_SUCCESS |
>>  		((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
>> +ret_suspend:
>> +	kvm_hv_vcpu_suspend_tlb_flush(vcpu, v->vcpu_id);
>> +	return -EBUSY;
>>  }
>>
>>  static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
>> @@ -2380,6 +2392,13 @@ static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
>>  	u32 tlb_lock_count = 0;
>>  	int ret;
>>
>> +	/*
>> +	 * Reached when the hyper-call resulted in a suspension of the vCPU.
>> +	 * The instruction will be re-tried once the vCPU is unsuspended.
>> +	 */
>> +	if (kvm_hv_vcpu_suspended(vcpu))
>> +		return 1;
>> +
>>  	if (hv_result_success(result) && is_guest_mode(vcpu) &&
>>  	    kvm_hv_is_tlb_flush_hcall(vcpu) &&
>>  	    kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
>> @@ -2919,6 +2938,9 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>>
>>  void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)
>>  {
>> +	RCU_LOCKDEP_WARN(!srcu_read_lock_held(&vcpu->kvm->srcu),
>> +			 "Suspicious Hyper-V TLB flush inhibit usage\n");
>> +
>>  	/* waiting_on's store should happen before suspended's */
>>  	WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
>>  	WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 18d0a300e79a..1f925e32a927 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4642,6 +4642,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  	case KVM_CAP_HYPERV_CPUID:
>>  	case KVM_CAP_HYPERV_ENFORCE_CPUID:
>>  	case KVM_CAP_SYS_HYPERV_CPUID:
>> +	case KVM_CAP_HYPERV_TLB_FLUSH_INHIBIT:
>>  #endif
>>  	case KVM_CAP_PCI_SEGMENT:
>>  	case KVM_CAP_DEBUGREGS:
>> @@ -5853,6 +5854,31 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>>  	}
>>  }
>>
>> +static int kvm_vcpu_ioctl_set_tlb_flush_inhibit(struct kvm_vcpu *vcpu,
>> +						struct kvm_hyperv_tlb_flush_inhibit *set)
>> +{
>> +	if (set->inhibit == READ_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit))
>> +		return 0;
>> +
>> +	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, set->inhibit);
>
> As you said before, vCPU ioctls are serialized and no one else sets
> tlb_flush_inhibit, so do I understand correctly that
> READ_ONCE()/WRITE_ONCE() are redundant here?
>

As mentioned before, since tlb_flush_inhibit is shared between vCPU
threads, it needs these calls.
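
There is a single serialized writer but concurrent readers on other vCPU
threads, which is exactly the pattern the ONCE accessors are for (both
lines below are taken from this patch):

	/* vCPU ioctl (serialized writer): */
	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, set->inhibit);

	/* Hyper-V TLB flush hypercall on any other vCPU (concurrent reader): */
	if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
		goto ret_suspend;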

Nikolas

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7571ac578884..ab3a9beb61a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -698,6 +698,8 @@  struct kvm_vcpu_hv {
 
 	bool suspended;
 	int waiting_on;
+
+	int tlb_flush_inhibit;
 };
 
 struct kvm_hypervisor_cpuid {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e68fbc0c7fc1..40ea8340838f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2137,6 +2137,9 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
 
 		kvm_for_each_vcpu(i, v, kvm) {
+			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
+				goto ret_suspend;
+
 			__set_bit(i, vcpu_mask);
 		}
 	} else if (!is_guest_mode(vcpu)) {
@@ -2148,6 +2151,9 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 				__clear_bit(i, vcpu_mask);
 				continue;
 			}
+
+			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
+				goto ret_suspend;
 		}
 	} else {
 		struct kvm_vcpu_hv *hv_v;
@@ -2175,6 +2181,9 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 						    sparse_banks))
 				continue;
 
+			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
+				goto ret_suspend;
+
 			__set_bit(i, vcpu_mask);
 		}
 	}
@@ -2193,6 +2202,9 @@  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
 	return (u64)HV_STATUS_SUCCESS |
 		((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+ret_suspend:
+	kvm_hv_vcpu_suspend_tlb_flush(vcpu, v->vcpu_id);
+	return -EBUSY;
 }
 
 static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
@@ -2380,6 +2392,13 @@  static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
 	u32 tlb_lock_count = 0;
 	int ret;
 
+	/*
+	 * Reached when the hyper-call resulted in a suspension of the vCPU.
+	 * The instruction will be re-tried once the vCPU is unsuspended.
+	 */
+	if (kvm_hv_vcpu_suspended(vcpu))
+		return 1;
+
 	if (hv_result_success(result) && is_guest_mode(vcpu) &&
 	    kvm_hv_is_tlb_flush_hcall(vcpu) &&
 	    kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
@@ -2919,6 +2938,9 @@  int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 
 void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)
 {
+	RCU_LOCKDEP_WARN(!srcu_read_lock_held(&vcpu->kvm->srcu),
+			 "Suspicious Hyper-V TLB flush inhibit usage\n");
+
 	/* waiting_on's store should happen before suspended's */
 	WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
 	WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18d0a300e79a..1f925e32a927 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4642,6 +4642,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_CPUID:
 	case KVM_CAP_HYPERV_ENFORCE_CPUID:
 	case KVM_CAP_SYS_HYPERV_CPUID:
+	case KVM_CAP_HYPERV_TLB_FLUSH_INHIBIT:
 #endif
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
@@ -5853,6 +5854,31 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	}
 }
 
+static int kvm_vcpu_ioctl_set_tlb_flush_inhibit(struct kvm_vcpu *vcpu,
+						struct kvm_hyperv_tlb_flush_inhibit *set)
+{
+	if (set->inhibit == READ_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit))
+		return 0;
+
+	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, set->inhibit);
+
+	/*
+	 * synchronize_srcu() ensures that:
+	 * - On inhibit, all concurrent TLB flushes finished before this ioctl
+	 *   exits.
+	 * - On uninhibit, there are no longer vCPUs being suspended due to this
+	 *   inhibit.
+	 * This function can't race with itself, because vCPU IOCTLs are
+	 * serialized, so if the inhibit bit is already the desired value this
+	 * synchronization has already happened.
+	 */
+	synchronize_srcu(&vcpu->kvm->srcu);
+	if (!set->inhibit)
+		kvm_hv_vcpu_unsuspend_tlb_flush(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -6306,6 +6332,17 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_SET_DEVICE_ATTR:
 		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
 		break;
+#ifdef CONFIG_KVM_HYPERV
+	case KVM_HYPERV_SET_TLB_FLUSH_INHIBIT: {
+		struct kvm_hyperv_tlb_flush_inhibit set;
+
+		r = -EFAULT;
+		if (copy_from_user(&set, argp, sizeof(set)))
+			goto out;
+		r = kvm_vcpu_ioctl_set_tlb_flush_inhibit(vcpu, &set);
+		break;
+	}
+#endif
 	default:
 		r = -EINVAL;
 	}