diff mbox

[2/6] KVM: use kvm_{test,clear}_request instead of {test,clear}_bit

Message ID 20170406202056.18379-3-rkrcmar@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Radim Krčmář April 6, 2017, 8:20 p.m. UTC
Users were expected to use kvm_check_request() for testing and clearing,
but request have expanded their use since then and some users want to
only test or do a faster clear.

Make sure that requests are not directly accessed with bit operations, because
we'll be clearing them later.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/mips/kvm/emulate.c           |  2 +-
 arch/powerpc/kvm/book3s_pr.c      |  2 +-
 arch/powerpc/kvm/book3s_pr_papr.c |  2 +-
 arch/powerpc/kvm/booke.c          |  4 ++--
 arch/powerpc/kvm/powerpc.c        |  2 +-
 arch/s390/kvm/kvm-s390.c          |  2 +-
 arch/x86/kvm/vmx.c                |  2 +-
 arch/x86/kvm/x86.c                |  8 ++++----
 include/linux/kvm_host.h          | 14 ++++++++++++--
 9 files changed, 24 insertions(+), 14 deletions(-)

Comments

Christian Borntraeger April 7, 2017, 10:55 a.m. UTC | #1
On 04/06/2017 10:20 PM, Radim Krčmář wrote:
> Users were expected to use kvm_check_request() for testing and clearing,
> but request have expanded their use since then and some users want to
> only test or do a faster clear.
> 
> Make sure that requests are not directly accessed with bit operations, because
> we'll be clearing them later.
> 
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>

Patch itself looks sane
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>


one question:
>  static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>  {
> -	if (test_bit(req, &vcpu->requests)) {
> -		clear_bit(req, &vcpu->requests);
> +	if (kvm_test_request(req, vcpu)) {
> +		kvm_clear_request(req, vcpu);

This looks fine. I am just asking myself why we do not use
test_and_clear_bit? Do we expect gcc to merge all test bits as
a fast path? This does not seem to work as far as I can tell and
almost everybody does a fast path like in



arch/s390/kvm/kvm-s390.c:
        if (!vcpu->requests)
                return 0;

arch/x86/kvm/x86.c:
    if (vcpu->requests) {

> 
>  		/*
>  		 * Ensure the rest of the request is visible to kvm_check_request's
>
Radim Krčmář April 7, 2017, 12:24 p.m. UTC | #2
2017-04-07 12:55+0200, Christian Borntraeger:
> On 04/06/2017 10:20 PM, Radim Krčmář wrote:
>>  static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>>  {
>> -	if (test_bit(req, &vcpu->requests)) {
>> -		clear_bit(req, &vcpu->requests);
>> +	if (kvm_test_request(req, vcpu)) {
>> +		kvm_clear_request(req, vcpu);
> 
> This looks fine. I am just asking myself why we do not use
> test_and_clear_bit? Do we expect gcc to merge all test bits as
> a fast path? This does not seem to work as far as I can tell and
> almost everybody does a fast path like in

test_and_clear_bit() is a slower operation even if the test is false (at
least on x86), because it needs to be fully atomic.

> arch/s390/kvm/kvm-s390.c:
>         if (!vcpu->requests)
>                 return 0;
> 
> arch/x86/kvm/x86.c:
>     if (vcpu->requests) {

We'll mostly have only one request set, so splitting the test_and_clear
improves the performance of many subsequent tests_and_clear()s even if
the compiler doesn't optimize.

GCC couldn't even optimize if we used test_and_clear_bit(), because that
instruction adds barriers, but the forward check for vcpu->requests is
there because we do not trust the optimizer to do it for us and it would
make a big difference.
Radim Krčmář April 7, 2017, 2:05 p.m. UTC | #3
2017-04-07 14:24+0200, Radim Krčmář:
> 2017-04-07 12:55+0200, Christian Borntraeger:
>> On 04/06/2017 10:20 PM, Radim Krčmář wrote:
>>>  static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
>>>  {
>>> -	if (test_bit(req, &vcpu->requests)) {
>>> -		clear_bit(req, &vcpu->requests);
>>> +	if (kvm_test_request(req, vcpu)) {
>>> +		kvm_clear_request(req, vcpu);
>> 
>> This looks fine. I am just asking myself why we do not use
>> test_and_clear_bit? Do we expect gcc to merge all test bits as
>> a fast path? This does not seem to work as far as I can tell and
>> almost everybody does a fast path like in
> 
> test_and_clear_bit() is a slower operation even if the test is false (at
> least on x86), because it needs to be fully atomic.
> 
>> arch/s390/kvm/kvm-s390.c:
>>         if (!vcpu->requests)
>>                 return 0;
>> 
>> arch/x86/kvm/x86.c:
>>     if (vcpu->requests) {
> 
> We'll mostly have only one request set, so splitting the test_and_clear
> improves the performance of many subsequent tests_and_clear()s even if
> the compiler doesn't optimize.
> 
> GCC couldn't even optimize if we used test_and_clear_bit(), because that
> instruction adds barriers, but the forward check for vcpu->requests is
> there because we do not trust the optimizer to do it for us and it would
> make a big difference.

Ugh, I started thinking that bitops are not atomic because I looked at
wrong boot/bitops.h by mistake.  The compiler cannot merge test_bit()s,
but the speed difference holds.
diff mbox

Patch

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index d40cfaad4529..aee5ba5840af 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -865,7 +865,7 @@  enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 		 * check if any I/O interrupts are pending.
 		 */
 		if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
-			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		}
 	}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d4dfc0ca2a44..d8ace42f34a6 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -349,7 +349,7 @@  static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 	if (msr & MSR_POW) {
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
-			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 			vcpu->stat.halt_wakeup++;
 
 			/* Unset POW bit after we woke up */
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index f102616febc7..bcbeeb62dd13 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -344,7 +344,7 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 	case H_CEDE:
 		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
 	case H_LOGICAL_CI_LOAD:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0514cbd4e533..ab968f60d14c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -579,7 +579,7 @@  static void arm_next_watchdog(struct kvm_vcpu *vcpu)
 	 * userspace, so clear the KVM_REQ_WATCHDOG request.
 	 */
 	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
-		clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_WATCHDOG, vcpu);
 
 	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
 	nr_jiffies = watchdog_next_timeout(vcpu);
@@ -690,7 +690,7 @@  int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shared->msr & MSR_WE) {
 		local_irq_enable();
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		hard_irq_disable();
 
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 0e42aa8a279f..63bdf14d4389 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -232,7 +232,7 @@  int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	case EV_HCALL_TOKEN(EV_IDLE):
 		r = EV_SUCCESS;
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		break;
 	default:
 		r = EV_UNIMPLEMENTED;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 45b6d9ca5d24..ead4b476cce9 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2444,7 +2444,7 @@  static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 	}
 
 	/* nothing to do, just clear the request */
-	clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cfdb0d9389d1..24cb416aa84f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6317,7 +6317,7 @@  static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
 			return handle_interrupt_window(&vmx->vcpu);
 
-		if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
 			return 1;
 
 		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6bc47e2712c8..71a019832df9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1768,7 +1768,7 @@  static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 	/* guest entries allowed */
 	kvm_for_each_vcpu(i, vcpu, kvm)
-		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 
 	spin_unlock(&ka->pvclock_gtod_sync_lock);
 #endif
@@ -7045,7 +7045,7 @@  static int vcpu_run(struct kvm_vcpu *vcpu)
 		if (r <= 0)
 			break;
 
-		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
 		if (kvm_cpu_has_pending_timer(vcpu))
 			kvm_inject_pending_timer_irqs(vcpu);
 
@@ -7173,7 +7173,7 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
 		kvm_apic_accept_events(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		r = -EAGAIN;
 		goto out;
 	}
@@ -8383,7 +8383,7 @@  static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (atomic_read(&vcpu->arch.nmi_queued))
 		return true;
 
-	if (test_bit(KVM_REQ_SMI, &vcpu->requests))
+	if (kvm_test_request(KVM_REQ_SMI, vcpu))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7e74ae4d99bb..51737d6401ae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1079,10 +1079,20 @@  static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 	set_bit(req, &vcpu->requests);
 }
 
+static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_bit(req, &vcpu->requests);
+}
+
+static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
+{
+	clear_bit(req, &vcpu->requests);
+}
+
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
-	if (test_bit(req, &vcpu->requests)) {
-		clear_bit(req, &vcpu->requests);
+	if (kvm_test_request(req, vcpu)) {
+		kvm_clear_request(req, vcpu);
 
 		/*
 		 * Ensure the rest of the request is visible to kvm_check_request's