
[RFC,15/39] KVM: x86/xen: handle PV spinlocks slowpath

Message ID 20190220201609.28290-16-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series x86/KVM: Xen HVM guest support

Commit Message

Joao Martins Feb. 20, 2019, 8:15 p.m. UTC
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Add support for SCHEDOP_poll hypercall.

This implementation is optimized for polling for a single channel, which
is what Linux does. Polling for multiple channels is not especially
efficient (and has not been tested).

The PV spinlocks slow path uses this hypercall, and the guest explicitly
crashes if it's not supported.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |   3 ++
 arch/x86/kvm/xen.c              | 108 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)
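
For context, the argument to SCHEDOP_poll is Xen's struct sched_poll, and the
guest's PV spinlock slow path issues the hypercall roughly as sketched below
(the struct matches Xen's public sched.h; the caller is paraphrased from the
Linux guest side, so treat the helper name and details as illustrative):

/* Xen public ABI (xen/include/public/sched.h), shown for reference. */
struct sched_poll {
	XEN_GUEST_HANDLE(evtchn_port_t) ports;	/* guest array of ports to poll */
	unsigned int nr_ports;			/* entries in ports[] */
	uint64_t timeout;			/* timeout; 0 == wait forever */
};

/*
 * Roughly what the Linux PV spinlock slow path does: poll a single
 * event channel, and BUG() if SCHEDOP_poll is not supported (hence
 * "explicitly crashes" in the commit message above). Paraphrased from
 * drivers/xen/events; illustrative only.
 */
static void xen_poll_one_evtchn(evtchn_port_t evtchn, uint64_t timeout)
{
	struct sched_poll poll;

	poll.nr_ports = 1;
	poll.timeout = timeout;
	set_xen_guest_handle(poll.ports, &evtchn);

	if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
		BUG();
}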

Comments

David Woodhouse Feb. 8, 2022, 12:36 p.m. UTC | #1
On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
> From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> 
> Add support for SCHEDOP_poll hypercall.
> 
> This implementation is optimized for polling for a single channel, which
> is what Linux does. Polling for multiple channels is not especially
> efficient (and has not been tested).
> 
> The PV spinlocks slow path uses this hypercall, and the guest explicitly
> crashes if it's not supported.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> ---

...

> +static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
> +{
> +       struct kvm_vcpu_xen *vcpu_xen = vcpu_to_xen_vcpu(vcpu);
> +
> +       if ((vcpu_xen->poll_evtchn == port ||
> +            vcpu_xen->poll_evtchn == -1) &&
> +           test_and_clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask))
> +               wake_up(&vcpu_xen->sched_waitq);
> +}

...

> +	if (sched_poll.nr_ports == 1)
> +		vcpu_xen->poll_evtchn = port;
> +	else
> +		vcpu_xen->poll_evtchn = -1;
> +
> +	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports))
> +		wait_event_interruptible_timeout(
> +			 vcpu_xen->sched_waitq,
> +			 wait_pending_event(vcpu, sched_poll.nr_ports, ports),
> +			 sched_poll.timeout ?: KTIME_MAX);

Hm, this doesn't wake on other interrupts, does it? I think it should.
Shouldn't it basically be like HLT, with an additional wakeup when the
listed ports are triggered even when they're masked?

At https://git.infradead.org/users/dwmw2/linux.git/commitdiff/ddfbdf1af
I've tried to make it use kvm_vcpu_halt(), and kvm_xen_check_poller()
sets KVM_REQ_UNBLOCK when an event is delivered to a monitored port.

I haven't quite got it to work yet, but does it seem like a sane
approach?

+       if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+               vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+               kvm_vcpu_halt(vcpu);
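For illustration, the overall shape of that halt-based approach might look
something like the sketch below (hypothetical, not the actual commit;
wait_pending_event() is the helper from the original patch):

/*
 * Sketch: block via the normal halt path instead of a private waitqueue,
 * and have the event-delivery side kick the vCPU with KVM_REQ_UNBLOCK.
 */
static void kvm_xen_poll_block(struct kvm_vcpu *vcpu, int nr_ports,
			       evtchn_port_t *ports)
{
	if (!wait_pending_event(vcpu, nr_ports, ports)) {
		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
		kvm_vcpu_halt(vcpu);
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	}
}

/* Wake side: request an unblock rather than waking a waitqueue. */
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
	if ((vcpu->arch.xen.poll_evtchn == port ||
	     vcpu->arch.xen.poll_evtchn == -1) &&
	    test_and_clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask))
		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
}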
Joao Martins Feb. 10, 2022, 12:17 p.m. UTC | #2
On 2/8/22 12:36, David Woodhouse wrote:
> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>> From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>>
>> Add support for SCHEDOP_poll hypercall.
>>
>> This implementation is optimized for polling for a single channel, which
>> is what Linux does. Polling for multiple channels is not especially
>> efficient (and has not been tested).
>>
>> The PV spinlocks slow path uses this hypercall, and the guest explicitly
>> crashes if it's not supported.
>>
>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> ---
> 
> ...
> 
>> +static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
>> +{
>> +       struct kvm_vcpu_xen *vcpu_xen = vcpu_to_xen_vcpu(vcpu);
>> +
>> +       if ((vcpu_xen->poll_evtchn == port ||
>> +            vcpu_xen->poll_evtchn == -1) &&
>> +           test_and_clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask))
>> +               wake_up(&vcpu_xen->sched_waitq);
>> +}
> 
> ...
> 
>> +	if (sched_poll.nr_ports == 1)
>> +		vcpu_xen->poll_evtchn = port;
>> +	else
>> +		vcpu_xen->poll_evtchn = -1;
>> +
>> +	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports))
>> +		wait_event_interruptible_timeout(
>> +			 vcpu_xen->sched_waitq,
>> +			 wait_pending_event(vcpu, sched_poll.nr_ports, ports),
>> +			 sched_poll.timeout ?: KTIME_MAX);
> 
> Hm, this doesn't wake on other interrupts, does it? 

Hmm, I don't think so? This was meant specifically for polling on event
channels, not for sleeping or blocking.

> I think it should.
> Shouldn't it basically be like HLT, with an additional wakeup when the
> listed ports are triggered even when they're masked?
> 

I am actually not sure.

Quickly glancing at the Xen source, this hypercall doesn't appear to really
block the vCPU, but rather just looks at whether the evtchn ports are pending
and, if a timeout is specified, sets up a timer. And of course, it wakes any
evtchn pollers. But it doesn't appear to actually block the vCPU. IIRC, it
should be the functional equivalent of KVM_HC_VAPIC_POLL_IRQ, but for event
channels.


> At https://git.infradead.org/users/dwmw2/linux.git/commitdiff/ddfbdf1af
> I've tried to make it use kvm_vcpu_halt(), and kvm_xen_check_poller()
> sets KVM_REQ_UNBLOCK when an event is delivered to a monitored port.
> 
> I haven't quite got it to work yet, but does it seem like a sane
> approach?
> 

	Joao
David Woodhouse Feb. 10, 2022, 2:11 p.m. UTC | #3
On Thu, 2022-02-10 at 12:17 +0000, Joao Martins wrote:
> On 2/8/22 12:36, David Woodhouse wrote:
> > > +	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports))
> > > +		wait_event_interruptible_timeout(
> > > +			 vcpu_xen->sched_waitq,
> > > +			 wait_pending_event(vcpu, sched_poll.nr_ports, ports),
> > > +			 sched_poll.timeout ?: KTIME_MAX);
> > 
> > Hm, this doesn't wake on other interrupts, does it? 
> 
> Hmm, I don't think so? This was meant specifically for polling on event
> channels, not for sleeping or blocking.
> 
> > I think it should.
> > Shouldn't it basically be like HLT, with an additional wakeup when the
> > listed ports are triggered even when they're masked?
> > 
> 
> I am actually not sure.
> 
> Quickly glancing at the Xen source, this hypercall doesn't appear to really
> block the vCPU, but rather just looks at whether the evtchn ports are pending
> and, if a timeout is specified, sets up a timer. And of course, it wakes any
> evtchn pollers. But it doesn't appear to actually block the vCPU. IIRC, it
> should be the functional equivalent of KVM_HC_VAPIC_POLL_IRQ, but for event
> channels.
> 

It does block.

https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/common/sched/core.c;hb=RELEASE-4.14.4#l1385

It sets the _VPF_blocked bit in the vCPU's pause_flags, and then
raise_softirq(SCHEDULE_SOFTIRQ) causes the scheduler to deschedule this vCPU
once it gets back to the return-to-guest path. It's not Linux, so you don't
see a schedule() call :)

It'll remain blocked until either an (unmasked) event channel or HVM
IRQ is asserted, or one of the masked event channels listed in the poll
list is raised.

Which is why I tried to tie it into the kvm_vcpu_halt() code path when
I updated the patch.
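
To make that ordering concrete, here is a simplified paraphrase of Xen's
do_poll() (common/sched/core.c) — not verbatim Xen code, with the guest-copy
and validation details elided:

static long do_poll_outline(struct sched_poll *sched_poll)
{
	struct vcpu *v = current;
	evtchn_port_t port = 0;
	unsigned int i;

	/* Block first, then check: an event delivered in the window sees
	 * the poll state and wakes us, so no wakeup is lost. */
	set_bit(_VPF_blocked, &v->pause_flags);
	v->poll_evtchn = -1;
	set_bit(v->vcpu_id, v->domain->poll_mask);

	for (i = 0; i < sched_poll->nr_ports; i++) {
		/* ... copy and validate port i from guest memory ... */
		if (evtchn_port_is_pending(v->domain, port))
			goto out;	/* already pending: unblock, return */
	}

	if (sched_poll->nr_ports == 1)
		v->poll_evtchn = port;	/* wake only on this (maskable) port */

	if (sched_poll->timeout != 0)
		set_timer(&v->poll_timer, sched_poll->timeout);

	/* No explicit schedule(): the softirq deschedules this vCPU on
	 * the return-to-guest path, as described above. */
	raise_softirq(SCHEDULE_SOFTIRQ);
	return 0;

out:
	v->poll_evtchn = 0;
	clear_bit(v->vcpu_id, v->domain->poll_mask);
	clear_bit(_VPF_blocked, &v->pause_flags);
	return 0;
}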

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7fcc81dbb688..c629fedb2e21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -554,6 +554,8 @@  struct kvm_vcpu_xen {
 	unsigned int virq_to_port[KVM_XEN_NR_VIRQS];
 	struct hrtimer timer;
 	atomic_t timer_pending;
+	wait_queue_head_t sched_waitq;
+	int poll_evtchn;
 };
 
 struct kvm_vcpu_arch {
@@ -865,6 +867,7 @@  struct kvm_xen {
 	struct shared_info *shinfo;
 
 	struct idr port_to_evt;
+	unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
 	struct mutex xen_lock;
 };
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 753a6d2c11cd..07066402737d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -563,6 +563,16 @@  static int kvm_xen_evtchn_set_pending(struct kvm_vcpu *svcpu,
 					     evfd->port);
 }
 
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+	struct kvm_vcpu_xen *vcpu_xen = vcpu_to_xen_vcpu(vcpu);
+
+	if ((vcpu_xen->poll_evtchn == port ||
+	     vcpu_xen->poll_evtchn == -1) &&
+	    test_and_clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask))
+		wake_up(&vcpu_xen->sched_waitq);
+}
+
 static int kvm_xen_evtchn_send(struct kvm_vcpu *vcpu, int port)
 {
 	struct eventfd_ctx *eventfd;
@@ -581,6 +591,8 @@  static int kvm_xen_evtchn_send(struct kvm_vcpu *vcpu, int port)
 			eventfd_signal(eventfd, 1);
 	}
 
+	kvm_xen_check_poller(kvm_get_vcpu(vcpu->kvm, evtchnfd->vcpu), port);
+
 	return 0;
 }
 
@@ -669,6 +681,94 @@  static int kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout)
 	return 0;
 }
 
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+			       evtchn_port_t *ports)
+{
+	int i;
+	struct shared_info *shared_info =
+		(struct shared_info *)vcpu->kvm->arch.xen.shinfo;
+
+	for (i = 0; i < nr_ports; i++)
+		if (test_bit(ports[i],
+			     (unsigned long *)shared_info->evtchn_pending))
+			return true;
+
+	return false;
+}
+
+static int kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	struct kvm_vcpu_xen *vcpu_xen = vcpu_to_xen_vcpu(vcpu);
+	int idx, i;
+	struct sched_poll sched_poll;
+	evtchn_port_t port, *ports;
+	struct shared_info *shared_info;
+	struct evtchnfd *evtchnfd;
+	int ret = 0;
+
+	if (kvm_vcpu_read_guest(vcpu, gpa,
+				&sched_poll, sizeof(sched_poll)))
+		return -EFAULT;
+
+	shared_info = (struct shared_info *)vcpu->kvm->arch.xen.shinfo;
+
+	if (unlikely(sched_poll.nr_ports > 1)) {
+		/* Xen (unofficially) limits number of pollers to 128 */
+		if (sched_poll.nr_ports > 128)
+			return -EINVAL;
+
+		ports = kmalloc_array(sched_poll.nr_ports,
+				      sizeof(*ports), GFP_KERNEL);
+		if (!ports)
+			return -ENOMEM;
+	} else
+		ports = &port;
+
+	set_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask);
+
+	for (i = 0; i < sched_poll.nr_ports; i++) {
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		gpa = kvm_mmu_gva_to_gpa_system(vcpu,
+						(gva_t)(sched_poll.ports + i),
+						NULL);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+		if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
+						&ports[i], sizeof(port))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		evtchnfd = idr_find(&vcpu->kvm->arch.xen.port_to_evt,
+				    ports[i]);
+		if (!evtchnfd) {
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	if (sched_poll.nr_ports == 1)
+		vcpu_xen->poll_evtchn = port;
+	else
+		vcpu_xen->poll_evtchn = -1;
+
+	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports))
+		wait_event_interruptible_timeout(
+			 vcpu_xen->sched_waitq,
+			 wait_pending_event(vcpu, sched_poll.nr_ports, ports),
+			 sched_poll.timeout ?: KTIME_MAX);
+
+	vcpu_xen->poll_evtchn = 0;
+
+out:
+	/* Really, this is only needed in case of timeout */
+	clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.xen.poll_mask);
+
+	if (unlikely(sched_poll.nr_ports > 1))
+		kfree(ports);
+	return ret;
+}
+
 static int kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, int cmd, u64 param)
 {
 	int ret = -ENOSYS;
@@ -687,6 +787,9 @@  static int kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, int cmd, u64 param)
 		kvm_vcpu_on_spin(vcpu, true);
 		ret = 0;
 		break;
+	case SCHEDOP_poll:
+		ret = kvm_xen_schedop_poll(vcpu, gpa);
+		break;
 	default:
 		break;
 	}
@@ -744,6 +847,9 @@  int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 		r = kvm_xen_hcall_sched_op(vcpu, params[0], params[1]);
 		if (!r)
 			goto hcall_success;
+		else if (params[0] == SCHEDOP_poll)
+			/* SCHEDOP_poll should be handled in kernel */
+			return r;
 		break;
 	/* fallthrough */
 	default:
@@ -770,6 +876,8 @@  int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 void kvm_xen_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	init_waitqueue_head(&vcpu->arch.xen.sched_waitq);
+	vcpu->arch.xen.poll_evtchn = 0;
 }
 
 void kvm_xen_vcpu_uninit(struct kvm_vcpu *vcpu)