[RFC,3/4] KVM: x86: Add EOI exit bitmap inference
diff mbox

Message ID 1431481652-27268-3-git-send-email-srutherford@google.com
State New
Headers show

Commit Message

Steve Rutherford May 13, 2015, 1:47 a.m. UTC
In order to support a userspace IOAPIC interacting with an in kernel
APIC, the EOI exit bitmaps need to be configurable.

If the IOAPIC is in userspace (i.e. the irqchip has been split), the
EOI exit bitmaps will be set whenever the GSI Routes are configured.
In particular, for the low 24 MSI routes, the EOI Exit bit
corresponding to the destination vector will be set for the
destination VCPU.

The intention is for the userspace IOAPIC to use MSI routes [0,23] to
inject interrupts into the guest.

This is a slight abuse of the notion of an MSI Route, given that MSIs
classically bypass the IOAPIC. It might be worthwhile to add an
additional route type to improve clarity.

Compile tested for Intel x86.

Signed-off-by: Steve Rutherford <srutherford@google.com>
---
 arch/x86/kvm/ioapic.c    | 11 +++++++++++
 arch/x86/kvm/ioapic.h    |  1 +
 arch/x86/kvm/lapic.c     |  2 ++
 arch/x86/kvm/x86.c       | 13 +++++++++++--
 include/linux/kvm_host.h |  4 ++++
 virt/kvm/irqchip.c       | 32 ++++++++++++++++++++++++++++++++
 6 files changed, 61 insertions(+), 2 deletions(-)

Comments

Jan Kiszka May 13, 2015, 6:12 a.m. UTC | #1
On 2015-05-13 03:47, Steve Rutherford wrote:
> In order to support a userspace IOAPIC interacting with an in kernel
> APIC, the EOI exit bitmaps need to be configurable.
> 
> If the IOAPIC is in userspace (i.e. the irqchip has been split), the
> EOI exit bitmaps will be set whenever the GSI Routes are configured.
> In particular, for the low 24 MSI routes, the EOI Exit bit
> corresponding to the destination vector will be set for the
> destination VCPU.
> 
> The intention is for the userspace IOAPIC to use MSI routes [0,23] to
> inject interrupts into the guest.
> 
> This is a slight abuse of the notion of an MSI Route, given that MSIs
> classically bypass the IOAPIC. It might be worthwhile to add an
> additional route type to improve clarity.
> 
> Compile tested for Intel x86.
> 
> Signed-off-by: Steve Rutherford <srutherford@google.com>
> ---
>  arch/x86/kvm/ioapic.c    | 11 +++++++++++
>  arch/x86/kvm/ioapic.h    |  1 +
>  arch/x86/kvm/lapic.c     |  2 ++
>  arch/x86/kvm/x86.c       | 13 +++++++++++--
>  include/linux/kvm_host.h |  4 ++++
>  virt/kvm/irqchip.c       | 32 ++++++++++++++++++++++++++++++++
>  6 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
> index 856f791..3323c86 100644
> --- a/arch/x86/kvm/ioapic.c
> +++ b/arch/x86/kvm/ioapic.c
> @@ -672,3 +672,14 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
>  	spin_unlock(&ioapic->lock);
>  	return 0;
>  }
> +
> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +
> +	if (ioapic)
> +		return;
> +	if (!lapic_in_kernel(kvm))
> +		return;
> +	kvm_make_scan_ioapic_request(kvm);
> +}
> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
> index ca0b0b4..b7af71b 100644
> --- a/arch/x86/kvm/ioapic.h
> +++ b/arch/x86/kvm/ioapic.h
> @@ -123,4 +123,5 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
>  void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
>  			u32 *tmr);
>  
> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>  #endif
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 42fada6f..7533b87 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -211,6 +211,8 @@ out:
>  
>  	if (!irqchip_split(kvm))
>  		kvm_vcpu_request_scan_ioapic(kvm);
> +	else
> +		kvm_vcpu_request_scan_userspace_ioapic(kvm);
>  }
>  
>  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cc27c35..6127fe7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6335,8 +6335,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  				goto out;
>  			}
>  		}
> -		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
> -			vcpu_scan_ioapic(vcpu);
> +		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) {
> +			if (irqchip_split(vcpu->kvm)) {
> +				memset(vcpu->arch.eoi_exit_bitmaps, 0, 32);
> +				kvm_scan_ioapic_routes(
> +				    vcpu, vcpu->arch.eoi_exit_bitmaps);
> +				kvm_x86_ops->load_eoi_exitmap(
> +				    vcpu, vcpu->arch.eoi_exit_bitmaps);
> +
> +			} else
> +				vcpu_scan_ioapic(vcpu);
> +		}
>  		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
>  			kvm_vcpu_reload_apic_access_page(vcpu);
>  	}
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index cef20ad..678215a 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -438,10 +438,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
>  
>  #ifdef __KVM_HAVE_IOAPIC
>  void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm);
>  #else
>  static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
>  {
>  }
> +static inline void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +}
>  #endif
>  
>  #ifdef CONFIG_HAVE_KVM_IRQFD
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 8aaceed..8a253aa 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>  	synchronize_srcu_expedited(&kvm->irq_srcu);
>  
> +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> +
>  	new = old;
>  	r = 0;
>  
> @@ -212,3 +214,33 @@ out:
>  	kfree(new);
>  	return r;
>  }
> +
> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_kernel_irq_routing_entry *entry;
> +	struct kvm_irq_routing_table *table;
> +	u32 i, nr_rt_entries;
> +
> +	mutex_lock(&kvm->irq_lock);
> +	table = kvm->irq_routing;
> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
> +	for (i = 0; i < nr_rt_entries; ++i) {
> +		hlist_for_each_entry(entry, &table->map[i], link) {
> +			u32 dest_id, dest_mode;
> +
> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
> +				continue;
> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
> +						dest_mode)) {
> +				u32 vector = entry->msi.data & 0xff;
> +
> +				__set_bit(vector,
> +					  (unsigned long *) eoi_exit_bitmap);
> +			}
> +		}
> +	}
> +	mutex_unlock(&kvm->irq_lock);
> +}
> 

This looks a bit frightening regarding the lookup costs. Do we really
have to run through the complete routing table to find the needed
information? There can be way more "real" MSI entries than IOAPIC pins.
There can even be multiple IOAPICs (thanks to your patches overcoming
the single in-kernel instance). And this is potentially serializing the
whole VM (kvm->irq_lock), not just those VCPUs that use the IOAPIC.

Jan
Paolo Bonzini May 13, 2015, 7:51 a.m. UTC | #2
On 13/05/2015 03:47, Steve Rutherford wrote:
> @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>  	synchronize_srcu_expedited(&kvm->irq_srcu);
>  
> +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> +
>  	new = old;
>  	r = 0;
>  

This can be done before synchronize_srcu_expedited, so that changes
ripple to the VCPUs faster.

> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +
> +	if (ioapic)
> +		return;
> +	if (!lapic_in_kernel(kvm))
> +		return;
> +	kvm_make_scan_ioapic_request(kvm);
> +}

This is okay for use in kvm_set_irq_routing (perhaps renamed to
kvm_arch_irq_routing_update), but when it is used here:

>  	if (!irqchip_split(kvm))
>  		kvm_vcpu_request_scan_ioapic(kvm);
> +	else
> +		kvm_vcpu_request_scan_userspace_ioapic(kvm);

... you can simply do this:

- 	if (!irqchip_split(kvm))
-		kvm_vcpu_request_scan_ioapic(kvm);
+	kvm_make_scan_ioapic_request(kvm);

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 13, 2015, 8:04 a.m. UTC | #3
On 13/05/2015 08:12, Jan Kiszka wrote:
>> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
>> +{
>> +	struct kvm *kvm = vcpu->kvm;
>> +	struct kvm_kernel_irq_routing_entry *entry;
>> +	struct kvm_irq_routing_table *table;
>> +	u32 i, nr_rt_entries;
>> +
>> +	mutex_lock(&kvm->irq_lock);

This only needs irq_srcu protection, not irq_lock, so the lookup cost
becomes much smaller (all CPUs can proceed in parallel).

You would need to put an smp_mb here, to ensure that irq_routing is read
after KVM_SCAN_IOAPIC is cleared.  You can introduce
smp_mb__after_srcu_read_lock in order to elide it.

The matching memory barrier would be a smp_mb__before_atomic in
kvm_make_scan_ioapic_request.

>> +	table = kvm->irq_routing;
>> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
>> +	for (i = 0; i < nr_rt_entries; ++i) {
>> +		hlist_for_each_entry(entry, &table->map[i], link) {
>> +			u32 dest_id, dest_mode;
>> +
>> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
>> +				continue;
>> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
>> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
>> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
>> +						dest_mode)) {
>> +				u32 vector = entry->msi.data & 0xff;
>> +
>> +				__set_bit(vector,
>> +					  (unsigned long *) eoi_exit_bitmap);
>> +			}
>> +		}
>> +	}
>> +	mutex_unlock(&kvm->irq_lock);
>> +}
>>
> 
> This looks a bit frightening regarding the lookup costs. Do we really
> have to run through the complete routing table to find the needed
> information? There can be way more "real" MSI entries than IOAPIC pins.

It does at most IOAPIC_NUM_PINS iterations however.

> There can even be multiple IOAPICs (thanks to your patches overcoming
> the single in-kernel instance).

With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
that the above loop is broken for multiple IOAPICs.

But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
the number of IOAPIC routes that will cause EOI exits?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka May 13, 2015, 8:10 a.m. UTC | #4
On 2015-05-13 10:04, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
>>> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
>>> +{
>>> +	struct kvm *kvm = vcpu->kvm;
>>> +	struct kvm_kernel_irq_routing_entry *entry;
>>> +	struct kvm_irq_routing_table *table;
>>> +	u32 i, nr_rt_entries;
>>> +
>>> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).
> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request.
> 
>>> +	table = kvm->irq_routing;
>>> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
>>> +	for (i = 0; i < nr_rt_entries; ++i) {
>>> +		hlist_for_each_entry(entry, &table->map[i], link) {
>>> +			u32 dest_id, dest_mode;
>>> +
>>> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
>>> +				continue;
>>> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
>>> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
>>> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
>>> +						dest_mode)) {
>>> +				u32 vector = entry->msi.data & 0xff;
>>> +
>>> +				__set_bit(vector,
>>> +					  (unsigned long *) eoi_exit_bitmap);
>>> +			}
>>> +		}
>>> +	}
>>> +	mutex_unlock(&kvm->irq_lock);
>>> +}
>>>
>>
>> This looks a bit frightening regarding the lookup costs. Do we really
>> have to run through the complete routing table to find the needed
>> information? There can be way more "real" MSI entries than IOAPIC pins.
> 
> It does at most IOAPIC_NUM_PINS iterations however.
> 
>> There can even be multiple IOAPICs (thanks to your patches overcoming
>> the single in-kernel instance).
> 
> With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means

I don't think that the number of pins per IOAPIC increases. At least not
in the devices I've seen so far.

> that the above loop is broken for multiple IOAPICs.

The worst case remains #IOAPIC * 24 iterations - if we have means to
stop after the IOAPIC entries, not iterating over all routes.

> 
> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
> the number of IOAPIC routes that will cause EOI exits?

And you need to ensure that their routes can be found in the table
directly. Given IOAPIC hotplug, that may not be the first ones there...

Jan
Paolo Bonzini May 13, 2015, 9:24 a.m. UTC | #5
On 13/05/2015 10:10, Jan Kiszka wrote:
>>> >> There can even be multiple IOAPICs (thanks to your patches overcoming
>>> >> the single in-kernel instance).
>> > 
>> > With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
> I don't think that the number of pins per IOAPIC increases. At least not
> in the devices I've seen so far.

Sorry, that was supposed to be "more than 24 GSIs for the IOAPICs".

>> > that the above loop is broken for multiple IOAPICs.
> The worst case remains #IOAPIC * 24 iterations - if we have means to
> stop after the IOAPIC entries, not iterating over all routes.

Yes.  Which is not too bad if VCPUs can process it in parallel.

>> > But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>> > the number of IOAPIC routes that will cause EOI exits?
> And you need to ensure that their routes can be found in the table
> directly. Given IOAPIC hotplug, that may not be the first ones there...

Can you reserve a bunch of GSIs at the beginning of the GSI space, and
use rt->map[] to access them and build the EOI exit bitmap?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka May 13, 2015, 10:25 a.m. UTC | #6
On 2015-05-13 11:24, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 10:10, Jan Kiszka wrote:
>>>>>> There can even be multiple IOAPICs (thanks to your patches overcoming
>>>>>> the single in-kernel instance).
>>>>
>>>> With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
>> I don't think that the number of pins per IOAPIC increases. At least not
>> in the devices I've seen so far.
> 
> Sorry, that was supposed to be "more than 24 GSIs for the IOAPICs".
> 
>>>> that the above loop is broken for multiple IOAPICs.
>> The worst case remains #IOAPIC * 24 iterations - if we have means to
>> stop after the IOAPIC entries, not iterating over all routes.
> 
> Yes.  Which is not too bad if VCPUs can process it in parallel.
> 
>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>> the number of IOAPIC routes that will cause EOI exits?
>> And you need to ensure that their routes can be found in the table
>> directly. Given IOAPIC hotplug, that may not be the first ones there...
> 
> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
> use rt->map[] to access them and build the EOI exit bitmap?

Ideally, userspace could give the kernel a hint where to look. It has a
copy of the routing table and manages it, no?

Jan
Paolo Bonzini May 13, 2015, 1:04 p.m. UTC | #7
On 13/05/2015 12:25, Jan Kiszka wrote:
>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>> the number of IOAPIC routes that will cause EOI exits?
>>> 
>>> And you need to ensure that their routes can be found in the table
>>> directly. Given IOAPIC hotplug, that may not be the first ones there...
>> 
>> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
>> use rt->map[] to access them and build the EOI exit bitmap?
> 
> Ideally, userspace could give the kernel a hint where to look. It has a
> copy of the routing table and manages it, no?

Well, reserving space at the beginning of the table (and communicating
the size via KVM_ENABLE_CAP) is a way to "give the kernel a hint where to
look".  Perhaps not the best, but simple and supports hotplug to some
extent.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka May 13, 2015, 1:19 p.m. UTC | #8
On 2015-05-13 15:04, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 12:25, Jan Kiszka wrote:
>>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>>> the number of IOAPIC routes that will cause EOI exits?
>>>>
>>>> And you need to ensure that their routes can be found in the table
>>>> directly. Given IOAPIC hotplug, that may not be the first ones there...
>>>
>>> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
>>> use rt->map[] to access them and build the EOI exit bitmap?
>>
>> Ideally, userspace could give the kernel a hint where to look. It has a
>> copy of the routing table and manages it, no?
> 
> Well, reserving space at the beginning of the table (and communicating
> the size via KVM_ENABLE_CAP is a way to "give the kernel a hint where to
> look".  Perhaps not the best, but simple and supports hotplug to some
> extent.

But then we need at least a way to differentiate between IOAPIC and MSI
routes so that the loop can actually stop when hitting the first
non-IOAPIC entry. Right now, this is not possible. But even that would
be a kind of ugly interface.

Jan
Steve Rutherford May 13, 2015, 10:21 p.m. UTC | #9
On Wed, May 13, 2015 at 10:04:53AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
> >> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> >> +{
> >> +	struct kvm *kvm = vcpu->kvm;
> >> +	struct kvm_kernel_irq_routing_entry *entry;
> >> +	struct kvm_irq_routing_table *table;
> >> +	u32 i, nr_rt_entries;
> >> +
> >> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).
> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request.
> 

Makes sense, I'll update this for the next iteration.

Steve
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Rutherford May 13, 2015, 10:24 p.m. UTC | #10
On Wed, May 13, 2015 at 09:51:43AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
> >  
> >  	synchronize_srcu_expedited(&kvm->irq_srcu);
> >  
> > +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> > +
> >  	new = old;
> >  	r = 0;
> >  
> 
> This can be done before synchronize_srcu_expedited, so that changes
> ripple to the VCPUs faster.
> 
> > +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> > +{
> > +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > +
> > +	if (ioapic)
> > +		return;
> > +	if (!lapic_in_kernel(kvm))
> > +		return;
> > +	kvm_make_scan_ioapic_request(kvm);
> > +}
> 
> This is okay for use in kvm_set_irq_routing (perhaps renamed to
> kvm_arch_irq_routing_update), but when it is used here:
> 
> >  	if (!irqchip_split(kvm))
> >  		kvm_vcpu_request_scan_ioapic(kvm);
> > +	else
> > +		kvm_vcpu_request_scan_userspace_ioapic(kvm);
> 
> ... you can simply do this:
> 
> - 	if (!irqchip_split(kvm))
> -		kvm_vcpu_request_scan_ioapic(kvm);
> +	kvm_make_scan_ioapic_request(kvm);
> 
> Paolo

Seems reasonable.

While we're on the topic of scanning the IOAPIC, should this also scan the IOAPIC when (un)registering irq ack notifiers? [Which is currently done for the in-kernel IOAPIC.]
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 14, 2015, 9:20 a.m. UTC | #11
On 14/05/2015 00:24, Steve Rutherford wrote:
> Seems reasonable.
> 
> While we're on the topic of scanning the IOAPIC, should this also
> scan the IOAPIC when (un)registering irq ack notifiers? [Which is
> currently done for the in-kernel IOAPIC.]

Would irq_ack_notifiers be used at all with this patch set?  Resampling
of IOAPIC level-triggered interrupts would be implemented in userspace.
 For the same reason, assigned devices using legacy device assignment
probably would not be able to use INTX (so this feature should depend on
!KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
irq_ack_notifiers goes away.

Alex, how does VFIO do INTX resampling if you're using TCG or -machine
kernel_irqchip=off?  (Context: this series keeps the local APIC
emulation in the kernel, thus including MSI, but moves the IOAPIC
emulation to userspace).

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson May 14, 2015, 3:23 p.m. UTC | #12
On Thu, 2015-05-14 at 11:20 +0200, Paolo Bonzini wrote:
> 
> On 14/05/2015 00:24, Steve Rutherford wrote:
> > Seems reasonable.
> > 
> > While we're on the topic of scanning the IOAPIC, should this also
> > scan the IOAPIC when (un)registering irq ack notifiers? [Which is
> > currently done for the in-kernel IOAPIC.]
> 
> Would irq_ack_notifiers be used at all with this patch set?  Resampling
> of IOAPIC level-triggered interrupts would be implemented in userspace.
>  For the same reason, assigned devices using legacy device assignment
> probably would not be able to use INTX (so this feature should depend on
> !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> irq_ack_notifiers goes away.
> 
> Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> kernel_irqchip=off?  (Context: this series keeps the local APIC
> emulation in the kernel, thus including MSI, but moves the IOAPIC
> emualtion to userspace).

Without KVM irqchip, we use a rudimentary approach where we disable
mmaps to the device when an interrupt occurs.  This makes us trap all
accesses to the device.  We then handle any device access from the guest
as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
we've at least rate-limited it.  If it doesn't, a timer eventually
re-enables devices mmaps.  It's not very efficient, but it works (for
both PCI and platform devices) and avoids IRQ APIs through QEMU that get
very platform/machine specific.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 14, 2015, 3:46 p.m. UTC | #13
On 14/05/2015 17:23, Alex Williamson wrote:
> > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > of IOAPIC level-triggered interrupts would be implemented in userspace.
> >  For the same reason, assigned devices using legacy device assignment
> > probably would not be able to use INTX (so this feature should depend on
> > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > irq_ack_notifiers goes away.
> > 
> > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > emualtion to userspace).
> 
> Without KVM irqchip, we use a rudimentary approach where we disable
> mmaps to the device when an interrupt occurs.  This makes us trap all
> accesses to the device.  We then handle any device access from the guest
> as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> we've at least rate-limited it.  If it doesn't, a timer eventually
> re-enables devices mmaps.  It's not very efficient, but it works (for
> both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> very platform/machine specific.  Thanks,

If we move the IOAPIC back to userspace, it probably would be easier to
implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
start!

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson May 14, 2015, 4:04 p.m. UTC | #14
On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> 
> On 14/05/2015 17:23, Alex Williamson wrote:
> > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > >  For the same reason, assigned devices using legacy device assignment
> > > probably would not be able to use INTX (so this feature should depend on
> > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > irq_ack_notifiers goes away.
> > > 
> > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > emualtion to userspace).
> > 
> > Without KVM irqchip, we use a rudimentary approach where we disable
> > mmaps to the device when an interrupt occurs.  This makes us trap all
> > accesses to the device.  We then handle any device access from the guest
> > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > we've at least rate-limited it.  If it doesn't, a timer eventually
> > re-enables devices mmaps.  It's not very efficient, but it works (for
> > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > very platform/machine specific.  Thanks,
> 
> If we move the IOAPIC back to userspace, it probably would be easier to
> implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> start!

The current solution is merely functional with the benefit of being
architecture and machine-type independent.  If someone actually cared
about vfio assigned device performance without KVM irqchip, we'd
certainly need a more deterministic and efficient EOI path.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Rutherford May 14, 2015, 10:10 p.m. UTC | #15
On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > 
> > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > >  For the same reason, assigned devices using legacy device assignment
> > > > probably would not be able to use INTX (so this feature should depend on
> > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > irq_ack_notifiers goes away.
> > > > 
> > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > emualtion to userspace).
> > > 
> > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > accesses to the device.  We then handle any device access from the guest
> > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > re-enables devices mmaps.  It's not very efficient, but it works (for
> > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > very platform/machine specific.  Thanks,
> > 
> > If we move the IOAPIC back to userspace, it probably would be easier to
> > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > start!
> 
> The current solution is merely functional with the benefit of being
> architecture and machine-type independent.  If someone actually cared
> about vfio assigned device performance without KVM irqchip, we'd
> certainly need a more deterministic and efficient EOI path.  Thanks,
> 
> Alex
> 

I'd like this feature to not preclude fast /modern/ device assignment (i.e. devices using MSIs). A perf hit (or even incompatibility) for legacy devices is fine [that's sort of the premise of this patchset]. What's necessary for the device assignment to work with the split irqchip?

My take regarding the original question: No, we don't need to scan the IOAPIC when the irqchip is split, given that the userspace IOAPIC is not using resamplefds to fetch EOIs from the kernel. 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson May 14, 2015, 10:35 p.m. UTC | #16
On Thu, 2015-05-14 at 15:10 -0700, Steve Rutherford wrote:
> On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> > On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > > 
> > > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > > >  For the same reason, assigned devices using legacy device assignment
> > > > > probably would not be able to use INTX (so this feature should depend on
> > > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > > irq_ack_notifiers goes away.
> > > > > 
> > > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > > emualtion to userspace).
> > > > 
> > > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > > accesses to the device.  We then handle any device access from the guest
> > > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > > re-enables devices mmaps.  It's not very efficient, but it works (for
> > > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > > very platform/machine specific.  Thanks,
> > > 
> > > If we move the IOAPIC back to userspace, it probably would be easier to
> > > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > > start!
> > 
> > The current solution is merely functional with the benefit of being
> > architecture and machine-type independent.  If someone actually cared
> > about vfio assigned device performance without KVM irqchip, we'd
> > certainly need a more deterministic and efficient EOI path.  Thanks,
> > 
> > Alex
> > 
> 
> I'd like this feature to not preclude fast /modern/ device assignment
> (i.e. devices using MSIs). A perf hit (or even incompatibility) for
> legacy devices is fine [that's sort of the premise of this patchset].
> What's necessary for the device assignment to work with the split
> irqchip?

The current code will attempt to use kvm_irqchip_add_msi_route() and
kvm_irqchip_add_irqfd_notifier() to route the MSI through the KVM
irqchip.  Without that, QEMU will receive the MSI eventfd and call
msi_notify() or msix_notify(), which would be significantly slower.
This should be pretty much the same as virtio, which you've hopefully
already taken into consideration.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Rutherford May 14, 2015, 11:21 p.m. UTC | #17
On Thu, May 14, 2015 at 04:35:19PM -0600, Alex Williamson wrote:
> On Thu, 2015-05-14 at 15:10 -0700, Steve Rutherford wrote:
> > On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> > > On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > > > 
> > > > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > > > >  For the same reason, assigned devices using legacy device assignment
> > > > > > probably would not be able to use INTX (so this feature should depend on
> > > > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > > > irq_ack_notifiers goes away.
> > > > > > 
> > > > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > > emulation to userspace).
> > > > > 
> > > > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > > > accesses to the device.  We then handle any device access from the guest
> > > > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > > > re-enables device mmaps.  It's not very efficient, but it works (for
> > > > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > > > very platform/machine specific.  Thanks,
> > > > 
> > > > If we move the IOAPIC back to userspace, it probably would be easier to
> > > > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > > > start!
> > > 
> > > The current solution is merely functional with the benefit of being
> > > architecture and machine-type independent.  If someone actually cared
> > > about vfio assigned device performance without KVM irqchip, we'd
> > > certainly need a more deterministic and efficient EOI path.  Thanks,
> > > 
> > > Alex
> > > 
> > 
> > I'd like this feature to not preclude fast /modern/ device assignment
> > (i.e. devices using MSIs). A perf hit (or even incompatibility) for
> > legacy devices is fine [that's sort of the premise of this patchset].
> > What's necessary for the device assignment to work with the split
> > irqchip?
> 
> The current code will attempt to use kvm_irqchip_add_msi_route() and
> kvm_irqchip_add_irqfd_notifier() to route the MSI through the KVM
> irqchip.  Without that, QEMU will receive the MSI eventfd and call
> msi_notify() or msix_notify(), which would be significantly slower.
> This should be pretty much the same as virtio, which you've hopefully
> already taken into consideration.  Thanks,
> 
> Alex
> 

Those will be easy to update, given that the routing table is staying in the kernel. The notion of "kvm_irqchip_in_kernel" will need to be updated when support for this is added to QEMU, but that's not a huge deal.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Rutherford May 15, 2015, 2:38 a.m. UTC | #18
On Wed, May 13, 2015 at 10:04:53AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
> >> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> >> +{
> >> +	struct kvm *kvm = vcpu->kvm;
> >> +	struct kvm_kernel_irq_routing_entry *entry;
> >> +	struct kvm_irq_routing_table *table;
> >> +	u32 i, nr_rt_entries;
> >> +
> >> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).

I've updated the next iteration to include this. 

> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request. 

Wait, what could happen if KVM_SCAN_IOAPIC is cleared after irq_routing is read? I'm imagining the following, but I'm not sure it makes sense.

After reading irq_routing, another routing update could roll through (which would be followed by the setting of KVM_SCAN_IOAPIC), both of which could be followed by the reordered clearing of KVM_SCAN_IOAPIC. This would lead to only one scan, which would use the stale value for kvm->irq_routing.

The reading of kvm->irq_routing (and the reads in srcu_read_lock) should be able to be reordered back to before the clearing of the bit in vcpu->requests (but after the read of vcpu->requests), because reads can be reordered with unrelated writes, but not with other reads. If a routing update came through on another cpu during this window, the issue above could happen. 

Adding a memory barrier (but not a wmb or an rmb) before reading irq_routing (ideally not in the srcu critical section) should prevent this from happening?

Sorry if this is long winded. Memory ordering always feels subtle. 

Steve
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f791..3323c86 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -672,3 +672,14 @@  int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	spin_unlock(&ioapic->lock);
 	return 0;
 }
+
+void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+	if (ioapic)
+		return;
+	if (!lapic_in_kernel(kvm))
+		return;
+	kvm_make_scan_ioapic_request(kvm);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4..b7af71b 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -123,4 +123,5 @@  int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
 			u32 *tmr);
 
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 42fada6f..7533b87 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -211,6 +211,8 @@  out:
 
 	if (!irqchip_split(kvm))
 		kvm_vcpu_request_scan_ioapic(kvm);
+	else
+		kvm_vcpu_request_scan_userspace_ioapic(kvm);
 }
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc27c35..6127fe7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6335,8 +6335,17 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 				goto out;
 			}
 		}
-		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
-			vcpu_scan_ioapic(vcpu);
+		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) {
+			if (irqchip_split(vcpu->kvm)) {
+				memset(vcpu->arch.eoi_exit_bitmaps, 0, 32);
+				kvm_scan_ioapic_routes(
+				    vcpu, vcpu->arch.eoi_exit_bitmaps);
+				kvm_x86_ops->load_eoi_exitmap(
+				    vcpu, vcpu->arch.eoi_exit_bitmaps);
+
+			} else
+				vcpu_scan_ioapic(vcpu);
+		}
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 			kvm_vcpu_reload_apic_access_page(vcpu);
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cef20ad..678215a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -438,10 +438,14 @@  void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
+void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm);
 #else
 static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
 {
 }
+static inline void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 8aaceed..8a253aa 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -205,6 +205,8 @@  int kvm_set_irq_routing(struct kvm *kvm,
 
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
+	kvm_vcpu_request_scan_userspace_ioapic(kvm);
+
 	new = old;
 	r = 0;
 
@@ -212,3 +214,33 @@  out:
 	kfree(new);
 	return r;
 }
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_kernel_irq_routing_entry *entry;
+	struct kvm_irq_routing_table *table;
+	u32 i, nr_rt_entries;
+
+	mutex_lock(&kvm->irq_lock);
+	table = kvm->irq_routing;
+	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
+	for (i = 0; i < nr_rt_entries; ++i) {
+		hlist_for_each_entry(entry, &table->map[i], link) {
+			u32 dest_id, dest_mode;
+
+			if (entry->type != KVM_IRQ_ROUTING_MSI)
+				continue;
+			dest_id = (entry->msi.address_lo >> 12) & 0xff;
+			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
+						dest_mode)) {
+				u32 vector = entry->msi.data & 0xff;
+
+				__set_bit(vector,
+					  (unsigned long *) eoi_exit_bitmap);
+			}
+		}
+	}
+	mutex_unlock(&kvm->irq_lock);
+}