[RFC,1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
diff mbox

Message ID 1431481652-27268-1-git-send-email-srutherford@google.com
State New
Headers show

Commit Message

Steve Rutherford May 13, 2015, 1:47 a.m. UTC
First patch in a series which enables the relocation of the
PIC/IOAPIC/PIT to userspace.

Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.

KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
of the irqchip.

Compile tested for x86.

Signed-off-by: Steve Rutherford <srutherford@google.com>
Suggested-by: Andrew Honig <ahonig@google.com>
---
 Documentation/virtual/kvm/api.txt | 15 ++++++++++++
 arch/powerpc/kvm/irq.h            |  5 ++++
 arch/s390/kvm/irq.h               |  4 ++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/assigned-dev.c       |  4 ++--
 arch/x86/kvm/irq.c                |  6 ++---
 arch/x86/kvm/irq.h                | 11 +++++++++
 arch/x86/kvm/irq_comm.c           |  7 ++++++
 arch/x86/kvm/lapic.c              | 13 +++++++----
 arch/x86/kvm/mmu.c                |  2 +-
 arch/x86/kvm/svm.c                |  4 ++--
 arch/x86/kvm/vmx.c                | 12 +++++-----
 arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
 include/kvm/arm_vgic.h            |  1 +
 include/linux/kvm_host.h          |  1 +
 include/uapi/linux/kvm.h          |  3 +++
 virt/kvm/irqchip.c                |  2 +-
 17 files changed, 106 insertions(+), 35 deletions(-)

Comments

Paolo Bonzini May 13, 2015, 7:57 a.m. UTC | #1
On 13/05/2015 03:47, Steve Rutherford wrote:
> First patch in a series which enables the relocation of the
> PIC/IOAPIC/PIT to userspace.
> 
> Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> 
> KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> of the irqchip.

The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
seem to create a local APIC.

> Compile tested for x86.

The capability is fine.  However, instead of introducing a new ioctl,
you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
(per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
as what you already have in your code.

Paolo

> Signed-off-by: Steve Rutherford <srutherford@google.com>
> Suggested-by: Andrew Honig <ahonig@google.com>
> ---
>  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
>  arch/powerpc/kvm/irq.h            |  5 ++++
>  arch/s390/kvm/irq.h               |  4 ++++
>  arch/x86/include/asm/kvm_host.h   |  2 ++
>  arch/x86/kvm/assigned-dev.c       |  4 ++--
>  arch/x86/kvm/irq.c                |  6 ++---
>  arch/x86/kvm/irq.h                | 11 +++++++++
>  arch/x86/kvm/irq_comm.c           |  7 ++++++
>  arch/x86/kvm/lapic.c              | 13 +++++++----
>  arch/x86/kvm/mmu.c                |  2 +-
>  arch/x86/kvm/svm.c                |  4 ++--
>  arch/x86/kvm/vmx.c                | 12 +++++-----
>  arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
>  include/kvm/arm_vgic.h            |  1 +
>  include/linux/kvm_host.h          |  1 +
>  include/uapi/linux/kvm.h          |  3 +++
>  virt/kvm/irqchip.c                |  2 +-
>  17 files changed, 106 insertions(+), 35 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 6955444..0744b4e 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
>  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
>  which is the maximum number of possibly pending cpu-local interrupts.
>  
> +4.96 KVM_SPLIT_IRQCHIP
> +
> +Capability: KVM_CAP_SPLIT_IRQCHIP
> +Architectures: x86
> +Type:  VM ioctl
> +Parameters: None
> +Returns: 0 on success, -1 on error
> +
> +Create a local apic for each processor in the kernel.  This differs from
> +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
> +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> +interrupt requests. Fails if VCPU has already been created, or if the irqchip is
> +already in the kernel.
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> index 5a9a10b..5e6fa06 100644
> --- a/arch/powerpc/kvm/irq.h
> +++ b/arch/powerpc/kvm/irq.h
> @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_in_kernel(kvm);
> +}
> +
>  #endif
> diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> index d98e415..db876c3 100644
> --- a/arch/s390/kvm/irq.h
> +++ b/arch/s390/kvm/irq.h
> @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return 1;
>  }
>  
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_in_kernel(kvm);
> +}
>  #endif
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index bbb8f4e..3ddc134 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -638,6 +638,8 @@ struct kvm_arch {
>  	bool boot_vcpu_runs_old_kvmclock;
>  
>  	u64 disabled_quirks;
> +
> +	bool irqchip_split;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> index d090ecf..1237e92 100644
> --- a/arch/x86/kvm/assigned-dev.c
> +++ b/arch/x86/kvm/assigned-dev.c
> @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
>  {
>  	unsigned long guest_irq_type, host_irq_type;
>  
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return -EINVAL;
>  	/* no irq assignment to deassign */
>  	if (!assigned_dev->irq_requested_type)
> @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
>  	struct kvm_assigned_dev_kernel *match;
>  	unsigned long host_irq_type, guest_irq_type;
>  
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return r;
>  
>  	mutex_lock(&kvm->lock);
> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> index a1ec6a50..706e47a 100644
> --- a/arch/x86/kvm/irq.c
> +++ b/arch/x86/kvm/irq.c
> @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
>   */
>  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
>  {
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.pending;
>  
>  	if (kvm_cpu_has_extint(v))
> @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
>   */
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
>  {
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.pending;
>  
>  	if (kvm_cpu_has_extint(v))
> @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
>  {
>  	int vector;
>  
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.nr;
>  
>  	vector = kvm_cpu_get_extint(v);
> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> index ad68c73..e46abf3 100644
> --- a/arch/x86/kvm/irq.h
> +++ b/arch/x86/kvm/irq.h
> @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static inline int irqchip_split(struct kvm *kvm)
> +{
> +	return kvm->arch.irqchip_split;
> +}
> +
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> +}
> +
> +
>  void kvm_pic_reset(struct kvm_kpic_state *s);
>  
>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> index 9efff9e..f43c59a 100644
> --- a/arch/x86/kvm/irq_comm.c
> +++ b/arch/x86/kvm/irq_comm.c
> @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
>  	return kvm_set_irq_routing(kvm, default_routing,
>  				   ARRAY_SIZE(default_routing), 0);
>  }
> +
> +static const struct kvm_irq_routing_entry empty_routing[] = {};
> +
> +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> +{
> +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> +}
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index dc5b57b..bc392a6 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -209,7 +209,8 @@ out:
>  	if (old)
>  		kfree_rcu(old, rcu);
>  
> -	kvm_vcpu_request_scan_ioapic(kvm);
> +	if (!irqchip_split(kvm))
> +		kvm_vcpu_request_scan_ioapic(kvm);
>  }
>  
>  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
>  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
>  				apic_find_highest_isr(apic));
>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> +	if (!irqchip_split(vcpu->kvm))
> +		kvm_rtc_eoi_tracking_restore_one(vcpu);
>  }
>  
>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
>  	    /* Cache not set: could be safe but we don't bother. */
>  	    apic->highest_isr_cache == -1 ||
>  	    /* Need EOI to update ioapic. */
> -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
> +	    irqchip_split(vcpu->kvm)) {
>  		/*
>  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
>  		 * so we need not do anything here.
> @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 reg = (msr - APIC_BASE_MSR) << 4;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>  		return 1;
>  
>  	if (reg == APIC_ICR2)
> @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>  		return 1;
>  
>  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index b78e83f..e5bf6db 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>  
>  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
>  {
> -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
>  		     kvm_event_needs_reinjection(vcpu)))
>  		return false;
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 8954d7a..7ac9ec2 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
>  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
>  	/* instruction emulation calls kvm_set_cr8() */
>  	r = cr_interception(svm);
> -	if (irqchip_in_kernel(svm->vcpu.kvm))
> +	if (lapic_in_kernel(svm->vcpu.kvm))
>  		return r;
>  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
>  		return r;
> @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
>  	 * If the user space waits to inject interrupts, exit as soon as
>  	 * possible
>  	 */
> -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
>  	    kvm_run->request_interrupt_window &&
>  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
>  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index bcb61b0..a53747f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
>  
>  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
>  {
> -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
>  }
>  
>  static inline bool cpu_has_secondary_exec_ctrls(void)
> @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
>  
>  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
>  {
> -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> +	return flexpriority_enabled && lapic_in_kernel(kvm);
>  }
>  
>  static inline bool cpu_has_vmx_vpid(void)
> @@ -4341,7 +4341,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
>  
>  static int vmx_vm_has_apicv(struct kvm *kvm)
>  {
> -	return enable_apicv && irqchip_in_kernel(kvm);
> +	return enable_apicv && lapic_in_kernel(kvm);
>  }
>  
>  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>  				u8 cr8 = (u8)val;
>  				err = kvm_set_cr8(vcpu, cr8);
>  				kvm_complete_insn_gp(vcpu, err);
> -				if (irqchip_in_kernel(vcpu->kvm))
> +				if (lapic_in_kernel(vcpu->kvm))
>  					return 1;
>  				if (cr8_prev <= cr8)
>  					return 1;
> @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
>  	 * If the user space waits to inject interrupts, exit as soon as
>  	 * possible
>  	 */
> -	if (!irqchip_in_kernel(vcpu->kvm) &&
> +	if (!lapic_in_kernel(vcpu->kvm) &&
>  	    vcpu->run->request_interrupt_window &&
>  	    !kvm_cpu_has_interrupt(vcpu)) {
>  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
>  	 */
> -	vm_entry_controls_init(vmx, 
> +	vm_entry_controls_init(vmx,
>  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
>  			~VM_ENTRY_IA32E_MODE) |
>  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cde5d61..7505b39 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
>  {
>  	if (cr8 & CR8_RESERVED_BITS)
>  		return 1;
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		kvm_lapic_set_tpr(vcpu, cr8);
>  	else
>  		vcpu->arch.cr8 = cr8;
> @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
>  
>  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
>  {
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		return kvm_lapic_get_cr8(vcpu);
>  	else
>  		return vcpu->arch.cr8;
> @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_TSC_DEADLINE_TIMER:
>  	case KVM_CAP_ENABLE_CAP_VM:
>  	case KVM_CAP_DISABLE_QUIRKS:
> +	case KVM_CAP_SPLIT_IRQCHIP:
>  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
>  	case KVM_CAP_ASSIGN_DEV_IRQ:
>  	case KVM_CAP_PCI_2_3:
> @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
>  {
>  	if (irq->irq >= KVM_NR_INTERRUPTS)
>  		return -EINVAL;
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		return -ENXIO;
>  
>  	kvm_queue_interrupt(vcpu, irq->irq, false);
> @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		struct kvm_vapic_addr va;
>  
>  		r = -EINVAL;
> -		if (!irqchip_in_kernel(vcpu->kvm))
> +		if (!lapic_in_kernel(vcpu->kvm))
>  			goto out;
>  		r = -EFAULT;
>  		if (copy_from_user(&va, argp, sizeof va))
> @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>  			bool line_status)
>  {
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return -ENXIO;
>  
>  	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
>  		break;
>  	}
> +	case KVM_CREATE_SPLIT_IRQCHIP: {
> +		mutex_lock(&kvm->lock);
> +		r = -EEXIST;
> +		if (lapic_in_kernel(kvm))
> +			goto split_irqchip_unlock;
> +		r = -EINVAL;
> +		if (atomic_read(&kvm->online_vcpus))
> +			goto split_irqchip_unlock;
> +		r = kvm_setup_empty_irq_routing(kvm);
> +		if (r)
> +			goto split_irqchip_unlock;
> +		kvm->arch.irqchip_split = true;
> +		r = 0;
> +split_irqchip_unlock:
> +		mutex_unlock(&kvm->lock);
> +		break;
> +	}
> +
>  	default:
>  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
>  	}
> @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
>  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
>  {
>  	++vcpu->stat.halt_exits;
> -	if (irqchip_in_kernel(vcpu->kvm)) {
> +	if (lapic_in_kernel(vcpu->kvm)) {
>  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
>  		return 1;
>  	} else {
> @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
>   */
>  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
>  {
> -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> +	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
>  		vcpu->run->request_interrupt_window &&
>  		kvm_arch_interrupt_allowed(vcpu));
>  }
> @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
>  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
>  	kvm_run->cr8 = kvm_get_cr8(vcpu);
>  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		kvm_run->ready_for_interrupt_injection = 1;
>  	else
>  		kvm_run->ready_for_interrupt_injection =
> @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
>  {
>  	struct page *page = NULL;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm))
> +	if (!lapic_in_kernel(vcpu->kvm))
>  		return;
>  
>  	if (!kvm_x86_ops->set_apic_access_page_addr)
> @@ -6255,7 +6274,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
>  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  {
>  	int r;
> -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
>  		vcpu->run->request_interrupt_window;
>  	bool req_immediate_exit = false;
>  
> @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  	}
>  
>  	/* re-sync apic's tpr */
> -	if (!irqchip_in_kernel(vcpu->kvm)) {
> +	if (!lapic_in_kernel(vcpu->kvm)) {
>  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
>  			r = -EINVAL;
>  			goto out;
> @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void *rtn)
>  
>  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
>  {
> -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
>  }
>  
>  struct static_key kvm_no_apic_vcpu __read_mostly;
> @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>  
>  	vcpu->arch.pv.pv_unhalted = false;
>  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
>  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>  	else
>  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>  	if (r < 0)
>  		goto fail_free_pio_data;
>  
> -	if (irqchip_in_kernel(kvm)) {
> +	if (lapic_in_kernel(kvm)) {
>  		r = kvm_create_lapic(vcpu);
>  		if (r < 0)
>  			goto fail_mmu_destroy;
> @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	kvm_mmu_destroy(vcpu);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
>  	free_page((unsigned long)vcpu->arch.pio_data);
> -	if (!irqchip_in_kernel(vcpu->kvm))
> +	if (!lapic_in_kernel(vcpu->kvm))
>  		static_key_slow_dec(&kvm_no_apic_vcpu);
>  }
>  
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index 133ea00..ffe1f4e 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
>  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
>  
>  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
>  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
>  #define vgic_ready(k)		((k)->arch.vgic.ready)
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 87fd74a..277b7a1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
>  #endif
>  
>  int kvm_setup_default_irq_routing(struct kvm *kvm);
> +int kvm_setup_empty_irq_routing(struct kvm *kvm);
>  int kvm_set_irq_routing(struct kvm *kvm,
>  			const struct kvm_irq_routing_entry *entries,
>  			unsigned nr,
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 75bd9f7..7d06dc4 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_S390_IRQ_STATE 114
>  #define KVM_CAP_PPC_HWRNG 115
>  #define KVM_CAP_DISABLE_QUIRKS 116
> +#define KVM_CAP_SPLIT_IRQCHIP 117
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
>  /* Available with KVM_CAP_S390_IRQ_STATE */
>  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
>  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
> +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
>  
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
>  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 1d56a90..8aaceed 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
>  {
>  	struct kvm_kernel_irq_routing_entry route;
>  
> -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
>  		return -EINVAL;
>  
>  	route.msi.address_lo = msi->address_lo;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Steve Rutherford May 13, 2015, 10:10 p.m. UTC | #2
On Wed, May 13, 2015 at 09:57:00AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > First patch in a series which enables the relocation of the
> > PIC/IOAPIC/PIT to userspace.
> > 
> > Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> > 
> > KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> > of the irqchip.
> 
> The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
> seem to create a local APIC.
> 

The ioctl enables the creation of a LAPIC every time a VCPU is created,
which I guess is different from creating one itself. I'll change the
language to reflect that.

> > Compile tested for x86.
> 
> The capability is fine.  However, instead of introducing a new ioctl,
> you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
> (per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
> you have in your code.
> 

ENABLE_CAP seems like a better way to go. I'll update this to use it instead of the ioctl.

Steve
> Paolo
> 
> > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > Suggested-by: Andrew Honig <ahonig@google.com>
> > ---
> >  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
> >  arch/powerpc/kvm/irq.h            |  5 ++++
> >  arch/s390/kvm/irq.h               |  4 ++++
> >  arch/x86/include/asm/kvm_host.h   |  2 ++
> >  arch/x86/kvm/assigned-dev.c       |  4 ++--
> >  arch/x86/kvm/irq.c                |  6 ++---
> >  arch/x86/kvm/irq.h                | 11 +++++++++
> >  arch/x86/kvm/irq_comm.c           |  7 ++++++
> >  arch/x86/kvm/lapic.c              | 13 +++++++----
> >  arch/x86/kvm/mmu.c                |  2 +-
> >  arch/x86/kvm/svm.c                |  4 ++--
> >  arch/x86/kvm/vmx.c                | 12 +++++-----
> >  arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
> >  include/kvm/arm_vgic.h            |  1 +
> >  include/linux/kvm_host.h          |  1 +
> >  include/uapi/linux/kvm.h          |  3 +++
> >  virt/kvm/irqchip.c                |  2 +-
> >  17 files changed, 106 insertions(+), 35 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 6955444..0744b4e 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
> >  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
> >  which is the maximum number of possibly pending cpu-local interrupts.
> >  
> > +4.96 KVM_SPLIT_IRQCHIP
> > +
> > +Capability: KVM_CAP_SPLIT_IRQCHIP
> > +Architectures: x86
> > +Type:  VM ioctl
> > +Parameters: None
> > +Returns: 0 on success, -1 on error
> > +
> > +Create a local apic for each processor in the kernel.  This differs from
> > +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
> > +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> > +interrupt requests. Fails if VCPU has already been created, or if the irqchip is
> > +already in the kernel.
> > +
> > +
> >  5. The kvm_run structure
> >  ------------------------
> >  
> > diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> > index 5a9a10b..5e6fa06 100644
> > --- a/arch/powerpc/kvm/irq.h
> > +++ b/arch/powerpc/kvm/irq.h
> > @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_in_kernel(kvm);
> > +}
> > +
> >  #endif
> > diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> > index d98e415..db876c3 100644
> > --- a/arch/s390/kvm/irq.h
> > +++ b/arch/s390/kvm/irq.h
> > @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return 1;
> >  }
> >  
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_in_kernel(kvm);
> > +}
> >  #endif
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index bbb8f4e..3ddc134 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -638,6 +638,8 @@ struct kvm_arch {
> >  	bool boot_vcpu_runs_old_kvmclock;
> >  
> >  	u64 disabled_quirks;
> > +
> > +	bool irqchip_split;
> >  };
> >  
> >  struct kvm_vm_stat {
> > diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> > index d090ecf..1237e92 100644
> > --- a/arch/x86/kvm/assigned-dev.c
> > +++ b/arch/x86/kvm/assigned-dev.c
> > @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
> >  {
> >  	unsigned long guest_irq_type, host_irq_type;
> >  
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return -EINVAL;
> >  	/* no irq assignment to deassign */
> >  	if (!assigned_dev->irq_requested_type)
> > @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
> >  	struct kvm_assigned_dev_kernel *match;
> >  	unsigned long host_irq_type, guest_irq_type;
> >  
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return r;
> >  
> >  	mutex_lock(&kvm->lock);
> > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > index a1ec6a50..706e47a 100644
> > --- a/arch/x86/kvm/irq.c
> > +++ b/arch/x86/kvm/irq.c
> > @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
> >   */
> >  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> >  {
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.pending;
> >  
> >  	if (kvm_cpu_has_extint(v))
> > @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> >   */
> >  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> >  {
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.pending;
> >  
> >  	if (kvm_cpu_has_extint(v))
> > @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> >  {
> >  	int vector;
> >  
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.nr;
> >  
> >  	vector = kvm_cpu_get_extint(v);
> > diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> > index ad68c73..e46abf3 100644
> > --- a/arch/x86/kvm/irq.h
> > +++ b/arch/x86/kvm/irq.h
> > @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > +static inline int irqchip_split(struct kvm *kvm)
> > +{
> > +	return kvm->arch.irqchip_split;
> > +}
> > +
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> > +}
> > +
> > +
> >  void kvm_pic_reset(struct kvm_kpic_state *s);
> >  
> >  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> > index 9efff9e..f43c59a 100644
> > --- a/arch/x86/kvm/irq_comm.c
> > +++ b/arch/x86/kvm/irq_comm.c
> > @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
> >  	return kvm_set_irq_routing(kvm, default_routing,
> >  				   ARRAY_SIZE(default_routing), 0);
> >  }
> > +
> > +static const struct kvm_irq_routing_entry empty_routing[] = {};
> > +
> > +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> > +{
> > +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> > +}
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index dc5b57b..bc392a6 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -209,7 +209,8 @@ out:
> >  	if (old)
> >  		kfree_rcu(old, rcu);
> >  
> > -	kvm_vcpu_request_scan_ioapic(kvm);
> > +	if (!irqchip_split(kvm))
> > +		kvm_vcpu_request_scan_ioapic(kvm);
> >  }
> >  
> >  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> > @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> >  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
> >  				apic_find_highest_isr(apic));
> >  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> > -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> > +	if (!irqchip_split(vcpu->kvm))
> > +		kvm_rtc_eoi_tracking_restore_one(vcpu);
> >  }
> >  
> >  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> > @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
> >  	    /* Cache not set: could be safe but we don't bother. */
> >  	    apic->highest_isr_cache == -1 ||
> >  	    /* Need EOI to update ioapic. */
> > -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> > +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
> > +	    irqchip_split(vcpu->kvm)) {
> >  		/*
> >  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
> >  		 * so we need not do anything here.
> > @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >  	struct kvm_lapic *apic = vcpu->arch.apic;
> >  	u32 reg = (msr - APIC_BASE_MSR) << 4;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> >  		return 1;
> >  
> >  	if (reg == APIC_ICR2)
> > @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
> >  	struct kvm_lapic *apic = vcpu->arch.apic;
> >  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> >  		return 1;
> >  
> >  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index b78e83f..e5bf6db 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >  
> >  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
> >  {
> > -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> > +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
> >  		     kvm_event_needs_reinjection(vcpu)))
> >  		return false;
> >  
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 8954d7a..7ac9ec2 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
> >  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
> >  	/* instruction emulation calls kvm_set_cr8() */
> >  	r = cr_interception(svm);
> > -	if (irqchip_in_kernel(svm->vcpu.kvm))
> > +	if (lapic_in_kernel(svm->vcpu.kvm))
> >  		return r;
> >  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
> >  		return r;
> > @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
> >  	 * If the user space waits to inject interrupts, exit as soon as
> >  	 * possible
> >  	 */
> > -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> > +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
> >  	    kvm_run->request_interrupt_window &&
> >  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
> >  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index bcb61b0..a53747f 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
> >  
> >  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
> >  {
> > -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> > +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
> >  }
> >  
> >  static inline bool cpu_has_secondary_exec_ctrls(void)
> > @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
> >  
> >  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
> >  {
> > -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> > +	return flexpriority_enabled && lapic_in_kernel(kvm);
> >  }
> >  
> >  static inline bool cpu_has_vmx_vpid(void)
> > @@ -4341,7 +4341,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
> >  
> >  static int vmx_vm_has_apicv(struct kvm *kvm)
> >  {
> > -	return enable_apicv && irqchip_in_kernel(kvm);
> > +	return enable_apicv && lapic_in_kernel(kvm);
> >  }
> >  
> >  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> > @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
> >  				u8 cr8 = (u8)val;
> >  				err = kvm_set_cr8(vcpu, cr8);
> >  				kvm_complete_insn_gp(vcpu, err);
> > -				if (irqchip_in_kernel(vcpu->kvm))
> > +				if (lapic_in_kernel(vcpu->kvm))
> >  					return 1;
> >  				if (cr8_prev <= cr8)
> >  					return 1;
> > @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
> >  	 * If the user space waits to inject interrupts, exit as soon as
> >  	 * possible
> >  	 */
> > -	if (!irqchip_in_kernel(vcpu->kvm) &&
> > +	if (!lapic_in_kernel(vcpu->kvm) &&
> >  	    vcpu->run->request_interrupt_window &&
> >  	    !kvm_cpu_has_interrupt(vcpu)) {
> >  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> >  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
> >  	 * emulated by vmx_set_efer(), below.
> >  	 */
> > -	vm_entry_controls_init(vmx, 
> > +	vm_entry_controls_init(vmx,
> >  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
> >  			~VM_ENTRY_IA32E_MODE) |
> >  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index cde5d61..7505b39 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
> >  {
> >  	if (cr8 & CR8_RESERVED_BITS)
> >  		return 1;
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		kvm_lapic_set_tpr(vcpu, cr8);
> >  	else
> >  		vcpu->arch.cr8 = cr8;
> > @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
> >  
> >  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
> >  {
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		return kvm_lapic_get_cr8(vcpu);
> >  	else
> >  		return vcpu->arch.cr8;
> > @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >  	case KVM_CAP_TSC_DEADLINE_TIMER:
> >  	case KVM_CAP_ENABLE_CAP_VM:
> >  	case KVM_CAP_DISABLE_QUIRKS:
> > +	case KVM_CAP_SPLIT_IRQCHIP:
> >  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> >  	case KVM_CAP_ASSIGN_DEV_IRQ:
> >  	case KVM_CAP_PCI_2_3:
> > @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
> >  {
> >  	if (irq->irq >= KVM_NR_INTERRUPTS)
> >  		return -EINVAL;
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		return -ENXIO;
> >  
> >  	kvm_queue_interrupt(vcpu, irq->irq, false);
> > @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> >  		struct kvm_vapic_addr va;
> >  
> >  		r = -EINVAL;
> > -		if (!irqchip_in_kernel(vcpu->kvm))
> > +		if (!lapic_in_kernel(vcpu->kvm))
> >  			goto out;
> >  		r = -EFAULT;
> >  		if (copy_from_user(&va, argp, sizeof va))
> > @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> >  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
> >  			bool line_status)
> >  {
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return -ENXIO;
> >  
> >  	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> > @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
> >  		break;
> >  	}
> > +	case KVM_CREATE_SPLIT_IRQCHIP: {
> > +		mutex_lock(&kvm->lock);
> > +		r = -EEXIST;
> > +		if (lapic_in_kernel(kvm))
> > +			goto split_irqchip_unlock;
> > +		r = -EINVAL;
> > +		if (atomic_read(&kvm->online_vcpus))
> > +			goto split_irqchip_unlock;
> > +		r = kvm_setup_empty_irq_routing(kvm);
> > +		if (r)
> > +			goto split_irqchip_unlock;
> > +		kvm->arch.irqchip_split = true;
> > +		r = 0;
> > +split_irqchip_unlock:
> > +		mutex_unlock(&kvm->lock);
> > +		break;
> > +	}
> > +
> >  	default:
> >  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> >  	}
> > @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
> >  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
> >  {
> >  	++vcpu->stat.halt_exits;
> > -	if (irqchip_in_kernel(vcpu->kvm)) {
> > +	if (lapic_in_kernel(vcpu->kvm)) {
> >  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> >  		return 1;
> >  	} else {
> > @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
> >   */
> >  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
> >  {
> > -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> > +	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> >  		vcpu->run->request_interrupt_window &&
> >  		kvm_arch_interrupt_allowed(vcpu));
> >  }
> > @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
> >  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
> >  	kvm_run->cr8 = kvm_get_cr8(vcpu);
> >  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		kvm_run->ready_for_interrupt_injection = 1;
> >  	else
> >  		kvm_run->ready_for_interrupt_injection =
> > @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> >  {
> >  	struct page *page = NULL;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm))
> > +	if (!lapic_in_kernel(vcpu->kvm))
> >  		return;
> >  
> >  	if (!kvm_x86_ops->set_apic_access_page_addr)
> > @@ -6255,7 +6274,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
> >  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  {
> >  	int r;
> > -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> > +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
> >  		vcpu->run->request_interrupt_window;
> >  	bool req_immediate_exit = false;
> >  
> > @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> >  	}
> >  
> >  	/* re-sync apic's tpr */
> > -	if (!irqchip_in_kernel(vcpu->kvm)) {
> > +	if (!lapic_in_kernel(vcpu->kvm)) {
> >  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> >  			r = -EINVAL;
> >  			goto out;
> > @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void *rtn)
> >  
> >  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
> >  {
> > -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> >  }
> >  
> >  struct static_key kvm_no_apic_vcpu __read_mostly;
> > @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> >  
> >  	vcpu->arch.pv.pv_unhalted = false;
> >  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> > -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> >  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >  	else
> >  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> > @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> >  	if (r < 0)
> >  		goto fail_free_pio_data;
> >  
> > -	if (irqchip_in_kernel(kvm)) {
> > +	if (lapic_in_kernel(kvm)) {
> >  		r = kvm_create_lapic(vcpu);
> >  		if (r < 0)
> >  			goto fail_mmu_destroy;
> > @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> >  	kvm_mmu_destroy(vcpu);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> >  	free_page((unsigned long)vcpu->arch.pio_data);
> > -	if (!irqchip_in_kernel(vcpu->kvm))
> > +	if (!lapic_in_kernel(vcpu->kvm))
> >  		static_key_slow_dec(&kvm_no_apic_vcpu);
> >  }
> >  
> > diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> > index 133ea00..ffe1f4e 100644
> > --- a/include/kvm/arm_vgic.h
> > +++ b/include/kvm/arm_vgic.h
> > @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
> >  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
> >  
> >  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> > +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
> >  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
> >  #define vgic_ready(k)		((k)->arch.vgic.ready)
> >  
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 87fd74a..277b7a1 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> >  #endif
> >  
> >  int kvm_setup_default_irq_routing(struct kvm *kvm);
> > +int kvm_setup_empty_irq_routing(struct kvm *kvm);
> >  int kvm_set_irq_routing(struct kvm *kvm,
> >  			const struct kvm_irq_routing_entry *entries,
> >  			unsigned nr,
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 75bd9f7..7d06dc4 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
> >  #define KVM_CAP_S390_IRQ_STATE 114
> >  #define KVM_CAP_PPC_HWRNG 115
> >  #define KVM_CAP_DISABLE_QUIRKS 116
> > +#define KVM_CAP_SPLIT_IRQCHIP 117
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
> >  /* Available with KVM_CAP_S390_IRQ_STATE */
> >  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
> >  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
> > +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> > +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> >  
> >  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> >  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> > diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> > index 1d56a90..8aaceed 100644
> > --- a/virt/kvm/irqchip.c
> > +++ b/virt/kvm/irqchip.c
> > @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
> >  {
> >  	struct kvm_kernel_irq_routing_entry route;
> >  
> > -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> > +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
> >  		return -EINVAL;
> >  
> >  	route.msi.address_lo = msi->address_lo;
> > 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wu, Feng May 14, 2015, 9:12 a.m. UTC | #3
> -----Original Message-----
> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
> Behalf Of Steve Rutherford
> Sent: Thursday, May 14, 2015 6:10 AM
> To: Paolo Bonzini
> Cc: kvm@vger.kernel.org; ahonig@google.com
> Subject: Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
> 
> On Wed, May 13, 2015 at 09:57:00AM +0200, Paolo Bonzini wrote:
> >
> >
> > On 13/05/2015 03:47, Steve Rutherford wrote:
> > > First patch in a series which enables the relocation of the
> > > PIC/IOAPIC/PIT to userspace.
> > >
> > > Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> > >
> > > KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> > > of the irqchip.
> >
> > The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
> > seem to create a local APIC.
> >
> 
>  This enables the creation of LAPICs every time a VCPU is created, which I
> guess is different. I'll change the language to reflect that.

vlapic is always created by kvm_vcpu_init() --> kvm_arch_vcpu_init() --> kvm_create_lapic(),
right? So this IOCTL just sets a flag, which is a hint for us to create the lapic or not.

BTW, what is the purpose of this series? If I understand it correctly, you only want to
use the in-kernel lapic and leave the others (pic, ioapic) in userspace. What is the
benefit of that?

Thanks,
Feng

> 
> > > Compile tested for x86.
> >
> > The capability is fine.  However, instead of introducing a new ioctl,
> > you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
> > (per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
> > you have in your code.
> >
> 
> ENABLE_CAP seems like a better way to go. I'll update this to use it instead of
> the IOCTL.
> 
> Steve
> > Paolo
> >
> > > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > > Suggested-by: Andrew Honig <ahonig@google.com>
> > > ---
> > >  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
> > >  arch/powerpc/kvm/irq.h            |  5 ++++
> > >  arch/s390/kvm/irq.h               |  4 ++++
> > >  arch/x86/include/asm/kvm_host.h   |  2 ++
> > >  arch/x86/kvm/assigned-dev.c       |  4 ++--
> > >  arch/x86/kvm/irq.c                |  6 ++---
> > >  arch/x86/kvm/irq.h                | 11 +++++++++
> > >  arch/x86/kvm/irq_comm.c           |  7 ++++++
> > >  arch/x86/kvm/lapic.c              | 13 +++++++----
> > >  arch/x86/kvm/mmu.c                |  2 +-
> > >  arch/x86/kvm/svm.c                |  4 ++--
> > >  arch/x86/kvm/vmx.c                | 12 +++++-----
> > >  arch/x86/kvm/x86.c                | 49
> +++++++++++++++++++++++++++------------
> > >  include/kvm/arm_vgic.h            |  1 +
> > >  include/linux/kvm_host.h          |  1 +
> > >  include/uapi/linux/kvm.h          |  3 +++
> > >  virt/kvm/irqchip.c                |  2 +-
> > >  17 files changed, 106 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/Documentation/virtual/kvm/api.txt
> b/Documentation/virtual/kvm/api.txt
> > > index 6955444..0744b4e 100644
> > > --- a/Documentation/virtual/kvm/api.txt
> > > +++ b/Documentation/virtual/kvm/api.txt
> > > @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct
> kvm_s390_irq). It must be > 0
> > >  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
> > >  which is the maximum number of possibly pending cpu-local interrupts.
> > >
> > > +4.96 KVM_SPLIT_IRQCHIP
> > > +
> > > +Capability: KVM_CAP_SPLIT_IRQCHIP
> > > +Architectures: x86
> > > +Type:  VM ioctl
> > > +Parameters: None
> > > +Returns: 0 on success, -1 on error
> > > +
> > > +Create a local apic for each processor in the kernel.  This differs from
> > > +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates
> neither
> > > +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> > > +interrupt requests. Fails if VCPU has already been created, or if the irqchip
> is
> > > +already in the kernel.
> > > +
> > > +
> > >  5. The kvm_run structure
> > >  ------------------------
> > >
> > > diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> > > index 5a9a10b..5e6fa06 100644
> > > --- a/arch/powerpc/kvm/irq.h
> > > +++ b/arch/powerpc/kvm/irq.h
> > > @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return ret;
> > >  }
> > >
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_in_kernel(kvm);
> > > +}
> > > +
> > >  #endif
> > > diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> > > index d98e415..db876c3 100644
> > > --- a/arch/s390/kvm/irq.h
> > > +++ b/arch/s390/kvm/irq.h
> > > @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return 1;
> > >  }
> > >
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_in_kernel(kvm);
> > > +}
> > >  #endif
> > > diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> > > index bbb8f4e..3ddc134 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -638,6 +638,8 @@ struct kvm_arch {
> > >  	bool boot_vcpu_runs_old_kvmclock;
> > >
> > >  	u64 disabled_quirks;
> > > +
> > > +	bool irqchip_split;
> > >  };
> > >
> > >  struct kvm_vm_stat {
> > > diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> > > index d090ecf..1237e92 100644
> > > --- a/arch/x86/kvm/assigned-dev.c
> > > +++ b/arch/x86/kvm/assigned-dev.c
> > > @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
> > >  {
> > >  	unsigned long guest_irq_type, host_irq_type;
> > >
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return -EINVAL;
> > >  	/* no irq assignment to deassign */
> > >  	if (!assigned_dev->irq_requested_type)
> > > @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm
> *kvm,
> > >  	struct kvm_assigned_dev_kernel *match;
> > >  	unsigned long host_irq_type, guest_irq_type;
> > >
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return r;
> > >
> > >  	mutex_lock(&kvm->lock);
> > > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > > index a1ec6a50..706e47a 100644
> > > --- a/arch/x86/kvm/irq.c
> > > +++ b/arch/x86/kvm/irq.c
> > > @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
> > >   */
> > >  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> > >  {
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.pending;
> > >
> > >  	if (kvm_cpu_has_extint(v))
> > > @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> > >   */
> > >  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> > >  {
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.pending;
> > >
> > >  	if (kvm_cpu_has_extint(v))
> > > @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> > >  {
> > >  	int vector;
> > >
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.nr;
> > >
> > >  	vector = kvm_cpu_get_extint(v);
> > > diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> > > index ad68c73..e46abf3 100644
> > > --- a/arch/x86/kvm/irq.h
> > > +++ b/arch/x86/kvm/irq.h
> > > @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return ret;
> > >  }
> > >
> > > +static inline int irqchip_split(struct kvm *kvm)
> > > +{
> > > +	return kvm->arch.irqchip_split;
> > > +}
> > > +
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> > > +}
> > > +
> > > +
> > >  void kvm_pic_reset(struct kvm_kpic_state *s);
> > >
> > >  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> > > diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> > > index 9efff9e..f43c59a 100644
> > > --- a/arch/x86/kvm/irq_comm.c
> > > +++ b/arch/x86/kvm/irq_comm.c
> > > @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm
> *kvm)
> > >  	return kvm_set_irq_routing(kvm, default_routing,
> > >  				   ARRAY_SIZE(default_routing), 0);
> > >  }
> > > +
> > > +static const struct kvm_irq_routing_entry empty_routing[] = {};
> > > +
> > > +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> > > +{
> > > +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> > > +}
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index dc5b57b..bc392a6 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -209,7 +209,8 @@ out:
> > >  	if (old)
> > >  		kfree_rcu(old, rcu);
> > >
> > > -	kvm_vcpu_request_scan_ioapic(kvm);
> > > +	if (!irqchip_split(kvm))
> > > +		kvm_vcpu_request_scan_ioapic(kvm);
> > >  }
> > >
> > >  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> > > @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct
> kvm_vcpu *vcpu,
> > >  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
> > >  				apic_find_highest_isr(apic));
> > >  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> > > -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> > > +	if (!irqchip_split(vcpu->kvm))
> > > +		kvm_rtc_eoi_tracking_restore_one(vcpu);
> > >  }
> > >
> > >  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> > > @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct
> kvm_vcpu *vcpu,
> > >  	    /* Cache not set: could be safe but we don't bother. */
> > >  	    apic->highest_isr_cache == -1 ||
> > >  	    /* Need EOI to update ioapic. */
> > > -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> > > +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)
> ||
> > > +	    irqchip_split(vcpu->kvm)) {
> > >  		/*
> > >  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
> > >  		 * so we need not do anything here.
> > > @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu
> *vcpu, u32 msr, u64 data)
> > >  	struct kvm_lapic *apic = vcpu->arch.apic;
> > >  	u32 reg = (msr - APIC_BASE_MSR) << 4;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > >  		return 1;
> > >
> > >  	if (reg == APIC_ICR2)
> > > @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu
> *vcpu, u32 msr, u64 *data)
> > >  	struct kvm_lapic *apic = vcpu->arch.apic;
> > >  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > >  		return 1;
> > >
> > >  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index b78e83f..e5bf6db 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct
> kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> > >
> > >  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
> > >  {
> > > -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> > > +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
> > >  		     kvm_event_needs_reinjection(vcpu)))
> > >  		return false;
> > >
> > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > > index 8954d7a..7ac9ec2 100644
> > > --- a/arch/x86/kvm/svm.c
> > > +++ b/arch/x86/kvm/svm.c
> > > @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct
> vcpu_svm *svm)
> > >  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
> > >  	/* instruction emulation calls kvm_set_cr8() */
> > >  	r = cr_interception(svm);
> > > -	if (irqchip_in_kernel(svm->vcpu.kvm))
> > > +	if (lapic_in_kernel(svm->vcpu.kvm))
> > >  		return r;
> > >  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
> > >  		return r;
> > > @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct
> vcpu_svm *svm)
> > >  	 * If the user space waits to inject interrupts, exit as soon as
> > >  	 * possible
> > >  	 */
> > > -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> > > +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
> > >  	    kvm_run->request_interrupt_window &&
> > >  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
> > >  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > > index bcb61b0..a53747f 100644
> > > --- a/arch/x86/kvm/vmx.c
> > > +++ b/arch/x86/kvm/vmx.c
> > > @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
> > >
> > >  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
> > >  {
> > > -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> > > +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static inline bool cpu_has_secondary_exec_ctrls(void)
> > > @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
> > >
> > >  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
> > >  {
> > > -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> > > +	return flexpriority_enabled && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static inline bool cpu_has_vmx_vpid(void)
> > > @@ -4341,7 +4341,7 @@ static void
> vmx_disable_intercept_msr_write_x2apic(u32 msr)
> > >
> > >  static int vmx_vm_has_apicv(struct kvm *kvm)
> > >  {
> > > -	return enable_apicv && irqchip_in_kernel(kvm);
> > > +	return enable_apicv && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu
> *vcpu)
> > > @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
> > >  				u8 cr8 = (u8)val;
> > >  				err = kvm_set_cr8(vcpu, cr8);
> > >  				kvm_complete_insn_gp(vcpu, err);
> > > -				if (irqchip_in_kernel(vcpu->kvm))
> > > +				if (lapic_in_kernel(vcpu->kvm))
> > >  					return 1;
> > >  				if (cr8_prev <= cr8)
> > >  					return 1;
> > > @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct
> kvm_vcpu *vcpu)
> > >  	 * If the user space waits to inject interrupts, exit as soon as
> > >  	 * possible
> > >  	 */
> > > -	if (!irqchip_in_kernel(vcpu->kvm) &&
> > > +	if (!lapic_in_kernel(vcpu->kvm) &&
> > >  	    vcpu->run->request_interrupt_window &&
> > >  	    !kvm_cpu_has_interrupt(vcpu)) {
> > >  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > > @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu
> *vcpu, struct vmcs12 *vmcs12)
> > >  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and
> VM_ENTRY_IA32E_MODE are
> > >  	 * emulated by vmx_set_efer(), below.
> > >  	 */
> > > -	vm_entry_controls_init(vmx,
> > > +	vm_entry_controls_init(vmx,
> > >  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER
> &
> > >  			~VM_ENTRY_IA32E_MODE) |
> > >  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index cde5d61..7505b39 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned
> long cr8)
> > >  {
> > >  	if (cr8 & CR8_RESERVED_BITS)
> > >  		return 1;
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		kvm_lapic_set_tpr(vcpu, cr8);
> > >  	else
> > >  		vcpu->arch.cr8 = cr8;
> > > @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
> > >
> > >  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
> > >  {
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		return kvm_lapic_get_cr8(vcpu);
> > >  	else
> > >  		return vcpu->arch.cr8;
> > > @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> *kvm, long ext)
> > >  	case KVM_CAP_TSC_DEADLINE_TIMER:
> > >  	case KVM_CAP_ENABLE_CAP_VM:
> > >  	case KVM_CAP_DISABLE_QUIRKS:
> > > +	case KVM_CAP_SPLIT_IRQCHIP:
> > >  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> > >  	case KVM_CAP_ASSIGN_DEV_IRQ:
> > >  	case KVM_CAP_PCI_2_3:
> > > @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct
> kvm_vcpu *vcpu,
> > >  {
> > >  	if (irq->irq >= KVM_NR_INTERRUPTS)
> > >  		return -EINVAL;
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		return -ENXIO;
> > >
> > >  	kvm_queue_interrupt(vcpu, irq->irq, false);
> > > @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> > >  		struct kvm_vapic_addr va;
> > >
> > >  		r = -EINVAL;
> > > -		if (!irqchip_in_kernel(vcpu->kvm))
> > > +		if (!lapic_in_kernel(vcpu->kvm))
> > >  			goto out;
> > >  		r = -EFAULT;
> > >  		if (copy_from_user(&va, argp, sizeof va))
> > > @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
> struct kvm_dirty_log *log)
> > >  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level
> *irq_event,
> > >  			bool line_status)
> > >  {
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return -ENXIO;
> > >
> > >  	irq_event->status = kvm_set_irq(kvm,
> KVM_USERSPACE_IRQ_SOURCE_ID,
> > > @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
> > >  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
> > >  		break;
> > >  	}
> > > +	case KVM_CREATE_SPLIT_IRQCHIP: {
> > > +		mutex_lock(&kvm->lock);
> > > +		r = -EEXIST;
> > > +		if (lapic_in_kernel(kvm))
> > > +			goto split_irqchip_unlock;
> > > +		r = -EINVAL;
> > > +		if (atomic_read(&kvm->online_vcpus))
> > > +			goto split_irqchip_unlock;
> > > +		r = kvm_setup_empty_irq_routing(kvm);
> > > +		if (r)
> > > +			goto split_irqchip_unlock;
> > > +		kvm->arch.irqchip_split = true;
> > > +		r = 0;
> > > +split_irqchip_unlock:
> > > +		mutex_unlock(&kvm->lock);
> > > +		break;
> > > +	}
> > > +
> > >  	default:
> > >  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> > >  	}
> > > @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
> > >  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
> > >  {
> > >  	++vcpu->stat.halt_exits;
> > > -	if (irqchip_in_kernel(vcpu->kvm)) {
> > > +	if (lapic_in_kernel(vcpu->kvm)) {
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> > >  		return 1;
> > >  	} else {
> > > @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct
> x86_emulate_ctxt *ctxt)
> > >   */
> > >  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
> > >  {
> > > -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu)
> &&
> > > +	return (!lapic_in_kernel(vcpu->kvm)
> && !kvm_cpu_has_interrupt(vcpu) &&
> > >  		vcpu->run->request_interrupt_window &&
> > >  		kvm_arch_interrupt_allowed(vcpu));
> > >  }
> > > @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu
> *vcpu)
> > >  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
> > >  	kvm_run->cr8 = kvm_get_cr8(vcpu);
> > >  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		kvm_run->ready_for_interrupt_injection = 1;
> > >  	else
> > >  		kvm_run->ready_for_interrupt_injection =
> > > @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct
> kvm_vcpu *vcpu)
> > >  {
> > >  	struct page *page = NULL;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm))
> > > +	if (!lapic_in_kernel(vcpu->kvm))
> > >  		return;
> > >
> > >  	if (!kvm_x86_ops->set_apic_access_page_addr)
> > > @@ -6255,7 +6274,7 @@ void
> kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
> > >  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > >  {
> > >  	int r;
> > > -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> > > +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
> > >  		vcpu->run->request_interrupt_window;
> > >  	bool req_immediate_exit = false;
> > >
> > > @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> *vcpu, struct kvm_run *kvm_run)
> > >  	}
> > >
> > >  	/* re-sync apic's tpr */
> > > -	if (!irqchip_in_kernel(vcpu->kvm)) {
> > > +	if (!lapic_in_kernel(vcpu->kvm)) {
> > >  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> > >  			r = -EINVAL;
> > >  			goto out;
> > > @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void
> *rtn)
> > >
> > >  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
> > >  {
> > > -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > > +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > >  }
> > >
> > >  struct static_key kvm_no_apic_vcpu __read_mostly;
> > > @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> > >
> > >  	vcpu->arch.pv.pv_unhalted = false;
> > >  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> > > -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > > +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > >  	else
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> > > @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> > >  	if (r < 0)
> > >  		goto fail_free_pio_data;
> > >
> > > -	if (irqchip_in_kernel(kvm)) {
> > > +	if (lapic_in_kernel(kvm)) {
> > >  		r = kvm_create_lapic(vcpu);
> > >  		if (r < 0)
> > >  			goto fail_mmu_destroy;
> > > @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu
> *vcpu)
> > >  	kvm_mmu_destroy(vcpu);
> > >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > >  	free_page((unsigned long)vcpu->arch.pio_data);
> > > -	if (!irqchip_in_kernel(vcpu->kvm))
> > > +	if (!lapic_in_kernel(vcpu->kvm))
> > >  		static_key_slow_dec(&kvm_no_apic_vcpu);
> > >  }
> > >
> > > diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> > > index 133ea00..ffe1f4e 100644
> > > --- a/include/kvm/arm_vgic.h
> > > +++ b/include/kvm/arm_vgic.h
> > > @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu
> *vcpu);
> > >  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
> > >
> > >  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> > > +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
> > >  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
> > >  #define vgic_ready(k)		((k)->arch.vgic.ready)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 87fd74a..277b7a1 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm
> *kvm, unsigned long mmu_seq)
> > >  #endif
> > >
> > >  int kvm_setup_default_irq_routing(struct kvm *kvm);
> > > +int kvm_setup_empty_irq_routing(struct kvm *kvm);
> > >  int kvm_set_irq_routing(struct kvm *kvm,
> > >  			const struct kvm_irq_routing_entry *entries,
> > >  			unsigned nr,
> > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > index 75bd9f7..7d06dc4 100644
> > > --- a/include/uapi/linux/kvm.h
> > > +++ b/include/uapi/linux/kvm.h
> > > @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
> > >  #define KVM_CAP_S390_IRQ_STATE 114
> > >  #define KVM_CAP_PPC_HWRNG 115
> > >  #define KVM_CAP_DISABLE_QUIRKS 116
> > > +#define KVM_CAP_SPLIT_IRQCHIP 117
> > >
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > >
> > > @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
> > >  /* Available with KVM_CAP_S390_IRQ_STATE */
> > >  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct
> kvm_s390_irq_state)
> > >  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct
> kvm_s390_irq_state)
> > > +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> > > +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> > >
> > >  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> > >  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> > > diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> > > index 1d56a90..8aaceed 100644
> > > --- a/virt/kvm/irqchip.c
> > > +++ b/virt/kvm/irqchip.c
> > > @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm,
> struct kvm_msi *msi)
> > >  {
> > >  	struct kvm_kernel_irq_routing_entry route;
> > >
> > > -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> > > +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
> > >  		return -EINVAL;
> > >
> > >  	route.msi.address_lo = msi->address_lo;
> > >
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Honig May 14, 2015, 7:29 p.m. UTC | #4
>
> BTW, what is the purpose of this series. If I understand it correctly, you only want to
> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
> benefit of it?

The purpose is to achieve the security benefit of removing some of the
interrupt handling into userspace, without incurring a significant
performance penalty.  If you move the entire IRQCHIP into userspace,
we've seen perf impacts from 15-200% depending on the workload.  With
this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
latency, TCP throughput, and Disk I/O).  See
(https://lwn.net/Articles/619332/)

>
> Thanks,
> Feng
>
>>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wu, Feng May 15, 2015, 1:28 a.m. UTC | #5
> -----Original Message-----
> From: Andrew Honig [mailto:ahonig@google.com]
> Sent: Friday, May 15, 2015 3:29 AM
> To: Wu, Feng
> Cc: Steve Rutherford; Paolo Bonzini; kvm@vger.kernel.org
> Subject: Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
>
> >
> > BTW, what is the purpose of this series. If I understand it correctly, you only want to
> > use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
> > benefit of it?
>
> The purpose is to achieve the security benefit of removing some of the
> interrupt handling into userspace, without incurring a significant
> performance penalty.  If you move the entire IRQCHIP into userspace,
> we've seen perf impacts from 15-200% depending on the workload.  With
> this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
> latency, TCP throughput, and Disk I/O).  See
> (https://lwn.net/Articles/619332/)


Good to know, thanks for sharing!

Thanks,
Feng

>
> >
> > Thanks,
> > Feng
> >
> >>
Wanpeng Li May 15, 2015, 5:03 a.m. UTC | #6
On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
>>
>> BTW, what is the purpose of this series. If I understand it correctly, you only want to
>> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
>> benefit of it?
>
>The purpose is to achieve the security benefit of removing some of the
>interrupt handling into userspace, without incurring a significant
>performance penalty.  If you move the entire IRQCHIP into userspace,
>we've seen perf impacts from 15-200% depending on the workload.  With
>this patch series, we're seeing perf penalty <1% on our tests (TCP_RR

Why keep pic and ioapic in kernel space not get obvious benefit, what's
the bottleneck?

Regards,
Wanpeng Li 

>latency, TCP throughput, and Disk I/O).  See
>(https://lwn.net/Articles/619332/)
>
>>
>> Thanks,
>> Feng
>>
>>>
>--
>To unsubscribe from this list: send the line "unsubscribe kvm" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Rutherford May 15, 2015, 6:10 p.m. UTC | #7
On Fri, May 15, 2015 at 01:03:02PM +0800, Wanpeng Li wrote:
> On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
> >>
> >> BTW, what is the purpose of this series. If I understand it correctly, you only want to
> >> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
> >> benefit of it?
> >
> >The purpose is to achieve the security benefit of removing some of the
> >interrupt handling into userspace, without incurring a significant
> >performance penalty.  If you move the entire IRQCHIP into userspace,
> >we've seen perf impacts from 15-200% depending on the workload.  With
> >this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
> 
> Why keep pic and ioapic in kernel space not get obvious benefit, what's
> the bottleneck?

It's the other way around. The PIC and IOAPIC are going up to userspace, and the APICs are staying in the kernel.

> 
> Regards,
> Wanpeng Li 
> 
> >latency, TCP throughput, and Disk I/O).  See
> >(https://lwn.net/Articles/619332/)
> >
> >>
> >> Thanks,
> >> Feng
> >>
> >>>
> >--
> >To unsubscribe from this list: send the line "unsubscribe kvm" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li May 18, 2015, 2:11 a.m. UTC | #8
On Fri, May 15, 2015 at 11:10:23AM -0700, Steve Rutherford wrote:
>On Fri, May 15, 2015 at 01:03:02PM +0800, Wanpeng Li wrote:
>> On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
>> >>
>> >> BTW, what is the purpose of this series. If I understand it correctly, you only want to
>> >> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
>> >> benefit of it?
>> >
>> >The purpose is to achieve the security benefit of removing some of the
>> >interrupt handling into userspace, without incurring a significant
>> >performance penalty.  If you move the entire IRQCHIP into userspace,
>> >we've seen perf impacts from 15-200% depending on the workload.  With
>> >this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
>> 
>> Why keep pic and ioapic in kernel space not get obvious benefit, what's
>> the bottleneck?
>
>It's the other way around. The PIC and IOAPIC are going up to userspace, and the APICs are staying in the kernel.

Yeah, this is what you have done in your patchset. What I mean is: have you
looked into why keeping the pic and ioapic in kernel space does not bring an
obvious benefit over moving the pic and ioapic to userspace?

Regards,
Wanpeng Li 

>
>> 
>> Regards,
>> Wanpeng Li 
>> 
>> >latency, TCP throughput, and Disk I/O).  See
>> >(https://lwn.net/Articles/619332/)
>> >
>> >>
>> >> Thanks,
>> >> Feng
>> >>
>> >>>
>> >--
>> >To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >the body of a message to majordomo@vger.kernel.org
>> >More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6955444..0744b4e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2979,6 +2979,21 @@  len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
+4.96 KVM_CREATE_SPLIT_IRQCHIP
+
+Capability: KVM_CAP_SPLIT_IRQCHIP
+Architectures: x86
+Type:  VM ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel.  This differs from
+KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
+the ioapic nor the pic in the kernel.  It also enables in-kernel routing of
+interrupt requests.  Fails if a VCPU has already been created, or if the
+irqchip is already in the kernel.
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
index 5a9a10b..5e6fa06 100644
--- a/arch/powerpc/kvm/irq.h
+++ b/arch/powerpc/kvm/irq.h
@@ -17,4 +17,9 @@  static inline int irqchip_in_kernel(struct kvm *kvm)
 	return ret;
 }
 
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_in_kernel(kvm);
+}
+
 #endif
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
index d98e415..db876c3 100644
--- a/arch/s390/kvm/irq.h
+++ b/arch/s390/kvm/irq.h
@@ -19,4 +19,8 @@  static inline int irqchip_in_kernel(struct kvm *kvm)
 	return 1;
 }
 
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_in_kernel(kvm);
+}
 #endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bbb8f4e..3ddc134 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -638,6 +638,8 @@  struct kvm_arch {
 	bool boot_vcpu_runs_old_kvmclock;
 
 	u64 disabled_quirks;
+
+	bool irqchip_split;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf..1237e92 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -291,7 +291,7 @@  static int kvm_deassign_irq(struct kvm *kvm,
 {
 	unsigned long guest_irq_type, host_irq_type;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return -EINVAL;
 	/* no irq assignment to deassign */
 	if (!assigned_dev->irq_requested_type)
@@ -568,7 +568,7 @@  static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 	struct kvm_assigned_dev_kernel *match;
 	unsigned long host_irq_type, guest_irq_type;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return r;
 
 	mutex_lock(&kvm->lock);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50..706e47a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -57,7 +57,7 @@  static int kvm_cpu_has_extint(struct kvm_vcpu *v)
  */
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;
 
 	if (kvm_cpu_has_extint(v))
@@ -75,7 +75,7 @@  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
  */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;
 
 	if (kvm_cpu_has_extint(v))
@@ -103,7 +103,7 @@  int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
 	int vector;
 
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.nr;
 
 	vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ad68c73..e46abf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -92,6 +92,17 @@  static inline int irqchip_in_kernel(struct kvm *kvm)
 	return ret;
 }
 
+static inline int irqchip_split(struct kvm *kvm)
+{
+	return kvm->arch.irqchip_split;
+}
+
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
+}
+
+
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e..f43c59a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -328,3 +328,10 @@  int kvm_setup_default_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, default_routing,
 				   ARRAY_SIZE(default_routing), 0);
 }
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dc5b57b..bc392a6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,8 @@  out:
 	if (old)
 		kfree_rcu(old, rcu);
 
-	kvm_vcpu_request_scan_ioapic(kvm);
+	if (!irqchip_split(kvm))
+		kvm_vcpu_request_scan_ioapic(kvm);
 }
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -1819,7 +1820,8 @@  void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	kvm_rtc_eoi_tracking_restore_one(vcpu);
+	if (!irqchip_split(vcpu->kvm))
+		kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1902,7 +1904,8 @@  static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
 	    /* Cache not set: could be safe but we don't bother. */
 	    apic->highest_isr_cache == -1 ||
 	    /* Need EOI to update ioapic. */
-	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
+	    irqchip_split(vcpu->kvm)) {
 		/*
 		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
 		 * so we need not do anything here.
@@ -1958,7 +1961,7 @@  int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
 		return 1;
 
 	if (reg == APIC_ICR2)
@@ -1975,7 +1978,7 @@  int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
 		return 1;
 
 	if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b78e83f..e5bf6db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3465,7 +3465,7 @@  static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 {
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
 		     kvm_event_needs_reinjection(vcpu)))
 		return false;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8954d7a..7ac9ec2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3054,7 +3054,7 @@  static int cr8_write_interception(struct vcpu_svm *svm)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	r = cr_interception(svm);
-	if (irqchip_in_kernel(svm->vcpu.kvm))
+	if (lapic_in_kernel(svm->vcpu.kvm))
 		return r;
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return r;
@@ -3295,7 +3295,7 @@  static int interrupt_window_interception(struct vcpu_svm *svm)
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+	if (!lapic_in_kernel(svm->vcpu.kvm) &&
 	    kvm_run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bcb61b0..a53747f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -948,7 +948,7 @@  static inline bool cpu_has_vmx_tpr_shadow(void)
 
 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 {
-	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
 }
 
 static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -1064,7 +1064,7 @@  static inline bool cpu_has_vmx_ple(void)
 
 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
-	return flexpriority_enabled && irqchip_in_kernel(kvm);
+	return flexpriority_enabled && lapic_in_kernel(kvm);
 }
 
 static inline bool cpu_has_vmx_vpid(void)
@@ -4341,7 +4341,7 @@  static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 
 static int vmx_vm_has_apicv(struct kvm *kvm)
 {
-	return enable_apicv && irqchip_in_kernel(kvm);
+	return enable_apicv && lapic_in_kernel(kvm);
 }
 
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5317,7 +5317,7 @@  static int handle_cr(struct kvm_vcpu *vcpu)
 				u8 cr8 = (u8)val;
 				err = kvm_set_cr8(vcpu, cr8);
 				kvm_complete_insn_gp(vcpu, err);
-				if (irqchip_in_kernel(vcpu->kvm))
+				if (lapic_in_kernel(vcpu->kvm))
 					return 1;
 				if (cr8_prev <= cr8)
 					return 1;
@@ -5534,7 +5534,7 @@  static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (!irqchip_in_kernel(vcpu->kvm) &&
+	if (!lapic_in_kernel(vcpu->kvm) &&
 	    vcpu->run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(vcpu)) {
 		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
@@ -9419,7 +9419,7 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
 	 */
-	vm_entry_controls_init(vmx, 
+	vm_entry_controls_init(vmx,
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
 			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cde5d61..7505b39 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -782,7 +782,7 @@  int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	if (cr8 & CR8_RESERVED_BITS)
 		return 1;
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 		vcpu->arch.cr8 = cr8;
@@ -792,7 +792,7 @@  EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		return kvm_lapic_get_cr8(vcpu);
 	else
 		return vcpu->arch.cr8;
@@ -2800,6 +2800,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_TSC_DEADLINE_TIMER:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
+	case KVM_CAP_SPLIT_IRQCHIP:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -3002,7 +3003,7 @@  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 {
 	if (irq->irq >= KVM_NR_INTERRUPTS)
 		return -EINVAL;
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		return -ENXIO;
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
@@ -3480,7 +3481,7 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		struct kvm_vapic_addr va;
 
 		r = -EINVAL;
-		if (!irqchip_in_kernel(vcpu->kvm))
+		if (!lapic_in_kernel(vcpu->kvm))
 			goto out;
 		r = -EFAULT;
 		if (copy_from_user(&va, argp, sizeof va))
@@ -3838,7 +3839,7 @@  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 			bool line_status)
 {
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return -ENXIO;
 
 	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
@@ -4128,6 +4129,24 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
 		break;
 	}
+	case KVM_CREATE_SPLIT_IRQCHIP: {
+		mutex_lock(&kvm->lock);
+		r = -EEXIST;
+		if (lapic_in_kernel(kvm))
+			goto split_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto split_irqchip_unlock;
+		r = kvm_setup_empty_irq_routing(kvm);
+		if (r)
+			goto split_irqchip_unlock;
+		kvm->arch.irqchip_split = true;
+		r = 0;
+split_irqchip_unlock:
+		mutex_unlock(&kvm->lock);
+		break;
+	}
+
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
@@ -5893,7 +5912,7 @@  void kvm_arch_exit(void)
 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.halt_exits;
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (lapic_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 		return 1;
 	} else {
@@ -6060,7 +6079,7 @@  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
  */
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
-	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
+	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		vcpu->run->request_interrupt_window &&
 		kvm_arch_interrupt_allowed(vcpu));
 }
@@ -6072,7 +6091,7 @@  static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
@@ -6219,7 +6238,7 @@  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
 	struct page *page = NULL;
 
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu->kvm))
 		return;
 
 	if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6255,7 +6274,7 @@  void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 	bool req_immediate_exit = false;
 
@@ -6644,7 +6663,7 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	/* re-sync apic's tpr */
-	if (!irqchip_in_kernel(vcpu->kvm)) {
+	if (!lapic_in_kernel(vcpu->kvm)) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
 			r = -EINVAL;
 			goto out;
@@ -7340,7 +7359,7 @@  void kvm_arch_check_processor_compat(void *rtn)
 
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 {
-	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
 }
 
 struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7356,7 +7375,7 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7374,7 +7393,7 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto fail_free_pio_data;
 
-	if (irqchip_in_kernel(kvm)) {
+	if (lapic_in_kernel(kvm)) {
 		r = kvm_create_lapic(vcpu);
 		if (r < 0)
 			goto fail_mmu_destroy;
@@ -7437,7 +7456,7 @@  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu->kvm))
 		static_key_slow_dec(&kvm_no_apic_vcpu);
 }
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 133ea00..ffe1f4e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -329,6 +329,7 @@  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
+#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)		((k)->arch.vgic.ready)
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 87fd74a..277b7a1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -934,6 +934,7 @@  static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 #endif
 
 int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *entries,
 			unsigned nr,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 75bd9f7..7d06dc4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -815,6 +815,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_IRQ_STATE 114
 #define KVM_CAP_PPC_HWRNG 115
 #define KVM_CAP_DISABLE_QUIRKS 116
+#define KVM_CAP_SPLIT_IRQCHIP 117
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1200,6 +1201,8 @@  struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_IRQ_STATE */
 #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
+/* Available with KVM_CAP_SPLIT_IRQCHIP */
+#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1d56a90..8aaceed 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -73,7 +73,7 @@  int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
 {
 	struct kvm_kernel_irq_routing_entry route;
 
-	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+	if (!lapic_in_kernel(kvm) || msi->flags != 0)
 		return -EINVAL;
 
 	route.msi.address_lo = msi->address_lo;