diff mbox

[Patchv5,2/7] KVM: s390: add floating irq controller

Message ID 1381244100-59056-3-git-send-email-borntraeger@de.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Christian Borntraeger Oct. 8, 2013, 2:54 p.m. UTC
From: Jens Freimann <jfrei@linux.vnet.ibm.com>

This patch adds a floating irq controller as a kvm_device.
It will be necessary for migration of floating interrupts as well
as for hardening the reset code by allowing user space to explicitly
remove all pending floating interrupts.

Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
 arch/s390/include/asm/kvm_host.h                |   1 +
 arch/s390/include/uapi/asm/kvm.h                |   5 +
 arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
 arch/s390/kvm/kvm-s390.c                        |   1 +
 include/linux/kvm_host.h                        |   1 +
 include/uapi/linux/kvm.h                        |   1 +
 virt/kvm/kvm_main.c                             |   5 +
 8 files changed, 295 insertions(+), 51 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt

Comments

Gleb Natapov Oct. 13, 2013, 8:39 a.m. UTC | #1
On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> 
> This patch adds a floating irq controller as a kvm_device.
> It will be necessary for migration of floating interrupts as well
> as for hardening the reset code by allowing user space to explicitly
> remove all pending floating interrupts.
> 
> Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
>  arch/s390/include/asm/kvm_host.h                |   1 +
>  arch/s390/include/uapi/asm/kvm.h                |   5 +
>  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
>  arch/s390/kvm/kvm-s390.c                        |   1 +
>  include/linux/kvm_host.h                        |   1 +
>  include/uapi/linux/kvm.h                        |   1 +
>  virt/kvm/kvm_main.c                             |   5 +
>  8 files changed, 295 insertions(+), 51 deletions(-)
>  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> 
> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> new file mode 100644
> index 0000000..06aef31
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> @@ -0,0 +1,36 @@
> +FLIC (floating interrupt controller)
> +====================================
> +
> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> +machine check interruptions. All interrupts are stored in a per-vm list of
> +pending interrupts. FLIC performs operations on this list.
> +
> +Only one FLIC instance may be instantiated.
> +
> +FLIC provides support to
> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> +
> +Groups:
> +  KVM_DEV_FLIC_ENQUEUE
> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> +    are taken from this list for injection into the guest. attr contains
> +    a struct kvm_s390_irq which contains all data relevant for
> +    interrupt injection.
> +    The format of the data structure kvm_s390_irq as it is copied from userspace
> +    is defined in usr/include/linux/kvm.h.
> +    For historic reasons list members are stored in a different data structure, i.e.
> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> +    which can then be added to the list.
> +
> +  KVM_DEV_FLIC_DEQUEUE
> +    Takes one element off the pending interrupts list and copies it into userspace.
> +    Dequeued interrupts are not injected into the guest.
> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> +    List elements are stored in the format of struct kvm_s390_interrupt_info
> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> +    (usr/include/linux/kvm.h)
> +
Can an interrupt be dequeued on real HW as well? When will this interface be
used?

> +  KVM_DEV_FLIC_CLEAR_IRQS
> +    Simply deletes all elements from the list of currently pending floating interrupts.
> +    No interrupts are injected into the guest.
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index 78b6918..2d09c1d 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -237,6 +237,7 @@ struct kvm_arch{
>  	struct sca_block *sca;
>  	debug_info_t *dbf;
>  	struct kvm_s390_float_interrupt float_int;
> +	struct kvm_device *flic;
>  	struct gmap *gmap;
>  	int css_support;
>  };
> diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
> index d25da59..33d52b8 100644
> --- a/arch/s390/include/uapi/asm/kvm.h
> +++ b/arch/s390/include/uapi/asm/kvm.h
> @@ -16,6 +16,11 @@
>  
>  #define __KVM_S390
>  
> +/* Device control API: s390-specific devices */
> +#define KVM_DEV_FLIC_DEQUEUE 1
> +#define KVM_DEV_FLIC_ENQUEUE 2
> +#define KVM_DEV_FLIC_CLEAR_IRQS 3
> +
>  /* for KVM_GET_REGS and KVM_SET_REGS */
>  struct kvm_regs {
>  	/* general purpose regs for s390 */
> diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> index e7323cd..66478a0 100644
> --- a/arch/s390/kvm/interrupt.c
> +++ b/arch/s390/kvm/interrupt.c
> @@ -659,53 +659,85 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
>  	return inti;
>  }
>  
> -int kvm_s390_inject_vm(struct kvm *kvm,
> -		       struct kvm_s390_interrupt *s390int)
> +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
>  {
>  	struct kvm_s390_local_interrupt *li;
>  	struct kvm_s390_float_interrupt *fi;
> -	struct kvm_s390_interrupt_info *inti, *iter;
> +	struct kvm_s390_interrupt_info *iter;
>  	int sigcpu;
>  
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (!is_ioint(inti->type)) {
> +		list_add_tail(&inti->list, &fi->list);
> +	} else {
> +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> +
> +		/* Keep I/O interrupts sorted in isc order. */
> +		list_for_each_entry(iter, &fi->list, list) {
> +			if (!is_ioint(iter->type))
> +				continue;
> +			if (int_word_to_isc_bits(iter->io.io_int_word) <= isc_bits)
> +				continue;
> +			break;
> +		}
> +		list_add_tail(&inti->list, &iter->list);
> +	}
> +	atomic_set(&fi->active, 1);
> +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> +	if (sigcpu == KVM_MAX_VCPUS) {
> +		do {
> +			sigcpu = fi->next_rr_cpu++;
> +			if (sigcpu == KVM_MAX_VCPUS)
> +				sigcpu = fi->next_rr_cpu = 0;
> +		} while (fi->local_int[sigcpu] == NULL);
> +	}
> +	li = fi->local_int[sigcpu];
> +	spin_lock_bh(&li->lock);
> +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> +	if (waitqueue_active(li->wq))
> +		wake_up_interruptible(li->wq);
> +	spin_unlock_bh(&li->lock);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +int kvm_s390_inject_vm(struct kvm *kvm,
> +		       struct kvm_s390_interrupt *s390int)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +
>  	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
>  	if (!inti)
>  		return -ENOMEM;
>  
> -	switch (s390int->type) {
> +	inti->type = s390int->type;
> +	switch (inti->type) {
>  	case KVM_S390_INT_VIRTIO:
>  		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
>  			 s390int->parm, s390int->parm64);
> -		inti->type = s390int->type;
>  		inti->ext.ext_params = s390int->parm;
>  		inti->ext.ext_params2 = s390int->parm64;
>  		break;
>  	case KVM_S390_INT_SERVICE:
>  		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> -		inti->type = s390int->type;
>  		inti->ext.ext_params = s390int->parm;
>  		break;
> -	case KVM_S390_PROGRAM_INT:
> -	case KVM_S390_SIGP_STOP:
> -	case KVM_S390_INT_EXTERNAL_CALL:
> -	case KVM_S390_INT_EMERGENCY:
> -		kfree(inti);
> -		return -EINVAL;
>  	case KVM_S390_MCHK:
>  		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
>  			 s390int->parm64);
> -		inti->type = s390int->type;
>  		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
>  		inti->mchk.mcic = s390int->parm64;
>  		break;
>  	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> -		if (s390int->type & IOINT_AI_MASK)
> +		if (inti->type & IOINT_AI_MASK)
>  			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
>  		else
>  			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
>  				 s390int->type & IOINT_CSSID_MASK,
>  				 s390int->type & IOINT_SSID_MASK,
>  				 s390int->type & IOINT_SCHID_MASK);
> -		inti->type = s390int->type;
>  		inti->io.subchannel_id = s390int->parm >> 16;
>  		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
>  		inti->io.io_int_parm = s390int->parm64 >> 32;
> @@ -718,42 +750,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
>  	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
>  				 2);
>  
> -	mutex_lock(&kvm->lock);
> -	fi = &kvm->arch.float_int;
> -	spin_lock(&fi->lock);
> -	if (!is_ioint(inti->type))
> -		list_add_tail(&inti->list, &fi->list);
> -	else {
> -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> -
> -		/* Keep I/O interrupts sorted in isc order. */
> -		list_for_each_entry(iter, &fi->list, list) {
> -			if (!is_ioint(iter->type))
> -				continue;
> -			if (int_word_to_isc_bits(iter->io.io_int_word)
> -			    <= isc_bits)
> -				continue;
> -			break;
> -		}
> -		list_add_tail(&inti->list, &iter->list);
> -	}
> -	atomic_set(&fi->active, 1);
> -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> -	if (sigcpu == KVM_MAX_VCPUS) {
> -		do {
> -			sigcpu = fi->next_rr_cpu++;
> -			if (sigcpu == KVM_MAX_VCPUS)
> -				sigcpu = fi->next_rr_cpu = 0;
> -		} while (fi->local_int[sigcpu] == NULL);
> -	}
> -	li = fi->local_int[sigcpu];
> -	spin_lock_bh(&li->lock);
> -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> -	if (waitqueue_active(li->wq))
> -		wake_up_interruptible(li->wq);
> -	spin_unlock_bh(&li->lock);
> -	spin_unlock(&fi->lock);
> -	mutex_unlock(&kvm->lock);
> +	__inject_vm(kvm, inti);
>  	return 0;
>  }
>  
> @@ -841,3 +838,200 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
>  	mutex_unlock(&vcpu->kvm->lock);
>  	return 0;
>  }
> +
> +static void clear_floating_interrupts(struct kvm *kvm)
> +{
> +	struct kvm_s390_float_interrupt *fi;
> +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> +		list_del(&inti->list);
> +		kfree(inti);
> +	}
> +	atomic_set(&fi->active, 0);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> +				   u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void __user *target;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		source = &inti->ext;
> +		target = &uptr->u.ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		source = &inti->io;
> +		target = &uptr->u.io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		source = &inti->mchk;
> +		target = &uptr->u.mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> +	if (copy_to_user(target, source, size))
> +		r = -EFAULT;
> +
> +	return r;
> +}
> +
> +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +	struct kvm_s390_float_interrupt *fi;
> +	int r = 0;
> +
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (list_empty(&fi->list)) {
> +		mutex_unlock(&kvm->lock);
> +		spin_unlock(&fi->lock);
> +		return -ENODATA;
> +	}
> +	inti = list_first_entry(&fi->list, struct kvm_s390_interrupt_info, list);
> +	list_del(&inti->list);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +
> +	r = copy_irq_to_user(inti, addr);
> +
> +	kfree(inti);
> +	return r;
> +}
> +
> +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_DEQUEUE:
> +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> +				     u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void *target = NULL;
> +	void __user *source;
> +	u64 size;
> +	int r = 0;
> +
> +	if (get_user(inti->type, (u64 __user *)addr))
> +		return -EFAULT;
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		target = (void *) &inti->ext;
> +		source = &uptr->u.ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		target = (void *) &inti->io;
> +		source = &uptr->u.io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		target = (void *) &inti->mchk;
> +		source = &uptr->u.mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	default:
> +		r = -EINVAL;
> +		return r;
> +	}
> +
> +	if (copy_from_user(target, source, size))
> +		r = -EFAULT;
> +
> +	return r;
> +}
> +
> +static int enqueue_floating_irq(struct kvm_device *dev,
> +				 struct kvm_device_attr *attr)
> +{
> +	struct kvm_s390_interrupt_info *inti = NULL;
> +	int r = 0;
> +
> +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> +	if (!inti)
> +		return -ENOMEM;
> +
> +	r = copy_irq_from_user(inti, attr->addr);
> +	if (r) {
> +		kfree(inti);
> +		return r;
> +	}
> +	__inject_vm(dev->kvm, inti);
> +
> +	return r;
> +}
> +
> +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r = 0;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_ENQUEUE:
> +		r = enqueue_floating_irq(dev, attr);
> +		break;
> +	case KVM_DEV_FLIC_CLEAR_IRQS:
> +		r = 0;
> +		clear_floating_interrupts(dev->kvm);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static int flic_create(struct kvm_device *dev, u32 type)
> +{
> +	if (!dev)
> +		return -EINVAL;
> +	if (dev->kvm->arch.flic)
> +		return -EINVAL;
> +	dev->kvm->arch.flic = dev;
> +	return 0;
> +}
> +
> +static void flic_destroy(struct kvm_device *dev)
> +{
> +	dev->kvm->arch.flic = NULL;
You need to call kfree(dev) here. There is a patch that moves this free
into common code, but it has not been merged yet.

> +}
> +
> +/* s390 floating irq controller (flic) */
> +struct kvm_device_ops kvm_flic_ops = {
> +	.name = "kvm-flic",
> +	.get_attr = flic_get_attr,
> +	.set_attr = flic_set_attr,
> +	.create = flic_create,
> +	.destroy = flic_destroy,
> +};
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 1e4e7b9..30e2c9a 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_ENABLE_CAP:
>  	case KVM_CAP_S390_CSS_SUPPORT:
>  	case KVM_CAP_IOEVENTFD:
> +	case KVM_CAP_DEVICE_CTRL:
>  		r = 1;
>  		break;
>  	case KVM_CAP_NR_VCPUS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7c961e1..2077dd0 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1065,6 +1065,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
>  
>  extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
> +extern struct kvm_device_ops kvm_flic_ops;
>  
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 450fae8..fa59f1a 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -906,6 +906,7 @@ struct kvm_device_attr {
>  #define KVM_DEV_TYPE_FSL_MPIC_20	1
>  #define KVM_DEV_TYPE_FSL_MPIC_42	2
>  #define KVM_DEV_TYPE_XICS		3
> +#define KVM_DEV_TYPE_FLIC		5
>  
>  /*
>   * ioctls for VM fds
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d469114..dd2cc28 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2270,6 +2270,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>  		ops = &kvm_xics_ops;
>  		break;
>  #endif
> +#ifdef CONFIG_S390
> +	case KVM_DEV_TYPE_FLIC:
> +		ops = &kvm_flic_ops;
> +		break;
> +#endif
>  	default:
>  		return -ENODEV;
>  	}
> -- 
> 1.8.3.1

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christian Borntraeger Oct. 14, 2013, 7:58 a.m. UTC | #2
On 13/10/13 10:39, Gleb Natapov wrote:
> On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
>> From: Jens Freimann <jfrei@linux.vnet.ibm.com>
>>
>> This patch adds a floating irq controller as a kvm_device.
>> It will be necessary for migration of floating interrupts as well
>> as for hardening the reset code by allowing user space to explicitly
>> remove all pending floating interrupts.
>>
>> Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
>> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
>>  arch/s390/include/asm/kvm_host.h                |   1 +
>>  arch/s390/include/uapi/asm/kvm.h                |   5 +
>>  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
>>  arch/s390/kvm/kvm-s390.c                        |   1 +
>>  include/linux/kvm_host.h                        |   1 +
>>  include/uapi/linux/kvm.h                        |   1 +
>>  virt/kvm/kvm_main.c                             |   5 +
>>  8 files changed, 295 insertions(+), 51 deletions(-)
>>  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
>>
>> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
>> new file mode 100644
>> index 0000000..06aef31
>> --- /dev/null
>> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
>> @@ -0,0 +1,36 @@
>> +FLIC (floating interrupt controller)
>> +====================================
>> +
>> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
>> +machine check interruptions. All interrupts are stored in a per-vm list of
>> +pending interrupts. FLIC performs operations on this list.
>> +
>> +Only one FLIC instance may be instantiated.
>> +
>> +FLIC provides support to
>> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
>> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
>> +
>> +Groups:
>> +  KVM_DEV_FLIC_ENQUEUE
>> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
>> +    are taken from this list for injection into the guest. attr contains
>> +    a struct kvm_s390_irq which contains all data relevant for
>> +    interrupt injection.
>> +    The format of the data structure kvm_s390_irq as it is copied from userspace
>> +    is defined in usr/include/linux/kvm.h.
>> +    For historic reasons list members are stored in a different data structure, i.e.
>> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
>> +    which can then be added to the list.
>> +
>> +  KVM_DEV_FLIC_DEQUEUE
>> +    Takes one element off the pending interrupts list and copies it into userspace.
>> +    Dequeued interrupts are not injected into the guest.
>> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
>> +    List elements are stored in the format of struct kvm_s390_interrupt_info
>> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
>> +    (usr/include/linux/kvm.h)
>> +
> Can interrupt be dequeued on real HW also? When this interface will be
> used?

This is used for migration. (Will send the qemu patches soon). 

The thing is that we don't have classic interrupt lines from a software perspective. We have
external interrupts, I/O interrupts, machine check interrupts, program interrupts, restart
interrupts, and supervisor call interrupts. Several interrupts are cpu local (restart, supervisor
call, program check interrupts). This is simple, because only one interrupt can be pending
at a CPU.

There are several types of external interrupts. Some are cpu local (after a sigp --> IPI)
others are floating (pending on all CPUs).

All I/O interrupts are floating. The thing is that each classic I/O interrupt has a 12-byte
chunk of per-interrupt payload. (There is an additional interrupt response block that has
to be queried by the guest with TSCH.)

Since we can have up to 256k devices per guest, we could in theory have up to 256k classic
interrupts with different payloads pending (plus machine checks, plus other floating external
interrupts).
We don't want to always dump this big queue; therefore we decided to keep these in a list.

I think Jens will give some introduction about s390 architecture at the KVM forum this year.


PS: There are some upcoming changes that will use adapter interrupts for virtio (I/O interrupts
with limited payload that do interrupt coalescing for all pending devices). Still, we need to
be able to handle the classic interrupts as well for ccw configuration handling.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Freimann Oct. 14, 2013, 8:28 a.m. UTC | #3
On Sun, Oct 13, 2013 at 11:39:55AM +0300, Gleb Natapov wrote:
> On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> > From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > 
> > This patch adds a floating irq controller as a kvm_device.
> > It will be necessary for migration of floating interrupts as well
> > as for hardening the reset code by allowing user space to explicitly
> > remove all pending floating interrupts.
> > 
> > Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > ---
> >  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> >  arch/s390/include/asm/kvm_host.h                |   1 +
> >  arch/s390/include/uapi/asm/kvm.h                |   5 +
> >  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> >  arch/s390/kvm/kvm-s390.c                        |   1 +
> >  include/linux/kvm_host.h                        |   1 +
> >  include/uapi/linux/kvm.h                        |   1 +
> >  virt/kvm/kvm_main.c                             |   5 +
> >  8 files changed, 295 insertions(+), 51 deletions(-)
> >  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> > 
> > diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> > new file mode 100644
> > index 0000000..06aef31
> > --- /dev/null
> > +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> > @@ -0,0 +1,36 @@
> > +FLIC (floating interrupt controller)
> > +====================================
> > +
> > +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> > +machine check interruptions. All interrupts are stored in a per-vm list of
> > +pending interrupts. FLIC performs operations on this list.
> > +
> > +Only one FLIC instance may be instantiated.
> > +
> > +FLIC provides support to
> > +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> > +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> > +
> > +Groups:
> > +  KVM_DEV_FLIC_ENQUEUE
> > +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> > +    are taken from this list for injection into the guest. attr contains
> > +    a struct kvm_s390_irq which contains all data relevant for
> > +    interrupt injection.
> > +    The format of the data structure kvm_s390_irq as it is copied from userspace
> > +    is defined in usr/include/linux/kvm.h.
> > +    For historic reasons list members are stored in a different data structure, i.e.
> > +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> > +    which can then be added to the list.
> > +
> > +  KVM_DEV_FLIC_DEQUEUE
> > +    Takes one element off the pending interrupts list and copies it into userspace.
> > +    Dequeued interrupts are not injected into the guest.
> > +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> > +    List elements are stored in the format of struct kvm_s390_interrupt_info
> > +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> > +    (usr/include/linux/kvm.h)
> > +
> Can interrupt be dequeued on real HW also? When this interface will be
> used?

We will use it for migration. (See Christian's mail.)
 
> > +  KVM_DEV_FLIC_CLEAR_IRQS
> > +    Simply deletes all elements from the list of currently pending floating interrupts.
> > +    No interrupts are injected into the guest.
> > diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> > index 78b6918..2d09c1d 100644
> > --- a/arch/s390/include/asm/kvm_host.h
> > +++ b/arch/s390/include/asm/kvm_host.h
> > @@ -237,6 +237,7 @@ struct kvm_arch{
> >  	struct sca_block *sca;
> >  	debug_info_t *dbf;
> >  	struct kvm_s390_float_interrupt float_int;
> > +	struct kvm_device *flic;
> >  	struct gmap *gmap;
> >  	int css_support;
> >  };
> > diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
> > index d25da59..33d52b8 100644
> > --- a/arch/s390/include/uapi/asm/kvm.h
> > +++ b/arch/s390/include/uapi/asm/kvm.h
> > @@ -16,6 +16,11 @@
> >  
> >  #define __KVM_S390
> >  
> > +/* Device control API: s390-specific devices */
> > +#define KVM_DEV_FLIC_DEQUEUE 1
> > +#define KVM_DEV_FLIC_ENQUEUE 2
> > +#define KVM_DEV_FLIC_CLEAR_IRQS 3
> > +
> >  /* for KVM_GET_REGS and KVM_SET_REGS */
> >  struct kvm_regs {
> >  	/* general purpose regs for s390 */
> > diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> > index e7323cd..66478a0 100644
> > --- a/arch/s390/kvm/interrupt.c
> > +++ b/arch/s390/kvm/interrupt.c
> > @@ -659,53 +659,85 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
> >  	return inti;
> >  }
> >  
> > -int kvm_s390_inject_vm(struct kvm *kvm,
> > -		       struct kvm_s390_interrupt *s390int)
> > +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
> >  {
> >  	struct kvm_s390_local_interrupt *li;
> >  	struct kvm_s390_float_interrupt *fi;
> > -	struct kvm_s390_interrupt_info *inti, *iter;
> > +	struct kvm_s390_interrupt_info *iter;
> >  	int sigcpu;
> >  
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> > +	spin_lock(&fi->lock);
> > +	if (!is_ioint(inti->type)) {
> > +		list_add_tail(&inti->list, &fi->list);
> > +	} else {
> > +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> > +
> > +		/* Keep I/O interrupts sorted in isc order. */
> > +		list_for_each_entry(iter, &fi->list, list) {
> > +			if (!is_ioint(iter->type))
> > +				continue;
> > +			if (int_word_to_isc_bits(iter->io.io_int_word) <= isc_bits)
> > +				continue;
> > +			break;
> > +		}
> > +		list_add_tail(&inti->list, &iter->list);
> > +	}
> > +	atomic_set(&fi->active, 1);
> > +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> > +	if (sigcpu == KVM_MAX_VCPUS) {
> > +		do {
> > +			sigcpu = fi->next_rr_cpu++;
> > +			if (sigcpu == KVM_MAX_VCPUS)
> > +				sigcpu = fi->next_rr_cpu = 0;
> > +		} while (fi->local_int[sigcpu] == NULL);
> > +	}
> > +	li = fi->local_int[sigcpu];
> > +	spin_lock_bh(&li->lock);
> > +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> > +	if (waitqueue_active(li->wq))
> > +		wake_up_interruptible(li->wq);
> > +	spin_unlock_bh(&li->lock);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +}
> > +
> > +int kvm_s390_inject_vm(struct kvm *kvm,
> > +		       struct kvm_s390_interrupt *s390int)
> > +{
> > +	struct kvm_s390_interrupt_info *inti;
> > +
> >  	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> >  	if (!inti)
> >  		return -ENOMEM;
> >  
> > -	switch (s390int->type) {
> > +	inti->type = s390int->type;
> > +	switch (inti->type) {
> >  	case KVM_S390_INT_VIRTIO:
> >  		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
> >  			 s390int->parm, s390int->parm64);
> > -		inti->type = s390int->type;
> >  		inti->ext.ext_params = s390int->parm;
> >  		inti->ext.ext_params2 = s390int->parm64;
> >  		break;
> >  	case KVM_S390_INT_SERVICE:
> >  		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> > -		inti->type = s390int->type;
> >  		inti->ext.ext_params = s390int->parm;
> >  		break;
> > -	case KVM_S390_PROGRAM_INT:
> > -	case KVM_S390_SIGP_STOP:
> > -	case KVM_S390_INT_EXTERNAL_CALL:
> > -	case KVM_S390_INT_EMERGENCY:
> > -		kfree(inti);
> > -		return -EINVAL;
> >  	case KVM_S390_MCHK:
> >  		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
> >  			 s390int->parm64);
> > -		inti->type = s390int->type;
> >  		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
> >  		inti->mchk.mcic = s390int->parm64;
> >  		break;
> >  	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > -		if (s390int->type & IOINT_AI_MASK)
> > +		if (inti->type & IOINT_AI_MASK)
> >  			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
> >  		else
> >  			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
> >  				 s390int->type & IOINT_CSSID_MASK,
> >  				 s390int->type & IOINT_SSID_MASK,
> >  				 s390int->type & IOINT_SCHID_MASK);
> > -		inti->type = s390int->type;
> >  		inti->io.subchannel_id = s390int->parm >> 16;
> >  		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
> >  		inti->io.io_int_parm = s390int->parm64 >> 32;
> > @@ -718,42 +750,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
> >  	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
> >  				 2);
> >  
> > -	mutex_lock(&kvm->lock);
> > -	fi = &kvm->arch.float_int;
> > -	spin_lock(&fi->lock);
> > -	if (!is_ioint(inti->type))
> > -		list_add_tail(&inti->list, &fi->list);
> > -	else {
> > -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> > -
> > -		/* Keep I/O interrupts sorted in isc order. */
> > -		list_for_each_entry(iter, &fi->list, list) {
> > -			if (!is_ioint(iter->type))
> > -				continue;
> > -			if (int_word_to_isc_bits(iter->io.io_int_word)
> > -			    <= isc_bits)
> > -				continue;
> > -			break;
> > -		}
> > -		list_add_tail(&inti->list, &iter->list);
> > -	}
> > -	atomic_set(&fi->active, 1);
> > -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> > -	if (sigcpu == KVM_MAX_VCPUS) {
> > -		do {
> > -			sigcpu = fi->next_rr_cpu++;
> > -			if (sigcpu == KVM_MAX_VCPUS)
> > -				sigcpu = fi->next_rr_cpu = 0;
> > -		} while (fi->local_int[sigcpu] == NULL);
> > -	}
> > -	li = fi->local_int[sigcpu];
> > -	spin_lock_bh(&li->lock);
> > -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> > -	if (waitqueue_active(li->wq))
> > -		wake_up_interruptible(li->wq);
> > -	spin_unlock_bh(&li->lock);
> > -	spin_unlock(&fi->lock);
> > -	mutex_unlock(&kvm->lock);
> > +	__inject_vm(kvm, inti);
> >  	return 0;
> >  }
> >  
> > @@ -841,3 +838,200 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
> >  	mutex_unlock(&vcpu->kvm->lock);
> >  	return 0;
> >  }
> > +
> > +static void clear_floating_interrupts(struct kvm *kvm)
> > +{
> > +	struct kvm_s390_float_interrupt *fi;
> > +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> > +	spin_lock(&fi->lock);
> > +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> > +		list_del(&inti->list);
> > +		kfree(inti);
> > +	}
> > +	atomic_set(&fi->active, 0);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +}
> > +
> > +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> > +				   u64 addr)
> > +{
> > +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> > +	void __user *target;
> > +	void *source;
> > +	u64 size;
> > +	int r = 0;
> > +
> > +	switch (inti->type) {
> > +	case KVM_S390_INT_VIRTIO:
> > +	case KVM_S390_INT_SERVICE:
> > +		source = &inti->ext;
> > +		target = &uptr->u.ext;
> > +		size = sizeof(inti->ext);
> > +		break;
> > +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > +		source = &inti->io;
> > +		target = &uptr->u.io;
> > +		size = sizeof(inti->io);
> > +		break;
> > +	case KVM_S390_MCHK:
> > +		source = &inti->mchk;
> > +		target = &uptr->u.mchk;
> > +		size = sizeof(inti->mchk);
> > +		break;
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +
> > +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> > +	if (copy_to_user(target, source, size))
> > +		r = -EFAULT;
> > +
> > +	return r;
> > +}
> > +
> > +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
> > +{
> > +	struct kvm_s390_interrupt_info *inti;
> > +	struct kvm_s390_float_interrupt *fi;
> > +	int r = 0;
> > +
> > +
> > +	mutex_lock(&kvm->lock);
> > +	fi = &kvm->arch.float_int;
> > +	spin_lock(&fi->lock);
> > +	if (list_empty(&fi->list)) {
> > +		mutex_unlock(&kvm->lock);
> > +		spin_unlock(&fi->lock);
> > +		return -ENODATA;
> > +	}
> > +	inti = list_first_entry(&fi->list, struct kvm_s390_interrupt_info, list);
> > +	list_del(&inti->list);
> > +	spin_unlock(&fi->lock);
> > +	mutex_unlock(&kvm->lock);
> > +
> > +	r = copy_irq_to_user(inti, addr);
> > +
> > +	kfree(inti);
> > +	return r;
> > +}
> > +
> > +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> > +{
> > +	int r;
> > +
> > +	switch (attr->group) {
> > +	case KVM_DEV_FLIC_DEQUEUE:
> > +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> > +		break;
> > +	default:
> > +		r = -EINVAL;
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> > +				     u64 addr)
> > +{
> > +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> > +	void *target = NULL;
> > +	void __user *source;
> > +	u64 size;
> > +	int r = 0;
> > +
> > +	if (get_user(inti->type, (u64 __user *)addr))
> > +		return -EFAULT;
> > +	switch (inti->type) {
> > +	case KVM_S390_INT_VIRTIO:
> > +	case KVM_S390_INT_SERVICE:
> > +		target = (void *) &inti->ext;
> > +		source = &uptr->u.ext;
> > +		size = sizeof(inti->ext);
> > +		break;
> > +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> > +		target = (void *) &inti->io;
> > +		source = &uptr->u.io;
> > +		size = sizeof(inti->io);
> > +		break;
> > +	case KVM_S390_MCHK:
> > +		target = (void *) &inti->mchk;
> > +		source = &uptr->u.mchk;
> > +		size = sizeof(inti->mchk);
> > +		break;
> > +	default:
> > +		r = -EINVAL;
> > +		return r;
> > +	}
> > +
> > +	if (copy_from_user(target, source, size))
> > +		r = -EFAULT;
> > +
> > +	return r;
> > +}
> > +
> > +static int enqueue_floating_irq(struct kvm_device *dev,
> > +				 struct kvm_device_attr *attr)
> > +{
> > +	struct kvm_s390_interrupt_info *inti = NULL;
> > +	int r = 0;
> > +
> > +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> > +	if (!inti)
> > +		return -ENOMEM;
> > +
> > +	r = copy_irq_from_user(inti, attr->addr);
> > +	if (r) {
> > +		kfree(inti);
> > +		return r;
> > +	}
> > +	__inject_vm(dev->kvm, inti);
> > +
> > +	return r;
> > +}
> > +
> > +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> > +{
> > +	int r = 0;
> > +
> > +	switch (attr->group) {
> > +	case KVM_DEV_FLIC_ENQUEUE:
> > +		r = enqueue_floating_irq(dev, attr);
> > +		break;
> > +	case KVM_DEV_FLIC_CLEAR_IRQS:
> > +		r = 0;
> > +		clear_floating_interrupts(dev->kvm);
> > +		break;
> > +	default:
> > +		r = -EINVAL;
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int flic_create(struct kvm_device *dev, u32 type)
> > +{
> > +	if (!dev)
> > +		return -EINVAL;
> > +	if (dev->kvm->arch.flic)
> > +		return -EINVAL;
> > +	dev->kvm->arch.flic = dev;
> > +	return 0;
> > +}
> > +
> > +static void flic_destroy(struct kvm_device *dev)
> > +{
> > +	dev->kvm->arch.flic = NULL;
> You need to call kfree(dev) here. There is a patch that moves this free
> to a common code, but it is not yet in.

Ok, I wasn't aware of this. Will fix.

regards
Jens
 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Oct. 14, 2013, 9:07 a.m. UTC | #4
On Mon, Oct 14, 2013 at 10:28:57AM +0200, Jens Freimann wrote:
> On Sun, Oct 13, 2013 at 11:39:55AM +0300, Gleb Natapov wrote:
> > On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> > > From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > 
> > > This patch adds a floating irq controller as a kvm_device.
> > > It will be necessary for migration of floating interrupts as well
> > > as for hardening the reset code by allowing user space to explicitly
> > > remove all pending floating interrupts.
> > > 
> > > Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> > > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > > ---
> > >  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> > >  arch/s390/include/asm/kvm_host.h                |   1 +
> > >  arch/s390/include/uapi/asm/kvm.h                |   5 +
> > >  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> > >  arch/s390/kvm/kvm-s390.c                        |   1 +
> > >  include/linux/kvm_host.h                        |   1 +
> > >  include/uapi/linux/kvm.h                        |   1 +
> > >  virt/kvm/kvm_main.c                             |   5 +
> > >  8 files changed, 295 insertions(+), 51 deletions(-)
> > >  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> > > 
> > > diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > new file mode 100644
> > > index 0000000..06aef31
> > > --- /dev/null
> > > +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > @@ -0,0 +1,36 @@
> > > +FLIC (floating interrupt controller)
> > > +====================================
> > > +
> > > +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> > > +machine check interruptions. All interrupts are stored in a per-vm list of
> > > +pending interrupts. FLIC performs operations on this list.
> > > +
> > > +Only one FLIC instance may be instantiated.
> > > +
> > > +FLIC provides support to
> > > +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> > > +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> > > +
> > > +Groups:
> > > +  KVM_DEV_FLIC_ENQUEUE
> > > +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> > > +    are taken from this list for injection into the guest. attr contains
> > > +    a struct kvm_s390_irq which contains all data relevant for
> > > +    interrupt injection.
> > > +    The format of the data structure kvm_s390_irq as it is copied from userspace
> > > +    is defined in usr/include/linux/kvm.h.
> > > +    For historic reasons list members are stored in a different data structure, i.e.
> > > +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> > > +    which can then be added to the list.
> > > +
> > > +  KVM_DEV_FLIC_DEQUEUE
> > > +    Takes one element off the pending interrupts list and copies it into userspace.
> > > +    Dequeued interrupts are not injected into the guest.
> > > +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> > > +    List elements are stored in the format of struct kvm_s390_interrupt_info
> > > +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> > > +    (usr/include/linux/kvm.h)
> > > +
> > Can interrupt be dequeued on real HW also? When this interface will be
> > used?
> 
> We will it for migration. (See Christians mail)  
>  
For migration you do not need dequeue semantics, though. Dequeuing
does not hurt in the case of migration, but what if we want to add a
QEMU monitor command that inspects the interrupt queue (like we have for
inspecting the processor's register state)? The destructive nature of the
command will prevent us from doing so. We need a non-destructive way to
inspect the state, no?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Oct. 14, 2013, 10:21 a.m. UTC | #5
On Mon, Oct 14, 2013 at 09:58:47AM +0200, Christian Borntraeger wrote:
> On 13/10/13 10:39, Gleb Natapov wrote:
> > On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> >> From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> >>
> >> This patch adds a floating irq controller as a kvm_device.
> >> It will be necessary for migration of floating interrupts as well
> >> as for hardening the reset code by allowing user space to explicitly
> >> remove all pending floating interrupts.
> >>
> >> Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> >> Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> >> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >> ---
> >>  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> >>  arch/s390/include/asm/kvm_host.h                |   1 +
> >>  arch/s390/include/uapi/asm/kvm.h                |   5 +
> >>  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> >>  arch/s390/kvm/kvm-s390.c                        |   1 +
> >>  include/linux/kvm_host.h                        |   1 +
> >>  include/uapi/linux/kvm.h                        |   1 +
> >>  virt/kvm/kvm_main.c                             |   5 +
> >>  8 files changed, 295 insertions(+), 51 deletions(-)
> >>  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> >>
> >> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> >> new file mode 100644
> >> index 0000000..06aef31
> >> --- /dev/null
> >> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> >> @@ -0,0 +1,36 @@
> >> +FLIC (floating interrupt controller)
> >> +====================================
> >> +
> >> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> >> +machine check interruptions. All interrupts are stored in a per-vm list of
> >> +pending interrupts. FLIC performs operations on this list.
> >> +
> >> +Only one FLIC instance may be instantiated.
> >> +
> >> +FLIC provides support to
> >> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> >> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> >> +
> >> +Groups:
> >> +  KVM_DEV_FLIC_ENQUEUE
> >> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> >> +    are taken from this list for injection into the guest. attr contains
> >> +    a struct kvm_s390_irq which contains all data relevant for
> >> +    interrupt injection.
> >> +    The format of the data structure kvm_s390_irq as it is copied from userspace
> >> +    is defined in usr/include/linux/kvm.h.
> >> +    For historic reasons list members are stored in a different data structure, i.e.
> >> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> >> +    which can then be added to the list.
> >> +
> >> +  KVM_DEV_FLIC_DEQUEUE
> >> +    Takes one element off the pending interrupts list and copies it into userspace.
> >> +    Dequeued interrupts are not injected into the guest.
> >> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> >> +    List elements are stored in the format of struct kvm_s390_interrupt_info
> >> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> >> +    (usr/include/linux/kvm.h)
> >> +
> > Can interrupt be dequeued on real HW also? When this interface will be
> > used?
> 
> This is used for migration. (Will send the qemu patches soon). 
> 
> The thing is,that we dont have classic interrupt lines from a software perspective. We have
> external interrupts, I/O interrupts, machine check interrupts, program interrupts, restart
> interrupts, supervisor call interrupts. Several interrupts are cpu local (restart, supervisor
> call, program check interrupts). This is simple, because only one interrupt can be pending
> at a CPU.
> 
> There are several types of external interrupts. Some are cpu local (after a sigp --> IPI)
> others are floating (pending on all CPUs).
> 
> All I/O interrupts are floating. The thing is now, that each classic I/O interrupts has a 12
> byte chunk of per interrupt payload. (There is an additional interrupt response block that has
> to be queried by the guest with TSCH). 
> 
> Since we can have up to 256k devices per guest, we could in theory have up to 256k classic 
> interrupts with different payload pending. (plus machine checks, plus other floating external
> interupts)
> We dont want to always dump this big queue, therefore we decided to keep these in a list.
> 
But you need to limit the queue anyway; otherwise userspace can allocate
quite a bit of kernel memory by filling in the queue, no? It is strange
to have a destructive interface here because it makes queue inspection
impossible (at least without stopping the guest, dequeuing everything and
queuing it back again). What about an interface where userspace provides
an array to store queue elements, and if the array is not big enough the
appropriate array size is returned, so userspace can retry with a bigger one?
Using a list internally is OK as long as its length is limited somehow.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Freimann Oct. 14, 2013, 11:13 a.m. UTC | #6
On Mon, Oct 14, 2013 at 12:07:24PM +0300, Gleb Natapov wrote:
> On Mon, Oct 14, 2013 at 10:28:57AM +0200, Jens Freimann wrote:
> > On Sun, Oct 13, 2013 at 11:39:55AM +0300, Gleb Natapov wrote:
> > > On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> > > > From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > 
> > > > This patch adds a floating irq controller as a kvm_device.
> > > > It will be necessary for migration of floating interrupts as well
> > > > as for hardening the reset code by allowing user space to explicitly
> > > > remove all pending floating interrupts.
> > > > 
> > > > Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> > > > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > > > ---
> > > >  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> > > >  arch/s390/include/asm/kvm_host.h                |   1 +
> > > >  arch/s390/include/uapi/asm/kvm.h                |   5 +
> > > >  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> > > >  arch/s390/kvm/kvm-s390.c                        |   1 +
> > > >  include/linux/kvm_host.h                        |   1 +
> > > >  include/uapi/linux/kvm.h                        |   1 +
> > > >  virt/kvm/kvm_main.c                             |   5 +
> > > >  8 files changed, 295 insertions(+), 51 deletions(-)
> > > >  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> > > > 
> > > > diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > new file mode 100644
> > > > index 0000000..06aef31
> > > > --- /dev/null
> > > > +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > @@ -0,0 +1,36 @@
> > > > +FLIC (floating interrupt controller)
> > > > +====================================
> > > > +
> > > > +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> > > > +machine check interruptions. All interrupts are stored in a per-vm list of
> > > > +pending interrupts. FLIC performs operations on this list.
> > > > +
> > > > +Only one FLIC instance may be instantiated.
> > > > +
> > > > +FLIC provides support to
> > > > +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> > > > +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> > > > +
> > > > +Groups:
> > > > +  KVM_DEV_FLIC_ENQUEUE
> > > > +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> > > > +    are taken from this list for injection into the guest. attr contains
> > > > +    a struct kvm_s390_irq which contains all data relevant for
> > > > +    interrupt injection.
> > > > +    The format of the data structure kvm_s390_irq as it is copied from userspace
> > > > +    is defined in usr/include/linux/kvm.h.
> > > > +    For historic reasons list members are stored in a different data structure, i.e.
> > > > +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> > > > +    which can then be added to the list.
> > > > +
> > > > +  KVM_DEV_FLIC_DEQUEUE
> > > > +    Takes one element off the pending interrupts list and copies it into userspace.
> > > > +    Dequeued interrupts are not injected into the guest.
> > > > +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> > > > +    List elements are stored in the format of struct kvm_s390_interrupt_info
> > > > +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> > > > +    (usr/include/linux/kvm.h)
> > > > +
> > > Can interrupt be dequeued on real HW also? When this interface will be
> > > used?
> > 
> > We will it for migration. (See Christians mail)  
> >  
> For migration you do not need dequeue semantics though, dequeuing
> does not hurt in case of migration, but what if we will want to add
> QEMU monitor command that inspects interrupt queue (like we have for
> inspecting processor's register state). The destructive nature of the
> command will prevent us from doing so. We need non destructive way to
> inspect the state, no?

Inspection is a requirement that we didn't have in mind when we designed 
this. But yes, it should be non-destructive in that case.

Christian and I agree that we should defer these patches for now. It would
be good if we could discuss this interface next week at the KVM Forum.


regards
Jens

> 
> --
> 			Gleb.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Oct. 14, 2013, 11:31 a.m. UTC | #7
On Mon, Oct 14, 2013 at 01:13:30PM +0200, Jens Freimann wrote:
> On Mon, Oct 14, 2013 at 12:07:24PM +0300, Gleb Natapov wrote:
> > On Mon, Oct 14, 2013 at 10:28:57AM +0200, Jens Freimann wrote:
> > > On Sun, Oct 13, 2013 at 11:39:55AM +0300, Gleb Natapov wrote:
> > > > On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> > > > > From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > > 
> > > > > This patch adds a floating irq controller as a kvm_device.
> > > > > It will be necessary for migration of floating interrupts as well
> > > > > as for hardening the reset code by allowing user space to explicitly
> > > > > remove all pending floating interrupts.
> > > > > 
> > > > > Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > > Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> > > > > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > > > > ---
> > > > >  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> > > > >  arch/s390/include/asm/kvm_host.h                |   1 +
> > > > >  arch/s390/include/uapi/asm/kvm.h                |   5 +
> > > > >  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> > > > >  arch/s390/kvm/kvm-s390.c                        |   1 +
> > > > >  include/linux/kvm_host.h                        |   1 +
> > > > >  include/uapi/linux/kvm.h                        |   1 +
> > > > >  virt/kvm/kvm_main.c                             |   5 +
> > > > >  8 files changed, 295 insertions(+), 51 deletions(-)
> > > > >  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > 
> > > > > diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > new file mode 100644
> > > > > index 0000000..06aef31
> > > > > --- /dev/null
> > > > > +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > @@ -0,0 +1,36 @@
> > > > > +FLIC (floating interrupt controller)
> > > > > +====================================
> > > > > +
> > > > > +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> > > > > +machine check interruptions. All interrupts are stored in a per-vm list of
> > > > > +pending interrupts. FLIC performs operations on this list.
> > > > > +
> > > > > +Only one FLIC instance may be instantiated.
> > > > > +
> > > > > +FLIC provides support to
> > > > > +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> > > > > +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> > > > > +
> > > > > +Groups:
> > > > > +  KVM_DEV_FLIC_ENQUEUE
> > > > > +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> > > > > +    are taken from this list for injection into the guest. attr contains
> > > > > +    a struct kvm_s390_irq which contains all data relevant for
> > > > > +    interrupt injection.
> > > > > +    The format of the data structure kvm_s390_irq as it is copied from userspace
> > > > > +    is defined in usr/include/linux/kvm.h.
> > > > > +    For historic reasons list members are stored in a different data structure, i.e.
> > > > > +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> > > > > +    which can then be added to the list.
> > > > > +
> > > > > +  KVM_DEV_FLIC_DEQUEUE
> > > > > +    Takes one element off the pending interrupts list and copies it into userspace.
> > > > > +    Dequeued interrupts are not injected into the guest.
> > > > > +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> > > > > +    List elements are stored in the format of struct kvm_s390_interrupt_info
> > > > > +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> > > > > +    (usr/include/linux/kvm.h)
> > > > > +
> > > > Can interrupt be dequeued on real HW also? When this interface will be
> > > > used?
> > > 
> > > We will it for migration. (See Christians mail)  
> > >  
> > For migration you do not need dequeue semantics though, dequeuing
> > does not hurt in case of migration, but what if we will want to add
> > QEMU monitor command that inspects interrupt queue (like we have for
> > inspecting processor's register state). The destructive nature of the
> > command will prevent us from doing so. We need non destructive way to
> > inspect the state, no?
> 
> Inspection is a requirement that we didn't have in mind when we designed 
> this. But yes, it should be non-destructive in that case.
> 
> Christian and I agree that we should defer these patches for now. It would
> be good if we could discuss this interface next week at the KVM Forum.
> 
Of course.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Oct. 14, 2013, 1:35 p.m. UTC | #8
On Mon, Oct 14, 2013 at 01:13:30PM +0200, Jens Freimann wrote:
> On Mon, Oct 14, 2013 at 12:07:24PM +0300, Gleb Natapov wrote:
> > On Mon, Oct 14, 2013 at 10:28:57AM +0200, Jens Freimann wrote:
> > > On Sun, Oct 13, 2013 at 11:39:55AM +0300, Gleb Natapov wrote:
> > > > On Tue, Oct 08, 2013 at 04:54:55PM +0200, Christian Borntraeger wrote:
> > > > > From: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > > 
> > > > > This patch adds a floating irq controller as a kvm_device.
> > > > > It will be necessary for migration of floating interrupts as well
> > > > > as for hardening the reset code by allowing user space to explicitly
> > > > > remove all pending floating interrupts.
> > > > > 
> > > > > Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
> > > > > Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
> > > > > Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > > > > ---
> > > > >  Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> > > > >  arch/s390/include/asm/kvm_host.h                |   1 +
> > > > >  arch/s390/include/uapi/asm/kvm.h                |   5 +
> > > > >  arch/s390/kvm/interrupt.c                       | 296 ++++++++++++++++++++----
> > > > >  arch/s390/kvm/kvm-s390.c                        |   1 +
> > > > >  include/linux/kvm_host.h                        |   1 +
> > > > >  include/uapi/linux/kvm.h                        |   1 +
> > > > >  virt/kvm/kvm_main.c                             |   5 +
> > > > >  8 files changed, 295 insertions(+), 51 deletions(-)
> > > > >  create mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > 
> > > > > diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > new file mode 100644
> > > > > index 0000000..06aef31
> > > > > --- /dev/null
> > > > > +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> > > > > @@ -0,0 +1,36 @@
> > > > > +FLIC (floating interrupt controller)
> > > > > +====================================
> > > > > +
> > > > > +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> > > > > +machine check interruptions. All interrupts are stored in a per-vm list of
> > > > > +pending interrupts. FLIC performs operations on this list.
> > > > > +
> > > > > +Only one FLIC instance may be instantiated.
> > > > > +
> > > > > +FLIC provides support to
> > > > > +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> > > > > +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> > > > > +
> > > > > +Groups:
> > > > > +  KVM_DEV_FLIC_ENQUEUE
> > > > > +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> > > > > +    are taken from this list for injection into the guest. attr contains
> > > > > +    a struct kvm_s390_irq which contains all data relevant for
> > > > > +    interrupt injection.
> > > > > +    The format of the data structure kvm_s390_irq as it is copied from userspace
> > > > > +    is defined in usr/include/linux/kvm.h.
> > > > > +    For historic reasons list members are stored in a different data structure, i.e.
> > > > > +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> > > > > +    which can then be added to the list.
> > > > > +
> > > > > +  KVM_DEV_FLIC_DEQUEUE
> > > > > +    Takes one element off the pending interrupts list and copies it into userspace.
> > > > > +    Dequeued interrupts are not injected into the guest.
> > > > > +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> > > > > +    List elements are stored in the format of struct kvm_s390_interrupt_info
> > > > > +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> > > > > +    (usr/include/linux/kvm.h)
> > > > > +
> > > > Can interrupt be dequeued on real HW also? When this interface will be
> > > > used?
> > > 
> > > We will it for migration. (See Christians mail)  
> > >  
> > For migration you do not need dequeue semantics though, dequeuing
> > does not hurt in case of migration, but what if we will want to add
> > QEMU monitor command that inspects interrupt queue (like we have for
> > inspecting processor's register state). The destructive nature of the
> > command will prevent us from doing so. We need non destructive way to
> > inspect the state, no?
> 
> Inspection is a requirement that we didn't have in mind when we designed 
> this. But yes, it should be non-destructive in that case.
> 
BTW, a non-destructive interface is better for migration too. What if
migration fails and the source needs to be restarted?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
new file mode 100644
index 0000000..06aef31
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -0,0 +1,36 @@ 
+FLIC (floating interrupt controller)
+====================================
+
+FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
+machine check interruptions. All interrupts are stored in a per-vm list of
+pending interrupts. FLIC performs operations on this list.
+
+Only one FLIC instance may be instantiated.
+
+FLIC provides support to
+- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
+- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
+
+Groups:
+  KVM_DEV_FLIC_ENQUEUE
+    Adds one interrupt to the list of pending floating interrupts. Interrupts
+    are taken from this list for injection into the guest. attr contains
+    a struct kvm_s390_irq which contains all data relevant for
+    interrupt injection.
+    The format of the data structure kvm_s390_irq as it is copied from userspace
+    is defined in usr/include/linux/kvm.h.
+    For historic reasons list members are stored in a different data structure, i.e.
+    we need to copy the relevant data into a struct kvm_s390_interrupt_info
+    which can then be added to the list.
+
+  KVM_DEV_FLIC_DEQUEUE
+    Takes one element off the pending interrupts list and copies it into userspace.
+    Dequeued interrupts are not injected into the guest.
+    attr->addr contains the userspace address of a struct kvm_s390_irq.
+    List elements are stored in the format of struct kvm_s390_interrupt_info
+    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
+    (usr/include/linux/kvm.h)
+
+  KVM_DEV_FLIC_CLEAR_IRQS
+    Simply deletes all elements from the list of currently pending floating interrupts.
+    No interrupts are injected into the guest.
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 78b6918..2d09c1d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -237,6 +237,7 @@  struct kvm_arch{
 	struct sca_block *sca;
 	debug_info_t *dbf;
 	struct kvm_s390_float_interrupt float_int;
+	struct kvm_device *flic;
 	struct gmap *gmap;
 	int css_support;
 };
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index d25da59..33d52b8 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -16,6 +16,11 @@ 
 
 #define __KVM_S390
 
+/* Device control API: s390-specific devices */
+#define KVM_DEV_FLIC_DEQUEUE 1
+#define KVM_DEV_FLIC_ENQUEUE 2
+#define KVM_DEV_FLIC_CLEAR_IRQS 3
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* general purpose regs for s390 */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index e7323cd..66478a0 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -659,53 +659,85 @@  struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 	return inti;
 }
 
-int kvm_s390_inject_vm(struct kvm *kvm,
-		       struct kvm_s390_interrupt *s390int)
+static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
 	struct kvm_s390_local_interrupt *li;
 	struct kvm_s390_float_interrupt *fi;
-	struct kvm_s390_interrupt_info *inti, *iter;
+	struct kvm_s390_interrupt_info *iter;
 	int sigcpu;
 
+	mutex_lock(&kvm->lock);
+	fi = &kvm->arch.float_int;
+	spin_lock(&fi->lock);
+	if (!is_ioint(inti->type)) {
+		list_add_tail(&inti->list, &fi->list);
+	} else {
+		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+		/* Keep I/O interrupts sorted in isc order. */
+		list_for_each_entry(iter, &fi->list, list) {
+			if (!is_ioint(iter->type))
+				continue;
+			if (int_word_to_isc_bits(iter->io.io_int_word) <= isc_bits)
+				continue;
+			break;
+		}
+		list_add_tail(&inti->list, &iter->list);
+	}
+	atomic_set(&fi->active, 1);
+	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+	if (sigcpu == KVM_MAX_VCPUS) {
+		do {
+			sigcpu = fi->next_rr_cpu++;
+			if (sigcpu == KVM_MAX_VCPUS)
+				sigcpu = fi->next_rr_cpu = 0;
+		} while (fi->local_int[sigcpu] == NULL);
+	}
+	li = fi->local_int[sigcpu];
+	spin_lock_bh(&li->lock);
+	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
+	spin_unlock_bh(&li->lock);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+}
+
+int kvm_s390_inject_vm(struct kvm *kvm,
+		       struct kvm_s390_interrupt *s390int)
+{
+	struct kvm_s390_interrupt_info *inti;
+
 	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
 	if (!inti)
 		return -ENOMEM;
 
-	switch (s390int->type) {
+	inti->type = s390int->type;
+	switch (inti->type) {
 	case KVM_S390_INT_VIRTIO:
 		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
 			 s390int->parm, s390int->parm64);
-		inti->type = s390int->type;
 		inti->ext.ext_params = s390int->parm;
 		inti->ext.ext_params2 = s390int->parm64;
 		break;
 	case KVM_S390_INT_SERVICE:
 		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
-		inti->type = s390int->type;
 		inti->ext.ext_params = s390int->parm;
 		break;
-	case KVM_S390_PROGRAM_INT:
-	case KVM_S390_SIGP_STOP:
-	case KVM_S390_INT_EXTERNAL_CALL:
-	case KVM_S390_INT_EMERGENCY:
-		kfree(inti);
-		return -EINVAL;
 	case KVM_S390_MCHK:
 		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
 			 s390int->parm64);
-		inti->type = s390int->type;
 		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
 		inti->mchk.mcic = s390int->parm64;
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		if (s390int->type & IOINT_AI_MASK)
+		if (inti->type & IOINT_AI_MASK)
 			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
 		else
 			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
 				 s390int->type & IOINT_CSSID_MASK,
 				 s390int->type & IOINT_SSID_MASK,
 				 s390int->type & IOINT_SCHID_MASK);
-		inti->type = s390int->type;
 		inti->io.subchannel_id = s390int->parm >> 16;
 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
 		inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -718,42 +750,7 @@  int kvm_s390_inject_vm(struct kvm *kvm,
 	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
 				 2);
 
-	mutex_lock(&kvm->lock);
-	fi = &kvm->arch.float_int;
-	spin_lock(&fi->lock);
-	if (!is_ioint(inti->type))
-		list_add_tail(&inti->list, &fi->list);
-	else {
-		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
-
-		/* Keep I/O interrupts sorted in isc order. */
-		list_for_each_entry(iter, &fi->list, list) {
-			if (!is_ioint(iter->type))
-				continue;
-			if (int_word_to_isc_bits(iter->io.io_int_word)
-			    <= isc_bits)
-				continue;
-			break;
-		}
-		list_add_tail(&inti->list, &iter->list);
-	}
-	atomic_set(&fi->active, 1);
-	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
-	if (sigcpu == KVM_MAX_VCPUS) {
-		do {
-			sigcpu = fi->next_rr_cpu++;
-			if (sigcpu == KVM_MAX_VCPUS)
-				sigcpu = fi->next_rr_cpu = 0;
-		} while (fi->local_int[sigcpu] == NULL);
-	}
-	li = fi->local_int[sigcpu];
-	spin_lock_bh(&li->lock);
-	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(li->wq))
-		wake_up_interruptible(li->wq);
-	spin_unlock_bh(&li->lock);
-	spin_unlock(&fi->lock);
-	mutex_unlock(&kvm->lock);
+	__inject_vm(kvm, inti);
 	return 0;
 }
 
@@ -841,3 +838,200 @@  int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 	mutex_unlock(&vcpu->kvm->lock);
 	return 0;
 }
+
+/*
+ * Drop every pending floating interrupt of @kvm without delivering it.
+ *
+ * Each entry is unlinked from the floating interrupt list and freed, then
+ * the "active" flag is reset; all of this happens with kvm->lock and
+ * fi->lock held.
+ */
+static void clear_floating_interrupts(struct kvm *kvm)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_interrupt_info *cur, *next;
+
+	mutex_lock(&kvm->lock);
+	spin_lock(&fi->lock);
+	/* _safe variant: the current entry is freed inside the loop body. */
+	list_for_each_entry_safe(cur, next, &fi->list, list) {
+		list_del(&cur->list);
+		kfree(cur);
+	}
+	atomic_set(&fi->active, 0);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+}
+
+/*
+ * Copy one interrupt to the struct kvm_s390_irq at user address @addr.
+ *
+ * The interrupt type is stored first, then the type specific payload
+ * (ext, io or mchk) is copied into the matching member of the user union.
+ *
+ * Returns 0 on success, -EINVAL for an interrupt type not handled here,
+ * -EFAULT if the payload copy fails, or otherwise the put_user() result
+ * of the type store.
+ */
+static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
+				   u64 addr)
+{
+	struct kvm_s390_irq __user *uirq = (struct kvm_s390_irq __user *) addr;
+	void __user *dst;
+	void *src;
+	u64 len;
+	int rc;
+
+	/* Select the payload that belongs to this interrupt type. */
+	switch (inti->type) {
+	case KVM_S390_MCHK:
+		src = &inti->mchk;
+		dst = &uirq->u.mchk;
+		len = sizeof(inti->mchk);
+		break;
+	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+		src = &inti->io;
+		dst = &uirq->u.io;
+		len = sizeof(inti->io);
+		break;
+	case KVM_S390_INT_VIRTIO:
+	case KVM_S390_INT_SERVICE:
+		src = &inti->ext;
+		dst = &uirq->u.ext;
+		len = sizeof(inti->ext);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* The payload copy is attempted even if storing the type failed. */
+	rc = put_user(inti->type, (u64 __user *) &uirq->type);
+	return copy_to_user(dst, src, len) ? -EFAULT : rc;
+}
+
+/*
+ * Remove the first pending floating interrupt and hand it to user space.
+ *
+ * @kvm:  VM whose floating interrupt list is consulted
+ * @addr: user space address of the struct kvm_s390_irq to fill
+ *
+ * The entry is unlinked with kvm->lock and fi->lock held. The copy to user
+ * space is done only after both locks are dropped, since copy_to_user()
+ * may sleep. The dequeued entry is freed even if the copy fails, i.e. the
+ * interrupt is not put back on the list.
+ *
+ * Returns 0 on success, -ENODATA if no interrupt is pending, or the error
+ * reported by copy_irq_to_user().
+ */
+static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)
+{
+	struct kvm_s390_interrupt_info *inti;
+	struct kvm_s390_float_interrupt *fi;
+	int r = 0;
+
+	mutex_lock(&kvm->lock);
+	fi = &kvm->arch.float_int;
+	spin_lock(&fi->lock);
+	if (list_empty(&fi->list)) {
+		/* Release in reverse order of acquisition: spinlock first. */
+		spin_unlock(&fi->lock);
+		mutex_unlock(&kvm->lock);
+		return -ENODATA;
+	}
+	inti = list_first_entry(&fi->list, struct kvm_s390_interrupt_info, list);
+	list_del(&inti->list);
+	/* Last entry gone: no floating interrupt is pending any more. */
+	if (list_empty(&fi->list))
+		atomic_set(&fi->active, 0);
+	spin_unlock(&fi->lock);
+	mutex_unlock(&kvm->lock);
+
+	r = copy_irq_to_user(inti, addr);
+
+	kfree(inti);
+	return r;
+}
+
+/*
+ * get_attr handler of the flic device.
+ *
+ * Only the KVM_DEV_FLIC_DEQUEUE group is handled: one pending floating
+ * interrupt is dequeued into the user buffer at attr->addr. Any other
+ * group yields -EINVAL.
+ */
+static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	if (attr->group != KVM_DEV_FLIC_DEQUEUE)
+		return -EINVAL;
+
+	return dequeue_floating_irq(dev->kvm, attr->addr);
+}
+
+/*
+ * Fill @inti from the struct kvm_s390_irq at user address @addr.
+ *
+ * The interrupt type is read first; it selects which payload (ext, io or
+ * mchk) is then copied from the user union into @inti.
+ *
+ * Returns 0 on success, -EFAULT if reading user memory fails, or -EINVAL
+ * for an interrupt type not handled here.
+ */
+static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
+				     u64 addr)
+{
+	struct kvm_s390_irq __user *uirq = (struct kvm_s390_irq __user *) addr;
+	void __user *src;
+	void *dst;
+	u64 len;
+
+	if (get_user(inti->type, (u64 __user *)addr))
+		return -EFAULT;
+
+	/* Select the payload that belongs to the requested interrupt type. */
+	switch (inti->type) {
+	case KVM_S390_MCHK:
+		dst = (void *) &inti->mchk;
+		src = &uirq->u.mchk;
+		len = sizeof(inti->mchk);
+		break;
+	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+		dst = (void *) &inti->io;
+		src = &uirq->u.io;
+		len = sizeof(inti->io);
+		break;
+	case KVM_S390_INT_VIRTIO:
+	case KVM_S390_INT_SERVICE:
+		dst = (void *) &inti->ext;
+		src = &uirq->u.ext;
+		len = sizeof(inti->ext);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return copy_from_user(dst, src, len) ? -EFAULT : 0;
+}
+
+/*
+ * Queue the floating interrupt described by user space at attr->addr.
+ *
+ * A zeroed struct kvm_s390_interrupt_info is allocated and filled from the
+ * user supplied struct kvm_s390_irq. On success it is passed on to
+ * __inject_vm(); on failure it is freed here.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error
+ * reported by copy_irq_from_user().
+ */
+static int enqueue_floating_irq(struct kvm_device *dev,
+				 struct kvm_device_attr *attr)
+{
+	struct kvm_s390_interrupt_info *irq;
+	int rc;
+
+	irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+	if (!irq)
+		return -ENOMEM;
+
+	rc = copy_irq_from_user(irq, attr->addr);
+	if (rc)
+		kfree(irq);
+	else
+		__inject_vm(dev->kvm, irq);
+
+	return rc;
+}
+
+/*
+ * set_attr handler of the flic device.
+ *
+ * KVM_DEV_FLIC_ENQUEUE queues the interrupt described at attr->addr,
+ * KVM_DEV_FLIC_CLEAR_IRQS discards all pending floating interrupts and
+ * always succeeds. Any other group yields -EINVAL.
+ */
+static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	if (attr->group == KVM_DEV_FLIC_ENQUEUE)
+		return enqueue_floating_irq(dev, attr);
+
+	if (attr->group == KVM_DEV_FLIC_CLEAR_IRQS) {
+		clear_floating_interrupts(dev->kvm);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * create handler of the flic device.
+ *
+ * Records @dev as the VM's floating interrupt controller. Returns -EINVAL
+ * if @dev is NULL or the VM already has a flic registered, 0 otherwise.
+ * @type is not used.
+ */
+static int flic_create(struct kvm_device *dev, u32 type)
+{
+	if (!dev || dev->kvm->arch.flic)
+		return -EINVAL;
+
+	dev->kvm->arch.flic = dev;
+	return 0;
+}
+
+/*
+ * destroy handler of the flic device.
+ *
+ * Unregisters the controller from its VM and frees @dev: the destroy
+ * callback of a kvm_device is responsible for freeing the device, so
+ * omitting the kfree() would leak it.
+ */
+static void flic_destroy(struct kvm_device *dev)
+{
+	dev->kvm->arch.flic = NULL;
+	kfree(dev);
+}
+
+/*
+ * s390 floating irq controller (flic)
+ *
+ * Operations table of the "kvm-flic" device; it is selected for
+ * KVM_DEV_TYPE_FLIC when a device is created. Attribute reads dequeue a
+ * pending interrupt, attribute writes enqueue one or clear all of them.
+ */
+struct kvm_device_ops kvm_flic_ops = {
+	.name = "kvm-flic",
+	.get_attr = flic_get_attr,
+	.set_attr = flic_set_attr,
+	.create = flic_create,
+	.destroy = flic_destroy,
+};
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1e4e7b9..30e2c9a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -157,6 +157,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_S390_CSS_SUPPORT:
 	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
 		r = 1;
 		break;
 	case KVM_CAP_NR_VCPUS:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7c961e1..2077dd0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1065,6 +1065,7 @@  struct kvm_device *kvm_device_from_filp(struct file *filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_flic_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 450fae8..fa59f1a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -906,6 +906,7 @@  struct kvm_device_attr {
 #define KVM_DEV_TYPE_FSL_MPIC_20	1
 #define KVM_DEV_TYPE_FSL_MPIC_42	2
 #define KVM_DEV_TYPE_XICS		3
+#define KVM_DEV_TYPE_FLIC		5
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d469114..dd2cc28 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2270,6 +2270,11 @@  static int kvm_ioctl_create_device(struct kvm *kvm,
 		ops = &kvm_xics_ops;
 		break;
 #endif
+#ifdef CONFIG_S390
+	case KVM_DEV_TYPE_FLIC:
+		ops = &kvm_flic_ops;
+		break;
+#endif
 	default:
 		return -ENODEV;
 	}