diff mbox

[8/8] KVM: PPC: Book 3S: Add API for in-kernel XICS emulation

Message ID 20130418063255.GI25033@drongo (mailing list archive)
State New, archived
Headers show

Commit Message

Paul Mackerras April 18, 2013, 6:32 a.m. UTC
This adds the API for userspace to instantiate an XICS device in a VM
and connect VCPUs to it.  The API consists of a new device type for
the KVM_CREATE_DEVICE ioctl, a new capability KVM_CAP_IRQ_XICS, which
functions similarly to KVM_CAP_IRQ_MPIC, and the KVM_IRQ_LINE ioctl,
which is used to assert and deassert interrupt inputs of the XICS.

The XICS device has one attribute group, KVM_DEV_XICS_GRP_SOURCES.
Each attribute within this group corresponds to the state of one
interrupt source.  The attribute number is the same as the interrupt
source number.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt          |    8 ++
 Documentation/virtual/kvm/devices/xics.txt |   66 +++++++++
 arch/powerpc/include/asm/kvm_ppc.h         |    2 +
 arch/powerpc/include/uapi/asm/kvm.h        |   12 ++
 arch/powerpc/kvm/book3s_xics.c             |  211 ++++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_xics.h             |    6 +
 arch/powerpc/kvm/powerpc.c                 |   35 +++++
 include/linux/kvm_host.h                   |    1 +
 include/uapi/linux/kvm.h                   |    2 +
 virt/kvm/kvm_main.c                        |    5 +
 10 files changed, 339 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/xics.txt
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 54bb6ad..db230f8 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2756,3 +2756,11 @@  Parameters: args[0] is the MPIC device fd
             args[1] is the MPIC CPU number for this vcpu
 
 This capability connects the vcpu to an in-kernel MPIC device.
+
+6.7 KVM_CAP_IRQ_XICS
+
+Architectures: ppc
+Parameters: args[0] is the XICS device fd
+            args[1] is the XICS CPU number (server ID) for this vcpu
+
+This capability connects the vcpu to an in-kernel XICS device.
diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt
new file mode 100644
index 0000000..4286493
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/xics.txt
@@ -0,0 +1,66 @@ 
+XICS interrupt controller
+
+Device type supported: KVM_DEV_TYPE_XICS
+
+Groups:
+  KVM_DEV_XICS_SOURCES
+  Attributes: One per interrupt source, indexed by the source number.
+
+This device emulates the XICS (eXternal Interrupt Controller
+Specification) defined in PAPR.  The XICS has a set of interrupt
+sources, each identified by a 20-bit source number, and a set of
+Interrupt Control Presentation (ICP) entities, also called "servers",
+each associated with a virtual CPU.
+
+The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH
+capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and
+the interrupt server number (i.e. the vcpu number from the XICS's
+point of view) in args[1] of the kvm_enable_cap struct.  Each ICP has
+64 bits of state which can be read and written using the
+KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu.  The 64 bit
+state word has the following bitfields, starting at the
+least-significant end of the word:
+
+* Unused, 16 bits
+
+* Pending interrupt priority, 8 bits
+  Zero is the highest priority, 255 means no interrupt is pending.
+
+* Pending IPI (inter-processor interrupt) priority, 8 bits
+  Zero is the highest priority, 255 means no IPI is pending.
+
+* Pending interrupt source number, 24 bits
+  Zero means no interrupt pending, 2 means an IPI is pending
+
+* Current processor priority, 8 bits
+  Zero is the highest priority, meaning no interrupts can be
+  delivered, and 255 is the lowest priority.
+
+Each source has 64 bits of state that can be read and written using
+the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the
+KVM_DEV_XICS_SOURCES attribute group, with the attribute number being
+the interrupt source number.  The 64 bit state word has the following
+bitfields, starting from the least-significant end of the word:
+
+* Destination (server number), 32 bits
+  This specifies where the interrupt should be sent, and is the
+  interrupt server number specified for the destination vcpu.
+
+* Priority, 8 bits
+  This is the priority specified for this interrupt source, where 0 is
+  the highest priority and 255 is the lowest.  An interrupt with a
+  priority of 255 will never be delivered.
+
+* Level sensitive flag, 1 bit
+  This bit is 1 for a level-sensitive interrupt source, or 0 for
+  edge-sensitive (or MSI).
+
+* Masked flag, 1 bit
+  This bit is set to 1 if the interrupt is masked (cannot be delivered
+  regardless of its priority), for example by the ibm,int-off RTAS
+  call, or 0 if it is not masked.
+
+* Pending flag, 1 bit
+  This bit is 1 if the source has a pending interrupt, otherwise 0.
+
+Only one XICS instance may be created per VM.
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 44c8ea1..6041406 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -315,6 +315,8 @@  extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
 extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
 extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
+			struct kvm_vcpu *vcpu, u32 cpu);
 #else
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1388282..ed5f6a2 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -451,4 +451,16 @@  struct kvm_get_htab_header {
 #define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
 #define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
 
+/* PPC64 eXternal Interrupt Controller Specification */
+#define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
+
+/* Layout of 64-bit source attribute values */
+#define  KVM_XICS_DESTINATION_SHIFT	0
+#define  KVM_XICS_DESTINATION_MASK	0xffffffffULL
+#define  KVM_XICS_PRIORITY_SHIFT	32
+#define  KVM_XICS_PRIORITY_MASK		0xff
+#define  KVM_XICS_LEVEL_SENSITIVE	(1ULL << 40)
+#define  KVM_XICS_MASKED		(1ULL << 41)
+#define  KVM_XICS_PENDING		(1ULL << 42)
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index ee841ed..0189f72 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -11,6 +11,7 @@ 
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/gfp.h>
+#include <linux/anon_inodes.h>
 
 #include <asm/uaccess.h>
 #include <asm/kvm_book3s.h>
@@ -45,6 +46,34 @@ 
  */
 
 /*
+ * Lifetimes
+ * =========
+ *
+ * The kvm_device code takes a reference to the struct kvm, so the
+ * struct kvm will exist while the device fd is open.  That implies
+ * that we can rely on the presence of the struct kvm in the device
+ * get_attr and set_attr calls.  Similarly, we can rely on the presence
+ * of the struct kvm in anything called from within a vcpu KVM_RUN
+ * call, i.e. any hcall or rtas call.
+ *
+ * Vcpus take a reference to the kvm_device, which is only dropped in
+ * kvmppc_xics_free_icp, which only gets called when the vcpu struct
+ * is freed, which only happens (currently) when the kvm struct is
+ * being freed.  That implies that we can rely on the presence of the
+ * kvm_device and the struct kvmppc_xics (which has the same lifetime)
+ * in any hcall or rtas call.  We can also rely on the kvm->xics pointer
+ * in those circumstances, since it only gets cleared kvmppc_xics_free.
+ *
+ * The remaining case is then the kvm_vm_ioctl_xics_irq function,
+ * called from the KVM_IRQ_LINE vm ioctl.  If there are no vcpus
+ * connected, it would be possible for the kvm_device and the struct
+ * kvmppc_xics to go away concurrently with kvm_vm_ioctl_xics_irq.
+ * To guard against this possibility, we use an RCU read lock in
+ * kvm_vm_ioctl_xics_irq and free the struct kvmppc_xics and associated
+ * structures with kfree_rcu().
+ */
+
+/*
  * TODO
  * ====
  *
@@ -55,8 +84,6 @@ 
  *
  * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
  *   locks array to improve scalability
- *
- * - ioctl's to save/restore the entire state for snapshot & migration
  */
 
 /* -- ICS routines -- */
@@ -891,8 +918,8 @@  static void xics_debugfs_init(struct kvmppc_xics *xics)
 	kfree(name);
 }
 
-struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
-					  struct kvmppc_xics *xics, int irq)
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+					struct kvmppc_xics *xics, int irq)
 {
 	struct kvmppc_ics *ics;
 	int i, icsid;
@@ -1044,6 +1071,93 @@  int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
 	return 0;
 }
 
+static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	int ret;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val, prio;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return -ENOENT;
+
+	irqp = &ics->irq_state[idx];
+	mutex_lock(&ics->lock);
+	ret = -ENOENT;
+	if (irqp->exists) {
+		val = irqp->server;
+		prio = irqp->priority;
+		if (prio == MASKED) {
+			val |= KVM_XICS_MASKED;
+			prio = irqp->saved_priority;
+		}
+		val |= prio << KVM_XICS_PRIORITY_SHIFT;
+		if (irqp->asserted)
+			val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING;
+		else if (irqp->masked_pending || irqp->resend)
+			val |= KVM_XICS_PENDING;
+		ret = 0;
+	}
+	mutex_unlock(&ics->lock);
+
+	if (!ret && put_user(val, ubufp))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 prio;
+	u32 server;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics) {
+		ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
+		if (!ics)
+			return -ENOMEM;
+	}
+	irqp = &ics->irq_state[idx];
+	if (get_user(val, ubufp))
+		return -EFAULT;
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	prio = val >> KVM_XICS_PRIORITY_SHIFT;
+	if (prio != MASKED &&
+	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+	irqp->server = server;
+	irqp->saved_priority = prio;
+	if (val & KVM_XICS_MASKED)
+		prio = MASKED;
+	irqp->priority = prio;
+	irqp->resend = 0;
+	irqp->masked_pending = 0;
+	irqp->asserted = 0;
+	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
+		irqp->asserted = 1;
+	irqp->exists = 1;
+	mutex_unlock(&ics->lock);
+
+	if (val & KVM_XICS_PENDING)
+		icp_deliver_irq(xics, NULL, irqp->number);
+
+	return 0;
+}
+
 /* -- ioctls -- */
 
 int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
@@ -1053,9 +1167,11 @@  int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
 
 	/* locking against multiple callers? */
 
+	rcu_read_lock();
+	r = -ENODEV;
 	xics = kvm->arch.xics;
 	if (!xics)
-		return -ENODEV;
+		goto out;
 
 	switch (args->level) {
 	case KVM_INTERRUPT_SET:
@@ -1067,11 +1183,48 @@  int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
 		r = -EINVAL;
 	}
 
+ out:
+	rcu_read_unlock();
 	return r;
 }
 
-void kvmppc_xics_free(struct kvmppc_xics *xics)
+static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_set_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_get_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	}
+	return -ENXIO;
+}
+
+static void kvmppc_xics_free(struct kvm_device *dev)
 {
+	struct kvmppc_xics *xics = dev->private;
 	int i;
 	struct kvm *kvm = xics->kvm;
 
@@ -1081,19 +1234,24 @@  void kvmppc_xics_free(struct kvmppc_xics *xics)
 		kvm->arch.xics = NULL;
 
 	for (i = 0; i <= xics->max_icsid; i++)
-		kfree(xics->ics[i]);
-	kfree(xics);
+		if (xics->ics[i])
+			kfree_rcu(xics->ics[i], rhead);
+	kfree_rcu(xics, rhead);
+	kfree(dev);
 }
 
-int kvm_xics_create(struct kvm *kvm, u32 type)
+static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 {
 	struct kvmppc_xics *xics;
+	struct kvm *kvm = dev->kvm;
 	int ret = 0;
 
 	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
 	if (!xics)
 		return -ENOMEM;
 
+	dev->private = xics;
+	xics->dev = dev;
 	xics->kvm = kvm;
 
 	/* Already there ? */
@@ -1120,11 +1278,46 @@  int kvm_xics_create(struct kvm *kvm, u32 type)
 	return 0;
 }
 
+struct kvm_device_ops kvm_xics_ops = {
+	.name = "kvm-xics",
+	.create = kvmppc_xics_create,
+	.destroy = kvmppc_xics_free,
+	.set_attr = xics_set_attr,
+	.get_attr = xics_get_attr,
+	.has_attr = xics_has_attr,
+};
+
+int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 xcpu)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int r = -EBUSY;
+
+	if (dev->ops != &kvm_xics_ops)
+		return -EPERM;
+	if (xics->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type)
+		return -EBUSY;
+
+	r = kvmppc_xics_create_icp(vcpu, xcpu);
+	if (!r) {
+		vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+		kvm_device_get(xics->dev);
+	}
+
+	return r;
+}
+
 void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+
 	if (!vcpu->arch.icp)
 		return;
 	kfree(vcpu->arch.icp);
 	vcpu->arch.icp = NULL;
 	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+	if (xics)
+		kvm_device_put(xics->dev);
 }
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index e4fdec3..939074c 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -10,6 +10,9 @@ 
 #ifndef _KVM_PPC_BOOK3S_XICS_H
 #define _KVM_PPC_BOOK3S_XICS_H
 
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+
 /*
  * We use a two-level tree to store interrupt source information.
  * There are up to 1024 ICS nodes, each of which can represent
@@ -83,12 +86,15 @@  struct kvmppc_icp {
 struct kvmppc_ics {
 	struct mutex lock;
 	u16 icsid;
+	struct rcu_head rhead;
 	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
 };
 
 struct kvmppc_xics {
 	struct kvm *kvm;
+	struct kvm_device *dev;
 	struct dentry *dentry;
+	struct rcu_head rhead;
 	u32 max_icsid;
 	bool real_mode;
 	bool real_mode_dbg;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2441b81..232a174 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -341,6 +341,9 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_PPC_ALLOC_HTAB:
 	case KVM_CAP_PPC_RTAS:
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS:
+#endif
 		r = 1;
 		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -829,6 +832,25 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		break;
 	}
 #endif
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS: {
+		struct file *filp;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		filp = fget(cap->args[0]);
+		if (!filp)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(filp);
+		if (dev)
+			r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fput(filp);
+		break;
+	}
+#endif /* CONFIG_KVM_XICS */
 	default:
 		r = -EINVAL;
 		break;
@@ -1037,6 +1059,19 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+	case KVM_IRQ_LINE: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_irq_level args;
+
+		r = -EFAULT;
+		if (copy_from_user(&args, argp, sizeof(args)))
+			break;
+
+		/* Call all the interrupt controllers */
+		r = kvm_vm_ioctl_xics_irq(kvm, &args);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 55d3d9c..adfb42c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1102,6 +1102,7 @@  void kvm_device_put(struct kvm_device *dev);
 struct kvm_device *kvm_device_from_filp(struct file *filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
+extern struct kvm_device_ops kvm_xics_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d100ac6..7189175 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -671,6 +671,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_DEVICE_CTRL 89
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
+#define KVM_CAP_IRQ_XICS 92
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -933,6 +934,7 @@  struct kvm_device_attr {
 
 #define KVM_DEV_TYPE_FSL_MPIC_20	1
 #define KVM_DEV_TYPE_FSL_MPIC_42	2
+#define KVM_DEV_TYPE_XICS		3
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5a51869..3ba2076 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2244,6 +2244,11 @@  static int kvm_ioctl_create_device(struct kvm *kvm,
 		ops = &kvm_mpic_ops;
 		break;
 #endif
+#ifdef CONFIG_KVM_XICS
+	case KVM_DEV_TYPE_XICS:
+		ops = &kvm_xics_ops;
+		break;
+#endif
 	default:
 		return -ENODEV;
 	}