diff mbox series

[v5,22/36] spapr/xive: add models for KVM support

Message ID 20181116105729.23240-23-clg@kaod.org (mailing list archive)
State New, archived
Headers show
Series ppc: support for the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater Nov. 16, 2018, 10:57 a.m. UTC
This introduces a set of XIVE models specific to KVM which derive from
the XIVE base models. The interfaces with KVM are a new capability and
a new KVM device for the XIVE native exploitation interrupt mode.

They handle the initialization of the TIMA and the source ESB memory
regions which have a different type under KVM. These are 'ram device'
memory mappings, similarly to VFIO, exposed to the guest and the
associated VMAs on the host are populated dynamically with the
appropriate pages using a fault handler.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 default-configs/ppc64-softmmu.mak |   1 +
 include/hw/ppc/spapr_xive.h       |  18 ++
 include/hw/ppc/xive.h             |   3 +
 linux-headers/asm-powerpc/kvm.h   |  12 +
 linux-headers/linux/kvm.h         |   4 +
 target/ppc/kvm_ppc.h              |   6 +
 hw/intc/spapr_xive_kvm.c          | 430 ++++++++++++++++++++++++++++++
 hw/ppc/spapr.c                    |   7 +-
 hw/ppc/spapr_irq.c                |  19 +-
 target/ppc/kvm.c                  |   7 +
 hw/intc/Makefile.objs             |   1 +
 11 files changed, 503 insertions(+), 5 deletions(-)
 create mode 100644 hw/intc/spapr_xive_kvm.c

Comments

David Gibson Nov. 28, 2018, 5:52 a.m. UTC | #1
On Fri, Nov 16, 2018 at 11:57:15AM +0100, Cédric Le Goater wrote:
> This introduces a set of XIVE models specific to KVM which derive from
> the XIVE base models. The interfaces with KVM are a new capability and
> a new KVM device for the XIVE native exploitation interrupt mode.
> 
> They handle the initialization of the TIMA and the source ESB memory
> regions which have a different type under KVM. These are 'ram device'
> memory mappings, similarly to VFIO, exposed to the guest and the
> associated VMAs on the host are populated dynamically with the
> appropriate pages using a fault handler.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

The logic here looks fine, but I think it would be better to activate
it with explicit if (kvm) type logic rather than using a subclass.

> ---
>  default-configs/ppc64-softmmu.mak |   1 +
>  include/hw/ppc/spapr_xive.h       |  18 ++
>  include/hw/ppc/xive.h             |   3 +
>  linux-headers/asm-powerpc/kvm.h   |  12 +
>  linux-headers/linux/kvm.h         |   4 +
>  target/ppc/kvm_ppc.h              |   6 +
>  hw/intc/spapr_xive_kvm.c          | 430 ++++++++++++++++++++++++++++++
>  hw/ppc/spapr.c                    |   7 +-
>  hw/ppc/spapr_irq.c                |  19 +-
>  target/ppc/kvm.c                  |   7 +
>  hw/intc/Makefile.objs             |   1 +
>  11 files changed, 503 insertions(+), 5 deletions(-)
>  create mode 100644 hw/intc/spapr_xive_kvm.c
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 7f34ad0528ed..c1bf5cd951f5 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -18,6 +18,7 @@ CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
>  CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>  CONFIG_XIVE=$(CONFIG_PSERIES)
>  CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
> +CONFIG_XIVE_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>  CONFIG_MEM_DEVICE=y
>  CONFIG_DIMM=y
>  CONFIG_SPAPR_RNG=y
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index aca2969a09ab..9c817bb7ae74 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -40,6 +40,10 @@ typedef struct sPAPRXive {
>      /* TIMA mapping address */
>      hwaddr        tm_base;
>      MemoryRegion  tm_mmio;
> +
> +    /* KVM support */
> +    int           fd;
> +    void          *tm_mmap;
>  } sPAPRXive;
>  
>  #define SPAPR_XIVE_BASE_CLASS(klass) \
> @@ -83,4 +87,18 @@ void spapr_xive_hcall_init(sPAPRMachineState *spapr);
>  void spapr_dt_xive(sPAPRXive *xive, int nr_servers, void *fdt,
>                     uint32_t phandle);
>  
> +/*
> + * XIVE KVM models
> + */
> +
> +#define TYPE_SPAPR_XIVE_KVM  "spapr-xive-kvm"
> +#define SPAPR_XIVE_KVM(obj)  OBJECT_CHECK(sPAPRXive, (obj), TYPE_SPAPR_XIVE_KVM)
> +
> +#define TYPE_XIVE_SOURCE_KVM "xive-source-kvm"
> +#define XIVE_SOURCE_KVM(obj) \
> +    OBJECT_CHECK(XiveSource, (obj), TYPE_XIVE_SOURCE_KVM)
> +
> +#define TYPE_XIVE_TCTX_KVM   "xive-tctx-kvm"
> +#define XIVE_TCTX_KVM(obj)   OBJECT_CHECK(XiveTCTX, (obj), TYPE_XIVE_TCTX_KVM)
> +
>  #endif /* PPC_SPAPR_XIVE_H */
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 281ed370121c..7aaf5a182cb3 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -69,6 +69,9 @@ typedef struct XiveSource {
>      uint32_t        esb_shift;
>      MemoryRegion    esb_mmio;
>  
> +    /* KVM support */
> +    void            *esb_mmap;
> +
>      XiveFabric      *xive;
>  } XiveSource;
>  
> diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
> index 8c876c166ef2..f34c971491dd 100644
> --- a/linux-headers/asm-powerpc/kvm.h
> +++ b/linux-headers/asm-powerpc/kvm.h

Updates to linux-headers need to be split out into a separate patch.
Eventually (i.e. by the time we merge) they should be just "update
headers to SHA XXX" not picking and choosing pieces.

> @@ -675,4 +675,16 @@ struct kvm_ppc_cpu_char {
>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>  
> +/* POWER9 XIVE Native Interrupt Controller */
> +#define KVM_DEV_XIVE_GRP_CTRL		1
> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
> +#define   KVM_DEV_XIVE_GET_TIMA_FD	2
> +#define   KVM_DEV_XIVE_VC_BASE		3
> +#define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
> +
> +/* Layout of 64-bit XIVE source attribute values */
> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
> +
> +
>  #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index f11a7eb49cfa..59fa8d8d7f39 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -965,6 +965,8 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_COALESCED_PIO 162
>  #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
>  #define KVM_CAP_EXCEPTION_PAYLOAD 164
> +#define KVM_CAP_ARM_VM_IPA_SIZE 165
> +#define KVM_CAP_PPC_IRQ_XIVE 166
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1188,6 +1190,8 @@ enum kvm_device_type {
>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
>  	KVM_DEV_TYPE_ARM_VGIC_ITS,
>  #define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
> +	KVM_DEV_TYPE_XIVE,
> +#define KVM_DEV_TYPE_XIVE		KVM_DEV_TYPE_XIVE
>  	KVM_DEV_TYPE_MAX,
>  };
>  
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index bdfaa4e70a83..d2159660f9f2 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>  bool kvmppc_has_cap_htm(void);
>  bool kvmppc_has_cap_mmu_radix(void);
>  bool kvmppc_has_cap_mmu_hash_v3(void);
> +bool kvmppc_has_cap_xive(void);
>  int kvmppc_get_cap_safe_cache(void);
>  int kvmppc_get_cap_safe_bounds_check(void);
>  int kvmppc_get_cap_safe_indirect_branch(void);
> @@ -307,6 +308,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>      return false;
>  }
>  
> +static inline bool kvmppc_has_cap_xive(void)
> +{
> +    return false;
> +}
> +
>  static inline int kvmppc_get_cap_safe_cache(void)
>  {
>      return 0;
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> new file mode 100644
> index 000000000000..767f90826e43
> --- /dev/null
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -0,0 +1,430 @@
> +/*
> + * QEMU PowerPC sPAPR XIVE interrupt controller model
> + *
> + * Copyright (c) 2017-2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "target/ppc/cpu.h"
> +#include "sysemu/cpus.h"
> +#include "sysemu/kvm.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_xive.h"
> +#include "hw/ppc/xive.h"
> +#include "kvm_ppc.h"
> +
> +#include <sys/ioctl.h>
> +
> +/*
> + * Helpers for CPU hotplug
> + */
> +typedef struct KVMEnabledCPU {
> +    unsigned long vcpu_id;
> +    QLIST_ENTRY(KVMEnabledCPU) node;
> +} KVMEnabledCPU;
> +
> +static QLIST_HEAD(, KVMEnabledCPU)
> +    kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus);
> +
> +static bool kvm_cpu_is_enabled(CPUState *cs)
> +{
> +    KVMEnabledCPU *enabled_cpu;
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +
> +    QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) {
> +        if (enabled_cpu->vcpu_id == vcpu_id) {
> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
> +static void kvm_cpu_enable(CPUState *cs)
> +{
> +    KVMEnabledCPU *enabled_cpu;
> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
> +
> +    enabled_cpu = g_malloc(sizeof(*enabled_cpu));
> +    enabled_cpu->vcpu_id = vcpu_id;
> +    QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
> +}

Blech, I hope we can find a better way of tracking this than an ugly
list.

> +
> +/*
> + * XIVE Thread Interrupt Management context (KVM)
> + */
> +
> +static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
> +{
> +    sPAPRXive *xive;
> +    unsigned long vcpu_id;
> +    int ret;
> +
> +    /* Check if CPU was hot unplugged and replugged. */
> +    if (kvm_cpu_is_enabled(tctx->cs)) {
> +        return;
> +    }
> +
> +    vcpu_id = kvm_arch_vcpu_id(tctx->cs);
> +    xive = SPAPR_XIVE_KVM(tctx->xrtr);

Is this the first use of tctx->xrtr?

> +    ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd,
> +                              vcpu_id, 0);
> +    if (ret < 0) {
> +        error_setg(errp, "Unable to connect CPU%ld to KVM XIVE device: %s",
> +                   vcpu_id, strerror(errno));
> +        return;
> +    }
> +
> +    kvm_cpu_enable(tctx->cs);
> +}
> +
> +static void xive_tctx_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    XiveTCTX *tctx = XIVE_TCTX_KVM(dev);
> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(dev);
> +    Error *local_err = NULL;
> +
> +    xtc->parent_realize(dev, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    xive_tctx_kvm_init(tctx, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +}
> +
> +static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
> +
> +    dc->desc = "sPAPR XIVE KVM Interrupt Thread Context";
> +
> +    device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
> +                                    &xtc->parent_realize);
> +}
> +
> +static const TypeInfo xive_tctx_kvm_info = {
> +    .name          = TYPE_XIVE_TCTX_KVM,
> +    .parent        = TYPE_XIVE_TCTX_BASE,
> +    .instance_size = sizeof(XiveTCTX),
> +    .class_init    = xive_tctx_kvm_class_init,
> +    .class_size    = sizeof(XiveTCTXClass),
> +};
> +
> +/*
> + * XIVE Interrupt Source (KVM)
> + */
> +
> +static void xive_source_kvm_init(XiveSource *xsrc, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
> +    int i;
> +
> +    /*
> +     * At reset, interrupt sources are simply created and MASKED. We
> +     * only need to inform the KVM device about their type: LSI or
> +     * MSI.
> +     */
> +    for (i = 0; i < xsrc->nr_irqs; i++) {
> +        Error *local_err = NULL;
> +        uint64_t state = 0;
> +
> +        if (xive_source_irq_is_lsi(xsrc, i)) {
> +            state |= KVM_XIVE_LEVEL_SENSITIVE;
> +            if (xsrc->status[i] & XIVE_STATUS_ASSERTED) {
> +                state |= KVM_XIVE_LEVEL_ASSERTED;
> +            }
> +        }
> +
> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCES, i, &state,
> +                          true, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
> +static void xive_source_kvm_reset(DeviceState *dev)
> +{
> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
> +
> +    xsc->parent_reset(dev);
> +
> +    xive_source_kvm_init(xsrc, &error_fatal);
> +}
> +
> +static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
> +{
> +    XiveSource *xsrc = opaque;
> +    struct kvm_irq_level args;
> +    int rc;
> +
> +    args.irq = srcno;
> +    if (!xive_source_irq_is_lsi(xsrc, srcno)) {
> +        if (!val) {
> +            return;
> +        }
> +        args.level = KVM_INTERRUPT_SET;
> +    } else {
> +        if (val) {
> +            xsrc->status[srcno] |= XIVE_STATUS_ASSERTED;
> +            args.level = KVM_INTERRUPT_SET_LEVEL;
> +        } else {
> +            xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED;
> +            args.level = KVM_INTERRUPT_UNSET;
> +        }
> +    }
> +    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
> +    if (rc < 0) {
> +        error_report("kvm_irq_line() failed : %s", strerror(errno));
> +    }
> +}
> +
> +static void *spapr_xive_kvm_mmap(sPAPRXive *xive, int ctrl, size_t len,
> +                                 Error **errp)
> +{
> +    Error *local_err = NULL;
> +    void *addr;
> +    int fd;
> +
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, ctrl, &fd, false,
> +                      &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return NULL;
> +    }
> +
> +    addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
> +    close(fd);
> +    if (addr == MAP_FAILED) {
> +        error_setg_errno(errp, errno, "Unable to set XIVE mmaping");
> +        return NULL;
> +    }
> +
> +    return addr;
> +}
> +
> +/*
> + * The sPAPRXive KVM model should have initialized the KVM device
> + * before initializing the source
> + */
> +static void xive_source_kvm_mmap(XiveSource *xsrc, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
> +    Error *local_err = NULL;
> +    size_t esb_len;
> +
> +    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
> +    xsrc->esb_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_ESB_FD,
> +                                         esb_len, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc),
> +                                      "xive.esb", esb_len, xsrc->esb_mmap);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(xsrc), &xsrc->esb_mmio);
> +}
> +
> +static void xive_source_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
> +    Error *local_err = NULL;
> +
> +    xsc->parent_realize(dev, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    xsrc->qirqs = qemu_allocate_irqs(xive_source_kvm_set_irq, xsrc,
> +                                     xsrc->nr_irqs);
> +
> +    xive_source_kvm_mmap(xsrc, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +}
> +
> +static void xive_source_kvm_unrealize(DeviceState *dev, Error **errp)
> +{
> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
> +    size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
> +
> +    munmap(xsrc->esb_mmap, esb_len);
> +}
> +
> +static void xive_source_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_CLASS(klass);
> +
> +    device_class_set_parent_realize(dc, xive_source_kvm_realize,
> +                                    &xsc->parent_realize);
> +    device_class_set_parent_reset(dc, xive_source_kvm_reset,
> +                                  &xsc->parent_reset);
> +
> +    dc->desc = "sPAPR XIVE KVM Interrupt Source";
> +    dc->unrealize = xive_source_kvm_unrealize;
> +}
> +
> +static const TypeInfo xive_source_kvm_info = {
> +    .name = TYPE_XIVE_SOURCE_KVM,
> +    .parent = TYPE_XIVE_SOURCE_BASE,
> +    .instance_size = sizeof(XiveSource),
> +    .class_init    = xive_source_kvm_class_init,
> +    .class_size    = sizeof(XiveSourceClass),
> +};
> +
> +/*
> + * sPAPR XIVE Router (KVM)
> + */
> +
> +static void spapr_xive_kvm_instance_init(Object *obj)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_KVM(obj);
> +
> +    xive->fd = -1;
> +
> +    /* We need a KVM flavored source */
> +    object_initialize(&xive->source, sizeof(xive->source),
> +                      TYPE_XIVE_SOURCE_KVM);
> +    object_property_add_child(obj, "source", OBJECT(&xive->source), NULL);
> +
> +    /* No KVM support for END ESBs. OPAL doesn't either */
> +    object_initialize(&xive->end_source, sizeof(xive->end_source),
> +                      TYPE_XIVE_END_SOURCE);
> +    object_property_add_child(obj, "end_source", OBJECT(&xive->end_source),
> +                              NULL);
> +}
> +
> +static void spapr_xive_kvm_init(sPAPRXive *xive, Error **errp)
> +{
> +    Error *local_err = NULL;
> +    size_t tima_len;
> +
> +    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> +        error_setg(errp,
> +                   "IRQ_XIVE capability must be present for KVM XIVE device");
> +        return;
> +    }
> +
> +    /* First, create the KVM XIVE device */
> +    xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false);
> +    if (xive->fd < 0) {
> +        error_setg_errno(errp, -xive->fd, "error creating KVM XIVE device");
> +        return;
> +    }
> +
> +    /* Source ESBs KVM mapping
> +     *
> +     * Inform KVM where we will map the ESB pages. This is needed by
> +     * the H_INT_GET_SOURCE_INFO hcall which returns the source
> +     * characteristics, among which the ESB page address.
> +     */
> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_VC_BASE,
> +                      &xive->vc_base, true, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    /* Let the XiveSource KVM model handle the mapping for the moment */
> +
> +    /* TIMA KVM mapping
> +     *
> +     * We could also inform KVM where the TIMA will be mapped but as
> +     * this is a fixed MMIO address for the system it does not seem
> +     * necessary to provide a KVM ioctl to change it.
> +     */
> +    tima_len = 4ull << TM_SHIFT;
> +    xive->tm_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_TIMA_FD,
> +                                        tima_len, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +    memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive),
> +                                      "xive.tima", tima_len, xive->tm_mmap);
> +    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
> +
> +    kvm_kernel_irqchip = true;
> +    kvm_msi_via_irqfd_allowed = true;
> +    kvm_gsi_direct_mapping = true;
> +}
> +
> +static void spapr_xive_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(dev);
> +    Error *local_err = NULL;
> +
> +    spapr_xive_kvm_init(xive, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    /* Initialize the source and the local routing tables */
> +    sxc->parent_realize(dev, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +}
> +
> +static void spapr_xive_kvm_unrealize(DeviceState *dev, Error **errp)
> +{
> +    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
> +
> +    close(xive->fd);
> +    xive->fd = -1;
> +
> +    munmap(xive->tm_mmap, 4ull << TM_SHIFT);
> +}
> +
> +static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_CLASS(klass);
> +
> +    device_class_set_parent_realize(dc, spapr_xive_kvm_realize,
> +                                    &sxc->parent_realize);
> +
> +    dc->desc = "sPAPR XIVE KVM Interrupt Controller";
> +    dc->unrealize = spapr_xive_kvm_unrealize;
> +}
> +
> +static const TypeInfo spapr_xive_kvm_info = {
> +    .name = TYPE_SPAPR_XIVE_KVM,
> +    .parent = TYPE_SPAPR_XIVE_BASE,
> +    .instance_init = spapr_xive_kvm_instance_init,
> +    .instance_size = sizeof(sPAPRXive),
> +    .class_init = spapr_xive_kvm_class_init,
> +    .class_size = sizeof(sPAPRXiveClass),
> +};
> +
> +static void xive_kvm_register_types(void)
> +{
> +    type_register_static(&spapr_xive_kvm_info);
> +    type_register_static(&xive_source_kvm_info);
> +    type_register_static(&xive_tctx_kvm_info);
> +}
> +
> +type_init(xive_kvm_register_types)
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index f9cf2debff5a..d1be2579cd9b 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1125,8 +1125,11 @@ static void spapr_dt_ov5_platform_support(sPAPRMachineState *spapr, void *fdt,
>          } else {
>              val[3] = 0x00; /* Hash */
>          }
> -        /* TODO: test KVM support */
> -        val[1] = smc->irq->ov5;
> +        if (kvmppc_has_cap_xive()) {
> +            val[1] = smc->irq->ov5;
> +        } else {
> +            val[1] = 0x00;
> +        }
>      } else {
>          val[1] = smc->irq->ov5;
>  
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 33dd5da7d255..92ef53743b64 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -273,9 +273,22 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
>      Error *local_err = NULL;
>  
>      /* KVM XIVE support */
> -    if (kvm_enabled()) {
> -        if (machine_kernel_irqchip_required(machine)) {
> -            error_setg(errp, "kernel_irqchip requested. no XIVE support");
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        spapr->xive_tctx_type = TYPE_XIVE_TCTX_KVM;
> +        spapr->xive = spapr_xive_create(spapr, TYPE_SPAPR_XIVE_KVM, nr_irqs,
> +                                        nr_servers, &local_err);
> +
> +        if (local_err && machine_kernel_irqchip_required(machine)) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "kernel_irqchip requested but init failed : ");
> +            return;
> +        }
> +
> +        /*
> +         * XIVE support is activated under KVM. No need to initialize
> +         * the fallback mode under QEMU
> +         */
> +        if (spapr->xive) {
>              return;
>          }
>      }
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index f81327d6cd47..3b7cf106242b 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -86,6 +86,7 @@ static int cap_fixup_hcalls;
>  static int cap_htm;             /* Hardware transactional memory support */
>  static int cap_mmu_radix;
>  static int cap_mmu_hash_v3;
> +static int cap_xive;
>  static int cap_resize_hpt;
>  static int cap_ppc_pvr_compat;
>  static int cap_ppc_safe_cache;
> @@ -149,6 +150,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
>      cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
>      cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
> +    cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE);
>      cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
>      kvmppc_get_cpu_characteristics(s);
>      cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
> @@ -2385,6 +2387,11 @@ static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
>      return 0;
>  }
>  
> +bool kvmppc_has_cap_xive(void)
> +{
> +    return cap_xive;
> +}
> +
>  static void kvmppc_get_cpu_characteristics(KVMState *s)
>  {
>      struct kvm_ppc_cpu_char c;
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index eacd26836ebf..dd4d69db2bdd 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -39,6 +39,7 @@ obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>  obj-$(CONFIG_XIVE) += xive.o
>  obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
> +obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o
>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
Cédric Le Goater Nov. 28, 2018, 10:45 p.m. UTC | #2
On 11/28/18 6:52 AM, David Gibson wrote:
> On Fri, Nov 16, 2018 at 11:57:15AM +0100, Cédric Le Goater wrote:
>> This introduces a set of XIVE models specific to KVM which derive from
>> the XIVE base models. The interfaces with KVM are a new capability and
>> a new KVM device for the XIVE native exploitation interrupt mode.
>>
>> They handle the initialization of the TIMA and the source ESB memory
>> regions which have a different type under KVM. These are 'ram device'
>> memory mappings, similarly to VFIO, exposed to the guest and the
>> associated VMAs on the host are populated dynamically with the
>> appropriate pages using a fault handler.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> 
> The logic here looks fine, but I think it would be better to activate
> it with explicit if (kvm) type logic rather than using a subclass.

ok. ARM has taken a different path, the one proposed below, but it should 
be possible to use a "if (kvm)" type logic. There should be less noise 
in the object design.

 
>> ---
>>  default-configs/ppc64-softmmu.mak |   1 +
>>  include/hw/ppc/spapr_xive.h       |  18 ++
>>  include/hw/ppc/xive.h             |   3 +
>>  linux-headers/asm-powerpc/kvm.h   |  12 +
>>  linux-headers/linux/kvm.h         |   4 +
>>  target/ppc/kvm_ppc.h              |   6 +
>>  hw/intc/spapr_xive_kvm.c          | 430 ++++++++++++++++++++++++++++++
>>  hw/ppc/spapr.c                    |   7 +-
>>  hw/ppc/spapr_irq.c                |  19 +-
>>  target/ppc/kvm.c                  |   7 +
>>  hw/intc/Makefile.objs             |   1 +
>>  11 files changed, 503 insertions(+), 5 deletions(-)
>>  create mode 100644 hw/intc/spapr_xive_kvm.c
>>
>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
>> index 7f34ad0528ed..c1bf5cd951f5 100644
>> --- a/default-configs/ppc64-softmmu.mak
>> +++ b/default-configs/ppc64-softmmu.mak
>> @@ -18,6 +18,7 @@ CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
>>  CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>>  CONFIG_XIVE=$(CONFIG_PSERIES)
>>  CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>> +CONFIG_XIVE_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
>>  CONFIG_MEM_DEVICE=y
>>  CONFIG_DIMM=y
>>  CONFIG_SPAPR_RNG=y
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index aca2969a09ab..9c817bb7ae74 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -40,6 +40,10 @@ typedef struct sPAPRXive {
>>      /* TIMA mapping address */
>>      hwaddr        tm_base;
>>      MemoryRegion  tm_mmio;
>> +
>> +    /* KVM support */
>> +    int           fd;
>> +    void          *tm_mmap;
>>  } sPAPRXive;
>>  
>>  #define SPAPR_XIVE_BASE_CLASS(klass) \
>> @@ -83,4 +87,18 @@ void spapr_xive_hcall_init(sPAPRMachineState *spapr);
>>  void spapr_dt_xive(sPAPRXive *xive, int nr_servers, void *fdt,
>>                     uint32_t phandle);
>>  
>> +/*
>> + * XIVE KVM models
>> + */
>> +
>> +#define TYPE_SPAPR_XIVE_KVM  "spapr-xive-kvm"
>> +#define SPAPR_XIVE_KVM(obj)  OBJECT_CHECK(sPAPRXive, (obj), TYPE_SPAPR_XIVE_KVM)
>> +
>> +#define TYPE_XIVE_SOURCE_KVM "xive-source-kvm"
>> +#define XIVE_SOURCE_KVM(obj) \
>> +    OBJECT_CHECK(XiveSource, (obj), TYPE_XIVE_SOURCE_KVM)
>> +
>> +#define TYPE_XIVE_TCTX_KVM   "xive-tctx-kvm"
>> +#define XIVE_TCTX_KVM(obj)   OBJECT_CHECK(XiveTCTX, (obj), TYPE_XIVE_TCTX_KVM)
>> +
>>  #endif /* PPC_SPAPR_XIVE_H */
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 281ed370121c..7aaf5a182cb3 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -69,6 +69,9 @@ typedef struct XiveSource {
>>      uint32_t        esb_shift;
>>      MemoryRegion    esb_mmio;
>>  
>> +    /* KVM support */
>> +    void            *esb_mmap;
>> +
>>      XiveFabric      *xive;
>>  } XiveSource;
>>  
>> diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
>> index 8c876c166ef2..f34c971491dd 100644
>> --- a/linux-headers/asm-powerpc/kvm.h
>> +++ b/linux-headers/asm-powerpc/kvm.h
> 
> Updates to linux-headers need to be split out into a separate patch.
> Eventually (i.e. by the time we merge) they should be just "update
> headers to SHA XXX" not picking and choosing pieces.

ok. I am starting to separate the KVM definition from the patch now
that the interface is stabilizing. 

>> @@ -675,4 +675,16 @@ struct kvm_ppc_cpu_char {
>>  #define  KVM_XICS_PRESENTED		(1ULL << 43)
>>  #define  KVM_XICS_QUEUED		(1ULL << 44)
>>  
>> +/* POWER9 XIVE Native Interrupt Controller */
>> +#define KVM_DEV_XIVE_GRP_CTRL		1
>> +#define   KVM_DEV_XIVE_GET_ESB_FD	1
>> +#define   KVM_DEV_XIVE_GET_TIMA_FD	2
>> +#define   KVM_DEV_XIVE_VC_BASE		3
>> +#define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
>> +
>> +/* Layout of 64-bit XIVE source attribute values */
>> +#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
>> +#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
>> +
>> +
>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
>> index f11a7eb49cfa..59fa8d8d7f39 100644
>> --- a/linux-headers/linux/kvm.h
>> +++ b/linux-headers/linux/kvm.h
>> @@ -965,6 +965,8 @@ struct kvm_ppc_resize_hpt {
>>  #define KVM_CAP_COALESCED_PIO 162
>>  #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
>>  #define KVM_CAP_EXCEPTION_PAYLOAD 164
>> +#define KVM_CAP_ARM_VM_IPA_SIZE 165
>> +#define KVM_CAP_PPC_IRQ_XIVE 166
>>  
>>  #ifdef KVM_CAP_IRQ_ROUTING
>>  
>> @@ -1188,6 +1190,8 @@ enum kvm_device_type {
>>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
>>  	KVM_DEV_TYPE_ARM_VGIC_ITS,
>>  #define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
>> +	KVM_DEV_TYPE_XIVE,
>> +#define KVM_DEV_TYPE_XIVE		KVM_DEV_TYPE_XIVE
>>  	KVM_DEV_TYPE_MAX,
>>  };
>>  
>> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
>> index bdfaa4e70a83..d2159660f9f2 100644
>> --- a/target/ppc/kvm_ppc.h
>> +++ b/target/ppc/kvm_ppc.h
>> @@ -59,6 +59,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>>  bool kvmppc_has_cap_htm(void);
>>  bool kvmppc_has_cap_mmu_radix(void);
>>  bool kvmppc_has_cap_mmu_hash_v3(void);
>> +bool kvmppc_has_cap_xive(void);
>>  int kvmppc_get_cap_safe_cache(void);
>>  int kvmppc_get_cap_safe_bounds_check(void);
>>  int kvmppc_get_cap_safe_indirect_branch(void);
>> @@ -307,6 +308,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>>      return false;
>>  }
>>  
>> +static inline bool kvmppc_has_cap_xive(void)
>> +{
>> +    return false;
>> +}
>> +
>>  static inline int kvmppc_get_cap_safe_cache(void)
>>  {
>>      return 0;
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> new file mode 100644
>> index 000000000000..767f90826e43
>> --- /dev/null
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -0,0 +1,430 @@
>> +/*
>> + * QEMU PowerPC sPAPR XIVE interrupt controller model
>> + *
>> + * Copyright (c) 2017-2018, IBM Corporation.
>> + *
>> + * This code is licensed under the GPL version 2 or later. See the
>> + * COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "target/ppc/cpu.h"
>> +#include "sysemu/cpus.h"
>> +#include "sysemu/kvm.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_xive.h"
>> +#include "hw/ppc/xive.h"
>> +#include "kvm_ppc.h"
>> +
>> +#include <sys/ioctl.h>
>> +
>> +/*
>> + * Helpers for CPU hotplug
>> + */
>> +typedef struct KVMEnabledCPU {
>> +    unsigned long vcpu_id;
>> +    QLIST_ENTRY(KVMEnabledCPU) node;
>> +} KVMEnabledCPU;
>> +
>> +static QLIST_HEAD(, KVMEnabledCPU)
>> +    kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus);
>> +
>> +static bool kvm_cpu_is_enabled(CPUState *cs)
>> +{
>> +    KVMEnabledCPU *enabled_cpu;
>> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
>> +
>> +    QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) {
>> +        if (enabled_cpu->vcpu_id == vcpu_id) {
>> +            return true;
>> +        }
>> +    }
>> +    return false;
>> +}
>> +
>> +static void kvm_cpu_enable(CPUState *cs)
>> +{
>> +    KVMEnabledCPU *enabled_cpu;
>> +    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
>> +
>> +    enabled_cpu = g_malloc(sizeof(*enabled_cpu));
>> +    enabled_cpu->vcpu_id = vcpu_id;
>> +    QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
>> +}
> 
> Blech, I hope we can find a better way of tracking this than an ugly
> list.

yes ... We have one similar for XICS.

>> +
>> +/*
>> + * XIVE Thread Interrupt Management context (KVM)
>> + */
>> +
>> +static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
>> +{
>> +    sPAPRXive *xive;
>> +    unsigned long vcpu_id;
>> +    int ret;
>> +
>> +    /* Check if CPU was hot unplugged and replugged. */
>> +    if (kvm_cpu_is_enabled(tctx->cs)) {
>> +        return;
>> +    }
>> +
>> +    vcpu_id = kvm_arch_vcpu_id(tctx->cs);
>> +    xive = SPAPR_XIVE_KVM(tctx->xrtr);
> 
> Is this the first use of tctx->xrtr?

No, the second. the first is the reset_tctx() ops doing the CAM reset.
But we said that we could remove it.

> 
>> +    ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd,
>> +                              vcpu_id, 0);
>> +    if (ret < 0) {
>> +        error_setg(errp, "Unable to connect CPU%ld to KVM XIVE device: %s",
>> +                   vcpu_id, strerror(errno));
>> +        return;
>> +    }
>> +
>> +    kvm_cpu_enable(tctx->cs);
>> +}
>> +
>> +static void xive_tctx_kvm_realize(DeviceState *dev, Error **errp)
>> +{
>> +    XiveTCTX *tctx = XIVE_TCTX_KVM(dev);
>> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(dev);
>> +    Error *local_err = NULL;
>> +
>> +    xtc->parent_realize(dev, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    xive_tctx_kvm_init(tctx, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +}
>> +
>> +static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
>> +
>> +    dc->desc = "sPAPR XIVE KVM Interrupt Thread Context";
>> +
>> +    device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
>> +                                    &xtc->parent_realize);
>> +}
>> +
>> +static const TypeInfo xive_tctx_kvm_info = {
>> +    .name          = TYPE_XIVE_TCTX_KVM,
>> +    .parent        = TYPE_XIVE_TCTX_BASE,
>> +    .instance_size = sizeof(XiveTCTX),
>> +    .class_init    = xive_tctx_kvm_class_init,
>> +    .class_size    = sizeof(XiveTCTXClass),
>> +};
>> +
>> +/*
>> + * XIVE Interrupt Source (KVM)
>> + */
>> +
>> +static void xive_source_kvm_init(XiveSource *xsrc, Error **errp)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
>> +    int i;
>> +
>> +    /*
>> +     * At reset, interrupt sources are simply created and MASKED. We
>> +     * only need to inform the KVM device about their type: LSI or
>> +     * MSI.
>> +     */
>> +    for (i = 0; i < xsrc->nr_irqs; i++) {
>> +        Error *local_err = NULL;
>> +        uint64_t state = 0;
>> +
>> +        if (xive_source_irq_is_lsi(xsrc, i)) {
>> +            state |= KVM_XIVE_LEVEL_SENSITIVE;
>> +            if (xsrc->status[i] & XIVE_STATUS_ASSERTED) {
>> +                state |= KVM_XIVE_LEVEL_ASSERTED;
>> +            }
>> +        }
>> +
>> +        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCES, i, &state,
>> +                          true, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +}
>> +
>> +static void xive_source_kvm_reset(DeviceState *dev)
>> +{
>> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
>> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
>> +
>> +    xsc->parent_reset(dev);
>> +
>> +    xive_source_kvm_init(xsrc, &error_fatal);
>> +}
>> +
>> +static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
>> +{
>> +    XiveSource *xsrc = opaque;
>> +    struct kvm_irq_level args;
>> +    int rc;
>> +
>> +    args.irq = srcno;
>> +    if (!xive_source_irq_is_lsi(xsrc, srcno)) {
>> +        if (!val) {
>> +            return;
>> +        }
>> +        args.level = KVM_INTERRUPT_SET;
>> +    } else {
>> +        if (val) {
>> +            xsrc->status[srcno] |= XIVE_STATUS_ASSERTED;
>> +            args.level = KVM_INTERRUPT_SET_LEVEL;
>> +        } else {
>> +            xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED;
>> +            args.level = KVM_INTERRUPT_UNSET;
>> +        }
>> +    }
>> +    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
>> +    if (rc < 0) {
>> +        error_report("kvm_irq_line() failed : %s", strerror(errno));
>> +    }
>> +}
>> +
>> +static void *spapr_xive_kvm_mmap(sPAPRXive *xive, int ctrl, size_t len,
>> +                                 Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +    void *addr;
>> +    int fd;
>> +
>> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, ctrl, &fd, false,
>> +                      &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return NULL;
>> +    }
>> +
>> +    addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
>> +    close(fd);
>> +    if (addr == MAP_FAILED) {
>> +        error_setg_errno(errp, errno, "Unable to set XIVE mmaping");
>> +        return NULL;
>> +    }
>> +
>> +    return addr;
>> +}
>> +
>> +/*
>> + * The sPAPRXive KVM model should have initialized the KVM device
>> + * before initializing the source
>> + */
>> +static void xive_source_kvm_mmap(XiveSource *xsrc, Error **errp)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
>> +    Error *local_err = NULL;
>> +    size_t esb_len;
>> +
>> +    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
>> +    xsrc->esb_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_ESB_FD,
>> +                                         esb_len, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc),
>> +                                      "xive.esb", esb_len, xsrc->esb_mmap);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(xsrc), &xsrc->esb_mmio);
>> +}
>> +
>> +static void xive_source_kvm_realize(DeviceState *dev, Error **errp)
>> +{
>> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
>> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
>> +    Error *local_err = NULL;
>> +
>> +    xsc->parent_realize(dev, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    xsrc->qirqs = qemu_allocate_irqs(xive_source_kvm_set_irq, xsrc,
>> +                                     xsrc->nr_irqs);
>> +
>> +    xive_source_kvm_mmap(xsrc, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +}
>> +
>> +static void xive_source_kvm_unrealize(DeviceState *dev, Error **errp)
>> +{
>> +    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
>> +    size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
>> +
>> +    munmap(xsrc->esb_mmap, esb_len);
>> +}
>> +
>> +static void xive_source_kvm_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    XiveSourceClass *xsc = XIVE_SOURCE_BASE_CLASS(klass);
>> +
>> +    device_class_set_parent_realize(dc, xive_source_kvm_realize,
>> +                                    &xsc->parent_realize);
>> +    device_class_set_parent_reset(dc, xive_source_kvm_reset,
>> +                                  &xsc->parent_reset);
>> +
>> +    dc->desc = "sPAPR XIVE KVM Interrupt Source";
>> +    dc->unrealize = xive_source_kvm_unrealize;
>> +}
>> +
>> +static const TypeInfo xive_source_kvm_info = {
>> +    .name = TYPE_XIVE_SOURCE_KVM,
>> +    .parent = TYPE_XIVE_SOURCE_BASE,
>> +    .instance_size = sizeof(XiveSource),
>> +    .class_init    = xive_source_kvm_class_init,
>> +    .class_size    = sizeof(XiveSourceClass),
>> +};
>> +
>> +/*
>> + * sPAPR XIVE Router (KVM)
>> + */
>> +
>> +static void spapr_xive_kvm_instance_init(Object *obj)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_KVM(obj);
>> +
>> +    xive->fd = -1;
>> +
>> +    /* We need a KVM flavored source */
>> +    object_initialize(&xive->source, sizeof(xive->source),
>> +                      TYPE_XIVE_SOURCE_KVM);
>> +    object_property_add_child(obj, "source", OBJECT(&xive->source), NULL);
>> +
>> +    /* No KVM support for END ESBs. OPAL doesn't either */
>> +    object_initialize(&xive->end_source, sizeof(xive->end_source),
>> +                      TYPE_XIVE_END_SOURCE);
>> +    object_property_add_child(obj, "end_source", OBJECT(&xive->end_source),
>> +                              NULL);
>> +}
>> +
>> +static void spapr_xive_kvm_init(sPAPRXive *xive, Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +    size_t tima_len;
>> +
>> +    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>> +        error_setg(errp,
>> +                   "IRQ_XIVE capability must be present for KVM XIVE device");
>> +        return;
>> +    }
>> +
>> +    /* First, create the KVM XIVE device */
>> +    xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false);
>> +    if (xive->fd < 0) {
>> +        error_setg_errno(errp, -xive->fd, "error creating KVM XIVE device");
>> +        return;
>> +    }
>> +
>> +    /* Source ESBs KVM mapping
>> +     *
>> +     * Inform KVM where we will map the ESB pages. This is needed by
>> +     * the H_INT_GET_SOURCE_INFO hcall which returns the source
>> +     * characteristics, among which the ESB page address.
>> +     */
>> +    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_VC_BASE,
>> +                      &xive->vc_base, true, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    /* Let the XiveSource KVM model handle the mapping for the moment */
>> +
>> +    /* TIMA KVM mapping
>> +     *
>> +     * We could also inform KVM where the TIMA will be mapped but as
>> +     * this is a fixed MMIO address for the system it does not seem
>> +     * necessary to provide a KVM ioctl to change it.
>> +     */
>> +    tima_len = 4ull << TM_SHIFT;
>> +    xive->tm_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_TIMA_FD,
>> +                                        tima_len, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +    memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive),
>> +                                      "xive.tima", tima_len, xive->tm_mmap);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
>> +
>> +    kvm_kernel_irqchip = true;
>> +    kvm_msi_via_irqfd_allowed = true;
>> +    kvm_gsi_direct_mapping = true;
>> +}
>> +
>> +static void spapr_xive_kvm_realize(DeviceState *dev, Error **errp)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
>> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(dev);
>> +    Error *local_err = NULL;
>> +
>> +    spapr_xive_kvm_init(xive, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    /* Initialize the source and the local routing tables */
>> +    sxc->parent_realize(dev, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +}
>> +
>> +static void spapr_xive_kvm_unrealize(DeviceState *dev, Error **errp)
>> +{
>> +    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
>> +
>> +    close(xive->fd);
>> +    xive->fd = -1;
>> +
>> +    munmap(xive->tm_mmap, 4ull << TM_SHIFT);
>> +}
>> +
>> +static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_CLASS(klass);
>> +
>> +    device_class_set_parent_realize(dc, spapr_xive_kvm_realize,
>> +                                    &sxc->parent_realize);
>> +
>> +    dc->desc = "sPAPR XIVE KVM Interrupt Controller";
>> +    dc->unrealize = spapr_xive_kvm_unrealize;
>> +}
>> +
>> +static const TypeInfo spapr_xive_kvm_info = {
>> +    .name = TYPE_SPAPR_XIVE_KVM,
>> +    .parent = TYPE_SPAPR_XIVE_BASE,
>> +    .instance_init = spapr_xive_kvm_instance_init,
>> +    .instance_size = sizeof(sPAPRXive),
>> +    .class_init = spapr_xive_kvm_class_init,
>> +    .class_size = sizeof(sPAPRXiveClass),
>> +};
>> +
>> +static void xive_kvm_register_types(void)
>> +{
>> +    type_register_static(&spapr_xive_kvm_info);
>> +    type_register_static(&xive_source_kvm_info);
>> +    type_register_static(&xive_tctx_kvm_info);
>> +}
>> +
>> +type_init(xive_kvm_register_types)
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index f9cf2debff5a..d1be2579cd9b 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -1125,8 +1125,11 @@ static void spapr_dt_ov5_platform_support(sPAPRMachineState *spapr, void *fdt,
>>          } else {
>>              val[3] = 0x00; /* Hash */
>>          }
>> -        /* TODO: test KVM support */
>> -        val[1] = smc->irq->ov5;
>> +        if (kvmppc_has_cap_xive()) {
>> +            val[1] = smc->irq->ov5;
>> +        } else {
>> +            val[1] = 0x00;
>> +        }
>>      } else {
>>          val[1] = smc->irq->ov5;
>>  
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index 33dd5da7d255..92ef53743b64 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -273,9 +273,22 @@ static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
>>      Error *local_err = NULL;
>>  
>>      /* KVM XIVE support */
>> -    if (kvm_enabled()) {
>> -        if (machine_kernel_irqchip_required(machine)) {
>> -            error_setg(errp, "kernel_irqchip requested. no XIVE support");
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        spapr->xive_tctx_type = TYPE_XIVE_TCTX_KVM;
>> +        spapr->xive = spapr_xive_create(spapr, TYPE_SPAPR_XIVE_KVM, nr_irqs,
>> +                                        nr_servers, &local_err);
>> +
>> +        if (local_err && machine_kernel_irqchip_required(machine)) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "kernel_irqchip requested but init failed : ");
>> +            return;
>> +        }
>> +
>> +        /*
>> +         * XIVE support is activated under KVM. No need to initialize
>> +         * the fallback mode under QEMU
>> +         */
>> +        if (spapr->xive) {
>>              return;
>>          }
>>      }
>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
>> index f81327d6cd47..3b7cf106242b 100644
>> --- a/target/ppc/kvm.c
>> +++ b/target/ppc/kvm.c
>> @@ -86,6 +86,7 @@ static int cap_fixup_hcalls;
>>  static int cap_htm;             /* Hardware transactional memory support */
>>  static int cap_mmu_radix;
>>  static int cap_mmu_hash_v3;
>> +static int cap_xive;
>>  static int cap_resize_hpt;
>>  static int cap_ppc_pvr_compat;
>>  static int cap_ppc_safe_cache;
>> @@ -149,6 +150,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>>      cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
>>      cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
>>      cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
>> +    cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE);
>>      cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
>>      kvmppc_get_cpu_characteristics(s);
>>      cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
>> @@ -2385,6 +2387,11 @@ static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
>>      return 0;
>>  }
>>  
>> +bool kvmppc_has_cap_xive(void)
>> +{
>> +    return cap_xive;
>> +}
>> +
>>  static void kvmppc_get_cpu_characteristics(KVMState *s)
>>  {
>>      struct kvm_ppc_cpu_char c;
>> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
>> index eacd26836ebf..dd4d69db2bdd 100644
>> --- a/hw/intc/Makefile.objs
>> +++ b/hw/intc/Makefile.objs
>> @@ -39,6 +39,7 @@ obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>>  obj-$(CONFIG_XIVE) += xive.o
>>  obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
>> +obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o
>>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
>
David Gibson Nov. 29, 2018, 3:33 a.m. UTC | #3
On Wed, Nov 28, 2018 at 11:45:46PM +0100, Cédric Le Goater wrote:
> On 11/28/18 6:52 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:57:15AM +0100, Cédric Le Goater wrote:
> >> This introduces a set of XIVE models specific to KVM which derive from
> >> the XIVE base models. The interfaces with KVM are a new capability and
> >> a new KVM device for the XIVE native exploitation interrupt mode.
> >>
> >> They handle the initialization of the TIMA and the source ESB memory
> >> regions which have a different type under KVM. These are 'ram device'
> >> memory mappings, similarly to VFIO, exposed to the guest and the
> >> associated VMAs on the host are populated dynamically with the
> >> appropriate pages using a fault handler.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > 
> > The logic here looks fine, but I think it would be better to activate
> > it with explicit if (kvm) type logic rather than using a subclass.
> 
> ok. ARM has taken a different path, the one proposed below, but it should 
> be possible to use a "if (kvm)" type logic. There should be less noise 
> in the object design.

Yeah, it seemed like a good path when I wrote the XICS code, but
experience with that has led me to decide it wasn't a good idea.  I'm
not sure if the ARM people copied that or came up with it on their
own.

[snip]
> >> +/*
> >> + * XIVE Thread Interrupt Management context (KVM)
> >> + */
> >> +
> >> +static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
> >> +{
> >> +    sPAPRXive *xive;
> >> +    unsigned long vcpu_id;
> >> +    int ret;
> >> +
> >> +    /* Check if CPU was hot unplugged and replugged. */
> >> +    if (kvm_cpu_is_enabled(tctx->cs)) {
> >> +        return;
> >> +    }
> >> +
> >> +    vcpu_id = kvm_arch_vcpu_id(tctx->cs);
> >> +    xive = SPAPR_XIVE_KVM(tctx->xrtr);
> > 
> > Is this the first use of tctx->xrtr?
> 
> No, the second. the first is the reset_tctx() ops doing the CAM reset.
> But we said that we could remove it.

And I think we can remove it here too.  We know this is PAPR specific
so we can go qdev_get_machine() -> PAPR xive object.

I normally don't like using qdev_get_machine(), but I think it's a
little less ugly than including this backlink (and it is sometimes
necessary to deal with the different abstraction boundaries between
qemu and KVM).
diff mbox series

Patch

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 7f34ad0528ed..c1bf5cd951f5 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -18,6 +18,7 @@  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
 CONFIG_XIVE=$(CONFIG_PSERIES)
 CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
+CONFIG_XIVE_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
 CONFIG_MEM_DEVICE=y
 CONFIG_DIMM=y
 CONFIG_SPAPR_RNG=y
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index aca2969a09ab..9c817bb7ae74 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -40,6 +40,10 @@  typedef struct sPAPRXive {
     /* TIMA mapping address */
     hwaddr        tm_base;
     MemoryRegion  tm_mmio;
+
+    /* KVM support */
+    int           fd;
+    void          *tm_mmap;
 } sPAPRXive;
 
 #define SPAPR_XIVE_BASE_CLASS(klass) \
@@ -83,4 +87,18 @@  void spapr_xive_hcall_init(sPAPRMachineState *spapr);
 void spapr_dt_xive(sPAPRXive *xive, int nr_servers, void *fdt,
                    uint32_t phandle);
 
+/*
+ * XIVE KVM models
+ */
+
+#define TYPE_SPAPR_XIVE_KVM  "spapr-xive-kvm"
+#define SPAPR_XIVE_KVM(obj)  OBJECT_CHECK(sPAPRXive, (obj), TYPE_SPAPR_XIVE_KVM)
+
+#define TYPE_XIVE_SOURCE_KVM "xive-source-kvm"
+#define XIVE_SOURCE_KVM(obj) \
+    OBJECT_CHECK(XiveSource, (obj), TYPE_XIVE_SOURCE_KVM)
+
+#define TYPE_XIVE_TCTX_KVM   "xive-tctx-kvm"
+#define XIVE_TCTX_KVM(obj)   OBJECT_CHECK(XiveTCTX, (obj), TYPE_XIVE_TCTX_KVM)
+
 #endif /* PPC_SPAPR_XIVE_H */
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 281ed370121c..7aaf5a182cb3 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -69,6 +69,9 @@  typedef struct XiveSource {
     uint32_t        esb_shift;
     MemoryRegion    esb_mmio;
 
+    /* KVM support */
+    void            *esb_mmap;
+
     XiveFabric      *xive;
 } XiveSource;
 
diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h
index 8c876c166ef2..f34c971491dd 100644
--- a/linux-headers/asm-powerpc/kvm.h
+++ b/linux-headers/asm-powerpc/kvm.h
@@ -675,4 +675,16 @@  struct kvm_ppc_cpu_char {
 #define  KVM_XICS_PRESENTED		(1ULL << 43)
 #define  KVM_XICS_QUEUED		(1ULL << 44)
 
+/* POWER9 XIVE Native Interrupt Controller */
+#define KVM_DEV_XIVE_GRP_CTRL		1
+#define   KVM_DEV_XIVE_GET_ESB_FD	1
+#define   KVM_DEV_XIVE_GET_TIMA_FD	2
+#define   KVM_DEV_XIVE_VC_BASE		3
+#define KVM_DEV_XIVE_GRP_SOURCES	2	/* 64-bit source attributes */
+
+/* Layout of 64-bit XIVE source attribute values */
+#define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
+#define KVM_XIVE_LEVEL_ASSERTED		(1ULL << 1)
+
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index f11a7eb49cfa..59fa8d8d7f39 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -965,6 +965,8 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_COALESCED_PIO 162
 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
 #define KVM_CAP_EXCEPTION_PAYLOAD 164
+#define KVM_CAP_ARM_VM_IPA_SIZE 165
+#define KVM_CAP_PPC_IRQ_XIVE 166
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1188,6 +1190,8 @@  enum kvm_device_type {
 #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
 	KVM_DEV_TYPE_ARM_VGIC_ITS,
 #define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
+	KVM_DEV_TYPE_XIVE,
+#define KVM_DEV_TYPE_XIVE		KVM_DEV_TYPE_XIVE
 	KVM_DEV_TYPE_MAX,
 };
 
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index bdfaa4e70a83..d2159660f9f2 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -59,6 +59,7 @@  bool kvmppc_has_cap_fixup_hcalls(void);
 bool kvmppc_has_cap_htm(void);
 bool kvmppc_has_cap_mmu_radix(void);
 bool kvmppc_has_cap_mmu_hash_v3(void);
+bool kvmppc_has_cap_xive(void);
 int kvmppc_get_cap_safe_cache(void);
 int kvmppc_get_cap_safe_bounds_check(void);
 int kvmppc_get_cap_safe_indirect_branch(void);
@@ -307,6 +308,11 @@  static inline bool kvmppc_has_cap_mmu_hash_v3(void)
     return false;
 }
 
+static inline bool kvmppc_has_cap_xive(void)
+{
+    return false;
+}
+
 static inline int kvmppc_get_cap_safe_cache(void)
 {
     return 0;
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
new file mode 100644
index 000000000000..767f90826e43
--- /dev/null
+++ b/hw/intc/spapr_xive_kvm.c
@@ -0,0 +1,430 @@ 
+/*
+ * QEMU PowerPC sPAPR XIVE interrupt controller model
+ *
+ * Copyright (c) 2017-2018, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "target/ppc/cpu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/kvm.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_xive.h"
+#include "hw/ppc/xive.h"
+#include "kvm_ppc.h"
+
+#include <sys/ioctl.h>
+
+/*
+ * Helpers for CPU hotplug
+ */
+typedef struct KVMEnabledCPU {
+    unsigned long vcpu_id;
+    QLIST_ENTRY(KVMEnabledCPU) node;
+} KVMEnabledCPU;
+
+static QLIST_HEAD(, KVMEnabledCPU)
+    kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus);
+
+static bool kvm_cpu_is_enabled(CPUState *cs)
+{
+    KVMEnabledCPU *enabled_cpu;
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+
+    QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) {
+        if (enabled_cpu->vcpu_id == vcpu_id) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static void kvm_cpu_enable(CPUState *cs)
+{
+    KVMEnabledCPU *enabled_cpu;
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cs);
+
+    enabled_cpu = g_malloc(sizeof(*enabled_cpu));
+    enabled_cpu->vcpu_id = vcpu_id;
+    QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node);
+}
+
+/*
+ * XIVE Thread Interrupt Management context (KVM)
+ */
+
+static void xive_tctx_kvm_init(XiveTCTX *tctx, Error **errp)
+{
+    sPAPRXive *xive;
+    unsigned long vcpu_id;
+    int ret;
+
+    /* Check if CPU was hot unplugged and replugged. */
+    if (kvm_cpu_is_enabled(tctx->cs)) {
+        return;
+    }
+
+    vcpu_id = kvm_arch_vcpu_id(tctx->cs);
+    xive = SPAPR_XIVE_KVM(tctx->xrtr);
+
+    ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd,
+                              vcpu_id, 0);
+    if (ret < 0) {
+        error_setg(errp, "Unable to connect CPU%ld to KVM XIVE device: %s",
+                   vcpu_id, strerror(errno));
+        return;
+    }
+
+    kvm_cpu_enable(tctx->cs);
+}
+
+static void xive_tctx_kvm_realize(DeviceState *dev, Error **errp)
+{
+    XiveTCTX *tctx = XIVE_TCTX_KVM(dev);
+    XiveTCTXClass *xtc = XIVE_TCTX_BASE_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    xtc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    xive_tctx_kvm_init(tctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void xive_tctx_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    XiveTCTXClass *xtc = XIVE_TCTX_BASE_CLASS(klass);
+
+    dc->desc = "sPAPR XIVE KVM Interrupt Thread Context";
+
+    device_class_set_parent_realize(dc, xive_tctx_kvm_realize,
+                                    &xtc->parent_realize);
+}
+
+static const TypeInfo xive_tctx_kvm_info = {
+    .name          = TYPE_XIVE_TCTX_KVM,
+    .parent        = TYPE_XIVE_TCTX_BASE,
+    .instance_size = sizeof(XiveTCTX),
+    .class_init    = xive_tctx_kvm_class_init,
+    .class_size    = sizeof(XiveTCTXClass),
+};
+
+/*
+ * XIVE Interrupt Source (KVM)
+ */
+
+static void xive_source_kvm_init(XiveSource *xsrc, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
+    int i;
+
+    /*
+     * At reset, interrupt sources are simply created and MASKED. We
+     * only need to inform the KVM device about their type: LSI or
+     * MSI.
+     */
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        Error *local_err = NULL;
+        uint64_t state = 0;
+
+        if (xive_source_irq_is_lsi(xsrc, i)) {
+            state |= KVM_XIVE_LEVEL_SENSITIVE;
+            if (xsrc->status[i] & XIVE_STATUS_ASSERTED) {
+                state |= KVM_XIVE_LEVEL_ASSERTED;
+            }
+        }
+
+        kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCES, i, &state,
+                          true, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+static void xive_source_kvm_reset(DeviceState *dev)
+{
+    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
+    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
+
+    xsc->parent_reset(dev);
+
+    xive_source_kvm_init(xsrc, &error_fatal);
+}
+
+static void xive_source_kvm_set_irq(void *opaque, int srcno, int val)
+{
+    XiveSource *xsrc = opaque;
+    struct kvm_irq_level args;
+    int rc;
+
+    args.irq = srcno;
+    if (!xive_source_irq_is_lsi(xsrc, srcno)) {
+        if (!val) {
+            return;
+        }
+        args.level = KVM_INTERRUPT_SET;
+    } else {
+        if (val) {
+            xsrc->status[srcno] |= XIVE_STATUS_ASSERTED;
+            args.level = KVM_INTERRUPT_SET_LEVEL;
+        } else {
+            xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED;
+            args.level = KVM_INTERRUPT_UNSET;
+        }
+    }
+    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
+    if (rc < 0) {
+        error_report("kvm_irq_line() failed : %s", strerror(errno));
+    }
+}
+
+static void *spapr_xive_kvm_mmap(sPAPRXive *xive, int ctrl, size_t len,
+                                 Error **errp)
+{
+    Error *local_err = NULL;
+    void *addr;
+    int fd;
+
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, ctrl, &fd, false,
+                      &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return NULL;
+    }
+
+    addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+    close(fd);
+    if (addr == MAP_FAILED) {
+        error_setg_errno(errp, errno, "Unable to set XIVE mmaping");
+        return NULL;
+    }
+
+    return addr;
+}
+
+/*
+ * The sPAPRXive KVM model should have initialized the KVM device
+ * before initializing the source
+ */
+static void xive_source_kvm_mmap(XiveSource *xsrc, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_XIVE_KVM(xsrc->xive);
+    Error *local_err = NULL;
+    size_t esb_len;
+
+    esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
+    xsrc->esb_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_ESB_FD,
+                                         esb_len, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc),
+                                      "xive.esb", esb_len, xsrc->esb_mmap);
+    sysbus_init_mmio(SYS_BUS_DEVICE(xsrc), &xsrc->esb_mmio);
+}
+
+static void xive_source_kvm_realize(DeviceState *dev, Error **errp)
+{
+    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
+    XiveSourceClass *xsc = XIVE_SOURCE_BASE_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    xsc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    xsrc->qirqs = qemu_allocate_irqs(xive_source_kvm_set_irq, xsrc,
+                                     xsrc->nr_irqs);
+
+    xive_source_kvm_mmap(xsrc, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void xive_source_kvm_unrealize(DeviceState *dev, Error **errp)
+{
+    XiveSource *xsrc = XIVE_SOURCE_KVM(dev);
+    size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
+
+    munmap(xsrc->esb_mmap, esb_len);
+}
+
+static void xive_source_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    XiveSourceClass *xsc = XIVE_SOURCE_BASE_CLASS(klass);
+
+    device_class_set_parent_realize(dc, xive_source_kvm_realize,
+                                    &xsc->parent_realize);
+    device_class_set_parent_reset(dc, xive_source_kvm_reset,
+                                  &xsc->parent_reset);
+
+    dc->desc = "sPAPR XIVE KVM Interrupt Source";
+    dc->unrealize = xive_source_kvm_unrealize;
+}
+
+static const TypeInfo xive_source_kvm_info = {
+    .name = TYPE_XIVE_SOURCE_KVM,
+    .parent = TYPE_XIVE_SOURCE_BASE,
+    .instance_size = sizeof(XiveSource),
+    .class_init    = xive_source_kvm_class_init,
+    .class_size    = sizeof(XiveSourceClass),
+};
+
+/*
+ * sPAPR XIVE Router (KVM)
+ */
+
+static void spapr_xive_kvm_instance_init(Object *obj)
+{
+    sPAPRXive *xive = SPAPR_XIVE_KVM(obj);
+
+    xive->fd = -1;
+
+    /* We need a KVM flavored source */
+    object_initialize(&xive->source, sizeof(xive->source),
+                      TYPE_XIVE_SOURCE_KVM);
+    object_property_add_child(obj, "source", OBJECT(&xive->source), NULL);
+
+    /* No KVM support for END ESBs. OPAL doesn't either */
+    object_initialize(&xive->end_source, sizeof(xive->end_source),
+                      TYPE_XIVE_END_SOURCE);
+    object_property_add_child(obj, "end_source", OBJECT(&xive->end_source),
+                              NULL);
+}
+
+static void spapr_xive_kvm_init(sPAPRXive *xive, Error **errp)
+{
+    Error *local_err = NULL;
+    size_t tima_len;
+
+    if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
+        error_setg(errp,
+                   "IRQ_XIVE capability must be present for KVM XIVE device");
+        return;
+    }
+
+    /* First, create the KVM XIVE device */
+    xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false);
+    if (xive->fd < 0) {
+        error_setg_errno(errp, -xive->fd, "error creating KVM XIVE device");
+        return;
+    }
+
+    /* Source ESBs KVM mapping
+     *
+     * Inform KVM where we will map the ESB pages. This is needed by
+     * the H_INT_GET_SOURCE_INFO hcall which returns the source
+     * characteristics, among which the ESB page address.
+     */
+    kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_VC_BASE,
+                      &xive->vc_base, true, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Let the XiveSource KVM model handle the mapping for the moment */
+
+    /* TIMA KVM mapping
+     *
+     * We could also inform KVM where the TIMA will be mapped but as
+     * this is a fixed MMIO address for the system it does not seem
+     * necessary to provide a KVM ioctl to change it.
+     */
+    tima_len = 4ull << TM_SHIFT;
+    xive->tm_mmap = spapr_xive_kvm_mmap(xive, KVM_DEV_XIVE_GET_TIMA_FD,
+                                        tima_len, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive),
+                                      "xive.tima", tima_len, xive->tm_mmap);
+    sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio);
+
+    kvm_kernel_irqchip = true;
+    kvm_msi_via_irqfd_allowed = true;
+    kvm_gsi_direct_mapping = true;
+}
+
+static void spapr_xive_kvm_realize(DeviceState *dev, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
+    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    spapr_xive_kvm_init(xive, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Initialize the source and the local routing tables */
+    sxc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void spapr_xive_kvm_unrealize(DeviceState *dev, Error **errp)
+{
+    sPAPRXive *xive = SPAPR_XIVE_KVM(dev);
+
+    close(xive->fd);
+    xive->fd = -1;
+
+    munmap(xive->tm_mmap, 4ull << TM_SHIFT);
+}
+
+static void spapr_xive_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    sPAPRXiveClass *sxc = SPAPR_XIVE_BASE_CLASS(klass);
+
+    device_class_set_parent_realize(dc, spapr_xive_kvm_realize,
+                                    &sxc->parent_realize);
+
+    dc->desc = "sPAPR XIVE KVM Interrupt Controller";
+    dc->unrealize = spapr_xive_kvm_unrealize;
+}
+
+static const TypeInfo spapr_xive_kvm_info = {
+    .name = TYPE_SPAPR_XIVE_KVM,
+    .parent = TYPE_SPAPR_XIVE_BASE,
+    .instance_init = spapr_xive_kvm_instance_init,
+    .instance_size = sizeof(sPAPRXive),
+    .class_init = spapr_xive_kvm_class_init,
+    .class_size = sizeof(sPAPRXiveClass),
+};
+
+static void xive_kvm_register_types(void)
+{
+    type_register_static(&spapr_xive_kvm_info);
+    type_register_static(&xive_source_kvm_info);
+    type_register_static(&xive_tctx_kvm_info);
+}
+
+type_init(xive_kvm_register_types)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f9cf2debff5a..d1be2579cd9b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1125,8 +1125,11 @@  static void spapr_dt_ov5_platform_support(sPAPRMachineState *spapr, void *fdt,
         } else {
             val[3] = 0x00; /* Hash */
         }
-        /* TODO: test KVM support */
-        val[1] = smc->irq->ov5;
+        if (kvmppc_has_cap_xive()) {
+            val[1] = smc->irq->ov5;
+        } else {
+            val[1] = 0x00;
+        }
     } else {
         val[1] = smc->irq->ov5;
 
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 33dd5da7d255..92ef53743b64 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -273,9 +273,22 @@  static void spapr_irq_init_xive(sPAPRMachineState *spapr, int nr_irqs,
     Error *local_err = NULL;
 
     /* KVM XIVE support */
-    if (kvm_enabled()) {
-        if (machine_kernel_irqchip_required(machine)) {
-            error_setg(errp, "kernel_irqchip requested. no XIVE support");
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        spapr->xive_tctx_type = TYPE_XIVE_TCTX_KVM;
+        spapr->xive = spapr_xive_create(spapr, TYPE_SPAPR_XIVE_KVM, nr_irqs,
+                                        nr_servers, &local_err);
+
+        if (local_err && machine_kernel_irqchip_required(machine)) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "kernel_irqchip requested but init failed : ");
+            return;
+        }
+
+        /*
+         * XIVE support is activated under KVM. No need to initialize
+         * the fallback mode under QEMU
+         */
+        if (spapr->xive) {
             return;
         }
     }
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index f81327d6cd47..3b7cf106242b 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -86,6 +86,7 @@  static int cap_fixup_hcalls;
 static int cap_htm;             /* Hardware transactional memory support */
 static int cap_mmu_radix;
 static int cap_mmu_hash_v3;
+static int cap_xive;
 static int cap_resize_hpt;
 static int cap_ppc_pvr_compat;
 static int cap_ppc_safe_cache;
@@ -149,6 +150,7 @@  int kvm_arch_init(MachineState *ms, KVMState *s)
     cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
     cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
     cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
+    cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE);
     cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
     kvmppc_get_cpu_characteristics(s);
     cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
@@ -2385,6 +2387,11 @@  static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
     return 0;
 }
 
+bool kvmppc_has_cap_xive(void)
+{
+    return cap_xive;
+}
+
 static void kvmppc_get_cpu_characteristics(KVMState *s)
 {
     struct kvm_ppc_cpu_char c;
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index eacd26836ebf..dd4d69db2bdd 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -39,6 +39,7 @@  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
 obj-$(CONFIG_XICS_KVM) += xics_kvm.o
 obj-$(CONFIG_XIVE) += xive.o
 obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o spapr_xive_hcall.o
+obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o
 obj-$(CONFIG_POWERNV) += xics_pnv.o
 obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
 obj-$(CONFIG_S390_FLIC) += s390_flic.o