
Added iopmem device emulation

Message ID 1476827711-20758-1-git-send-email-logang@deltatee.com (mailing list archive)
State New, archived

Commit Message

Logan Gunthorpe Oct. 18, 2016, 9:55 p.m. UTC
An iopmem device is one which exposes volatile or non-volatile memory mapped
directly to a BAR region. One purpose is to provide buffers for peer-to-peer
transfers on the bus. As such, this device uses QEMU's drive backing store to
simulate non-volatile memory and provides it through a mapped BAR window.

This patch creates an emulated device which helps to test and debug the
kernel driver for iopmem while hardware availability is poor. A kernel
patch for a driver is being prepared simultaneously.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Stephen Bates <sbates@raithlin.com>
---
 default-configs/pci.mak  |   1 +
 hw/block/Makefile.objs   |   1 +
 hw/block/iopmem.c        | 141 +++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci/pci_ids.h |   2 +
 4 files changed, 145 insertions(+)
 create mode 100644 hw/block/iopmem.c

Comments

Stefan Hajnoczi Nov. 4, 2016, 10:49 a.m. UTC | #1
On Tue, Oct 18, 2016 at 03:55:11PM -0600, Logan Gunthorpe wrote:
> An iopmem device is one which exposes volatile or non-volatile memory mapped
> directly to a BAR region. One purpose is to provide buffers to do peer
> to peer transfers on the bus. As such this device uses QEMU's drive
> backing store to simulate non-volatile memory and provides through a
> mapped BAR window.
> 
> This patch creates an emulated device which helps to test and debug the
> kernel driver for iopmem while hardware availability is poor. A kernel
> patch for a driver is being prepared simultaneously.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> Signed-off-by: Stephen Bates <sbates@raithlin.com>

QEMU already has NVDIMM support (https://pmem.io/).  It can be used both
for passthrough and fake non-volatile memory:

  qemu-system-x86_64 \
    -M pc,nvdimm=on \
    -m 1024,maxmem=$((4096 * 1024 * 1024)),slots=2 \
    -object memory-backend-file,id=mem0,mem-path=/tmp/foo,size=$((64 * 1024 * 1024)) \
    -device nvdimm,memdev=mem0

Please explain where iopmem comes from, where the hardware spec is, etc?

Perhaps you could use nvdimm instead of adding a new device?

> ---
>  default-configs/pci.mak  |   1 +
>  hw/block/Makefile.objs   |   1 +
>  hw/block/iopmem.c        | 141 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/pci/pci_ids.h |   2 +
>  4 files changed, 145 insertions(+)
>  create mode 100644 hw/block/iopmem.c
> 
> diff --git a/default-configs/pci.mak b/default-configs/pci.mak
> index fff7ce3..aef2b35 100644
> --- a/default-configs/pci.mak
> +++ b/default-configs/pci.mak
> @@ -39,3 +39,4 @@ CONFIG_VGA=y
>  CONFIG_VGA_PCI=y
>  CONFIG_IVSHMEM=$(CONFIG_EVENTFD)
>  CONFIG_ROCKER=y
> +CONFIG_IOPMEM_PCI=y
> diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
> index d4c3ab7..1c8044b 100644
> --- a/hw/block/Makefile.objs
> +++ b/hw/block/Makefile.objs
> @@ -8,6 +8,7 @@ common-obj-$(CONFIG_XEN_BACKEND) += xen_disk.o
>  common-obj-$(CONFIG_ECC) += ecc.o
>  common-obj-$(CONFIG_ONENAND) += onenand.o
>  common-obj-$(CONFIG_NVME_PCI) += nvme.o
> +common-obj-$(CONFIG_IOPMEM_PCI) += iopmem.o
>  
>  obj-$(CONFIG_SH4) += tc58128.o
>  
> diff --git a/hw/block/iopmem.c b/hw/block/iopmem.c
> new file mode 100644
> index 0000000..9c2d716
> --- /dev/null
> +++ b/hw/block/iopmem.c
> @@ -0,0 +1,141 @@
> +/*
> + * QEMU iopmem Controller
> + *
> + * Copyright (c) 2016, Microsemi Corporation
> + *
> + * Written by Logan Gunthorpe <logang@deltatee.com>
> + *
> + * This code is licensed under the GNU GPL v2 or later.
> + */
> +
> +
> +/**
> + * Usage: add options:
> + *      -drive file=<file>,if=none,id=<drive_id>
> + *      -device iopmem,drive=<drive_id>,id=<id[optional]>
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/pci/pci.h"
> +#include "sysemu/block-backend.h"
> +
> +typedef struct IoPmemCtrl {
> +    PCIDevice    parent_obj;
> +    MemoryRegion iomem;
> +    BlockBackend *blk;
> +    uint64_t     size;
> +} IoPmemCtrl;
> +
> +#define TYPE_IOPMEM "iopmem"
> +#define IOPMEM(obj) \
> +        OBJECT_CHECK(IoPmemCtrl, (obj), TYPE_IOPMEM)
> +
> +static uint64_t iopmem_bar_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    IoPmemCtrl *ipm = (IoPmemCtrl *)opaque;
> +    uint64_t val;
> +
> +    blk_pread(ipm->blk, addr, &val, size);
> +
> +    return val;
> +}
> +
> +static void iopmem_bar_write(void *opaque, hwaddr addr, uint64_t data,
> +                           unsigned size)
> +{
> +    IoPmemCtrl *ipm = (IoPmemCtrl *)opaque;
> +
> +    if (addr & 3) {
> +        return;
> +    }
> +
> +    blk_pwrite(ipm->blk, addr, &data, size, 0);
> +}

This has terrible performance because blk_pread()/blk_pwrite() are
synchronous operations.  The vcpu thread is paused while QEMU is waiting
for I/O.
Logan Gunthorpe Nov. 4, 2016, 3:47 p.m. UTC | #2
Hi Stefan,

On 04/11/16 04:49 AM, Stefan Hajnoczi wrote:
> QEMU already has NVDIMM support (https://pmem.io/).  It can be used both
> for passthrough and fake non-volatile memory:
> 
>   qemu-system-x86_64 \
>     -M pc,nvdimm=on \
>     -m 1024,maxmem=$((4096 * 1024 * 1024)),slots=2 \
>     -object memory-backend-file,id=mem0,mem-path=/tmp/foo,size=$((64 * 1024 * 1024)) \
>     -device nvdimm,memdev=mem0
> 
> Please explain where iopmem comes from, where the hardware spec is, etc?

Yes, we are aware of nvdimm and, yes, there are quite a few
commonalities. The difference between nvdimm and iopmem is that the
memory that backs iopmem is on a PCI device and not connected through
system memory. Currently, we are working with prototype hardware so
there is no open spec that I'm aware of, but the concept is really
simple: a single BAR directly maps volatile or non-volatile memory.
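
To make that concrete, here is roughly what a host-side driver for such a
device has to do (a hypothetical sketch using the stock Linux PCI API, not
the actual iopmem kernel driver; the BAR number matches the QEMU model):

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/io.h>

static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    void __iomem *mem;
    int rc;

    rc = pci_enable_device_mem(pdev);
    if (rc)
        return rc;

    /* BAR 4 is the whole storage medium, exactly as registered by the
     * QEMU model below; real hardware may choose a different BAR. */
    mem = pci_iomap(pdev, 4, 0);
    if (!mem) {
        pci_disable_device(pdev);
        return -ENOMEM;
    }

    /* 'mem' now covers the entire (non-)volatile window and can be handed
     * to the pmem/block layers or targeted by peer-to-peer DMA. */
    pci_set_drvdata(pdev, mem);
    return 0;
}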

One of the primary motivations behind iopmem is to provide memory for
peer-to-peer transactions between PCI devices so that, for example, an
RDMA NIC could transfer data directly to storage and bypass the system
memory bus altogether.


> Perhaps you could use nvdimm instead of adding a new device?

I'm afraid not. The main purpose of this patch is to enable us to test
kernel drivers for this type of hardware. If we use nvdimm, there is no
PCI device for our driver to enumerate, and the existing (and quite
different) NVDIMM drivers would be used instead.

Thanks for the consideration,

Logan
Stefan Hajnoczi Nov. 7, 2016, 10:28 a.m. UTC | #3
On Fri, Nov 04, 2016 at 09:47:33AM -0600, Logan Gunthorpe wrote:
> On 04/11/16 04:49 AM, Stefan Hajnoczi wrote:
> > QEMU already has NVDIMM support (https://pmem.io/).  It can be used both
> > for passthrough and fake non-volatile memory:
> > 
> >   qemu-system-x86_64 \
> >     -M pc,nvdimm=on \
> >     -m 1024,maxmem=$((4096 * 1024 * 1024)),slots=2 \
> >     -object memory-backend-file,id=mem0,mem-path=/tmp/foo,size=$((64 * 1024 * 1024)) \
> >     -device nvdimm,memdev=mem0
> > 
> > Please explain where iopmem comes from, where the hardware spec is, etc?
> 
> Yes, we are aware of nvdimm and, yes, there are quite a few
> commonalities. The difference between nvdimm and iopmem is that the
> memory that backs iopmem is on a PCI device and not connected through
> system memory. Currently, we are working with prototype hardware so
> there is no open spec that I'm aware of but the concept is really
> simple: a single bar directly maps volatile or non-volatile memory.
> 
> One of the primary motivations behind iopmem is to provide memory to do
> peer to peer transactions between PCI devices such that, for example, an
> RDMA NIC could transfer data directly to storage and bypass the system
> memory bus all together.

It may be too early to merge this code into qemu.git if there is no
hardware spec and this is a prototype device that is subject to change.

I'm wondering if there is a way to test or use this device if you are
not releasing specs and code that drives the device.

Have you submitted patches to enable this device in Linux, DPDK, or any
other project?

> > Perhaps you could use nvdimm instead of adding a new device?
> 
> I'm afraid not. The main purpose of this patch is to enable us to test
> kernel drivers for this type of hardware. If we use nvdimm, there is no
> PCI device for our driver to enumerate and the existing, different,
> NVDIMM drivers would be used instead.

Fair enough, it makes sense to implement a PCI device for this purpose.

Stefan
Logan Gunthorpe Nov. 7, 2016, 5:09 p.m. UTC | #4
Hey,

On 07/11/16 03:28 AM, Stefan Hajnoczi wrote:
> It may be too early to merge this code into qemu.git if there is no
> hardware spec and this is a prototype device that is subject to change.

Fair enough, though the interface is so simple I don't know what could
possibly change.

> I'm wondering if there is a way to test or use this device if you are
> not releasing specs and code that drives the device.
> 
> Have you submitted patches to enable this device in Linux, DPDK, or any
> other project?

Yes, you can find patches to the Linux Kernel that were submitted to a
couple mailing lists at the same time as the QEMU patch:

http://www.mail-archive.com/linux-nvdimm@lists.01.org/msg01426.html

There's been a discussion as to how best to expose these devices to user
space and we may take a different approach in v2. But there has been no
indication that the PCI interface would need to change at all.

Thanks,

Logan
Stefan Hajnoczi Nov. 8, 2016, 3:58 p.m. UTC | #5
On Mon, Nov 07, 2016 at 10:09:29AM -0700, Logan Gunthorpe wrote:
> On 07/11/16 03:28 AM, Stefan Hajnoczi wrote:
> > It may be too early to merge this code into qemu.git if there is no
> > hardware spec and this is a prototype device that is subject to change.
> 
> Fair enough, though the interface is so simple I don't know what could
> possibly change.

My concern with the current implementation is that a PCI MMIO access
invokes a synchronous blk_*() call.  That can pause vcpu execution while
I/O is happening and therefore leads to unresponsive guests.  QEMU's
monitor interface is also blocked during blk_*() making it impossible to
troubleshoot QEMU if it gets stuck due to a slow/hung I/O operation.

Device models need to use blk_aio_*() so that control is returned while
I/O is running.  There are a few legacy devices left that use
synchronous I/O but new devices should not use this approach.
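
As a rough illustration only (untested, written against the current
byte-based block-backend API; the request struct and callback names are
made up), the write path could look something like this:

typedef struct IoPmemAIORequest {
    QEMUIOVector qiov;
    uint64_t data;                  /* bounce copy of the MMIO payload */
} IoPmemAIORequest;

static void iopmem_write_cb(void *opaque, int ret)
{
    IoPmemAIORequest *req = opaque;

    /* A real device would latch 'ret' into a status/error register. */
    qemu_iovec_destroy(&req->qiov);
    g_free(req);
}

static void iopmem_bar_write(void *opaque, hwaddr addr, uint64_t data,
                             unsigned size)
{
    IoPmemCtrl *ipm = opaque;
    IoPmemAIORequest *req = g_new0(IoPmemAIORequest, 1);

    req->data = data;
    qemu_iovec_init(&req->qiov, 1);
    qemu_iovec_add(&req->qiov, &req->data, size);

    /* Control returns to the vcpu immediately; the callback runs when
     * the backend completes the request. */
    blk_aio_pwritev(ipm->blk, addr, &req->qiov, 0, iopmem_write_cb, req);
}

Reads are trickier because the MMIO handler has to return a value
synchronously, so they would still stall unless the device is
restructured.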

Regarding the hardware design, I think the PCI BAR approach to nvdimm is
inefficient for virtualization because each memory load/store requires a
guest<->host transition (vmexit + vmenter).  A DMA approach (i.e.
message passing or descriptor rings) is more efficient because it
requires fewer vmexits.
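
Purely as an illustration of the shape of such an interface (this is not
from any spec), a descriptor ring lets the guest describe many transfers
in ordinary RAM and pay for only one doorbell MMIO write per batch:

#include <stdint.h>

struct iopmem_desc {
    uint64_t guest_addr;    /* DMA address of the guest buffer */
    uint64_t dev_offset;    /* offset into the device memory window */
    uint32_t len;
    uint32_t flags;         /* e.g. direction, interrupt-on-completion */
};

struct iopmem_ring {
    struct iopmem_desc desc[256];
    uint16_t head;          /* producer index, advanced by the guest */
    uint16_t tail;          /* consumer index, advanced by the device */
};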

On real hardware the performance characteristics are different so it
depends what your target market is.

> > I'm wondering if there is a way to test or use this device if you are
> > not releasing specs and code that drives the device.
> > 
> > Have you submitted patches to enable this device in Linux, DPDK, or any
> > other project?
> 
> Yes, you can find patches to the Linux Kernel that were submitted to a
> couple mailing lists at the same time as the QEMU patch:
> 
> http://www.mail-archive.com/linux-nvdimm@lists.01.org/msg01426.html
> 
> There's been a discussion as to how best to expose these devices to user
> space and we may take a different approach in v2. But there has been no
> indication that the PCI interface would need to change at all.

Thanks, I'll check out the discussion!

Stefan
Logan Gunthorpe Nov. 8, 2016, 4:46 p.m. UTC | #6
Hey,

On 08/11/16 08:58 AM, Stefan Hajnoczi wrote:
> My concern with the current implementation is that a PCI MMIO access
> invokes a synchronous blk_*() call.  That can pause vcpu execution while
> I/O is happening and therefore leads to unresponsive guests.  QEMU's
> monitor interface is also blocked during blk_*() making it impossible to
> troubleshoot QEMU if it gets stuck due to a slow/hung I/O operation.
> 
> Device models need to use blk_aio_*() so that control is returned while
> I/O is running.  There are a few legacy devices left that use
> synchronous I/O but new devices should not use this approach.

That's fair. I wasn't aware of this and I must have copied a legacy
device. We can certainly make the change in our patch.

> Regarding the hardware design, I think the PCI BAR approach to nvdimm is
> inefficient for virtualization because each memory load/store requires a
> guest<->host transition (vmexit + vmenter).  A DMA approach (i.e.
> message passing or descriptor rings) is more efficient because it
> requires fewer vmexits.
> 
> On real hardware the performance characteristics are different so it
> depends what your target market is.

The performance of the virtual device is completely unimportant. This
isn't something I'd expect anyone to use except to test drivers. On real
hardware, with real applications, DMA would almost certainly be used --
but it would be the DMA engine in another device, e.g. an IB NIC would
DMA from the PCI BAR of the iopmem device. This completely bypasses the
CPU so there would be no load/stores to be concerned about.

Thanks,

Logan
Stefan Hajnoczi Nov. 9, 2016, 9:58 a.m. UTC | #7
On Tue, Nov 08, 2016 at 09:46:47AM -0700, Logan Gunthorpe wrote:
> On 08/11/16 08:58 AM, Stefan Hajnoczi wrote:
> > My concern with the current implementation is that a PCI MMIO access
> > invokes a synchronous blk_*() call.  That can pause vcpu execution while
> > I/O is happening and therefore leads to unresponsive guests.  QEMU's
> > monitor interface is also blocked during blk_*() making it impossible to
> > troubleshoot QEMU if it gets stuck due to a slow/hung I/O operation.
> > 
> > Device models need to use blk_aio_*() so that control is returned while
> > I/O is running.  There are a few legacy devices left that use
> > synchronous I/O but new devices should not use this approach.
> 
> That's fair. I wasn't aware of this and I must have copied a legacy
> device. We can certainly make the change in our patch.
> 
> > Regarding the hardware design, I think the PCI BAR approach to nvdimm is
> > inefficient for virtualization because each memory load/store requires a
> > guest<->host transition (vmexit + vmenter).  A DMA approach (i.e.
> > message passing or descriptor rings) is more efficient because it
> > requires fewer vmexits.
> > 
> > On real hardware the performance characteristics are different so it
> > depends what your target market is.
> 
> The performance of the virtual device is completely unimportant. This
> isn't something I'd expect anyone to use except to test drivers. On real
> hardware, with real applications, DMA would almost certainly be used --
> but it would be the DMA engine in another device. eg. an IB NIC would
> DMA from the PCI BAR of the iopmem device. This completely bypasses the
> CPU so there would be no load/stores to be concerned about.

Okay, sounds good.

Is there a reason to use the block layer in QEMU?  Perhaps it's better
to do the same as the nvdimm device in QEMU and use a memory backend
instead.  The memory backend can be MAP_PRIVATE or MAP_SHARED.  It can
be anonymous memory or it can be backed by a real file.  This gives you
options and avoids using the QEMU block layer in a way it wasn't
designed for.

Doing this would forgo QEMU block layer features like image file formats
(qcow2), block jobs (storage migration), etc.  But those probably aren't
necessary for this device.
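
For reference, such a variant could be instantiated much like the nvdimm
example earlier in the thread (the memdev property here is hypothetical
and not part of this patch):

  qemu-system-x86_64 \
    -object memory-backend-file,id=iopmem0,mem-path=/tmp/iopmem.img,size=16M,share=on \
    -device iopmem,memdev=iopmem0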

Stefan

Patch

diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index fff7ce3..aef2b35 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -39,3 +39,4 @@  CONFIG_VGA=y
 CONFIG_VGA_PCI=y
 CONFIG_IVSHMEM=$(CONFIG_EVENTFD)
 CONFIG_ROCKER=y
+CONFIG_IOPMEM_PCI=y
diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index d4c3ab7..1c8044b 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -8,6 +8,7 @@  common-obj-$(CONFIG_XEN_BACKEND) += xen_disk.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
 common-obj-$(CONFIG_NVME_PCI) += nvme.o
+common-obj-$(CONFIG_IOPMEM_PCI) += iopmem.o
 
 obj-$(CONFIG_SH4) += tc58128.o
 
diff --git a/hw/block/iopmem.c b/hw/block/iopmem.c
new file mode 100644
index 0000000..9c2d716
--- /dev/null
+++ b/hw/block/iopmem.c
@@ -0,0 +1,141 @@ 
+/*
+ * QEMU iopmem Controller
+ *
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * Written by Logan Gunthorpe <logang@deltatee.com>
+ *
+ * This code is licensed under the GNU GPL v2 or later.
+ */
+
+
+/**
+ * Usage: add options:
+ *      -drive file=<file>,if=none,id=<drive_id>
+ *      -device iopmem,drive=<drive_id>,id=<id[optional]>
+ */
+
+#include "qemu/osdep.h"
+#include "hw/pci/pci.h"
+#include "sysemu/block-backend.h"
+
+typedef struct IoPmemCtrl {
+    PCIDevice    parent_obj;
+    MemoryRegion iomem;
+    BlockBackend *blk;
+    uint64_t     size;
+} IoPmemCtrl;
+
+#define TYPE_IOPMEM "iopmem"
+#define IOPMEM(obj) \
+        OBJECT_CHECK(IoPmemCtrl, (obj), TYPE_IOPMEM)
+
+static uint64_t iopmem_bar_read(void *opaque, hwaddr addr, unsigned size)
+{
+    IoPmemCtrl *ipm = (IoPmemCtrl *)opaque;
+    uint64_t val;
+
+    blk_pread(ipm->blk, addr, &val, size);
+
+    return val;
+}
+
+static void iopmem_bar_write(void *opaque, hwaddr addr, uint64_t data,
+                           unsigned size)
+{
+    IoPmemCtrl *ipm = (IoPmemCtrl *)opaque;
+
+    if (addr & 3) {
+        return;
+    }
+
+    blk_pwrite(ipm->blk, addr, &data, size, 0);
+}
+
+static const MemoryRegionOps iopmem_bar_ops = {
+    .read = iopmem_bar_read,
+    .write = iopmem_bar_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    },
+};
+
+static int iopmem_init(PCIDevice *pci_dev)
+{
+    IoPmemCtrl *ipm = IOPMEM(pci_dev);
+
+    if (!ipm->blk) {
+        return -1;
+    }
+
+    ipm->size = blk_getlength(ipm->blk);
+
+    pci_config_set_prog_interface(pci_dev->config, 0x2);
+    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_OTHER);
+    pcie_endpoint_cap_init(&ipm->parent_obj, 0x80);
+
+    memory_region_init_io(&ipm->iomem, OBJECT(ipm), &iopmem_bar_ops, ipm,
+                          "iopmem", ipm->size);
+
+    pci_register_bar(&ipm->parent_obj, 4,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY |
+                     PCI_BASE_ADDRESS_MEM_PREFETCH |
+                     PCI_BASE_ADDRESS_MEM_TYPE_64,
+                     &ipm->iomem);
+
+    return 0;
+}
+
+static void iopmem_exit(PCIDevice *pci_dev)
+{
+    IoPmemCtrl *ipm = IOPMEM(pci_dev);
+
+    if (ipm->blk) {
+        blk_flush(ipm->blk);
+    }
+}
+
+static Property iopmem_props[] = {
+    DEFINE_PROP_DRIVE("drive", IoPmemCtrl, blk),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static const VMStateDescription iopmem_vmstate = {
+    .name = "iopmem",
+    .unmigratable = 1,
+};
+
+static void iopmem_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
+
+    pc->init = iopmem_init;
+    pc->exit = iopmem_exit;
+    pc->class_id = PCI_CLASS_STORAGE_OTHER;
+    pc->vendor_id = PCI_VENDOR_ID_PMC_SIERRA;
+    pc->device_id = 0xf115;
+    pc->revision = 2;
+    pc->is_express = 1;
+
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    dc->desc = "Non-Volatile IO Memory Storage";
+    dc->props = iopmem_props;
+    dc->vmsd = &iopmem_vmstate;
+}
+
+static const TypeInfo iopmem_info = {
+    .name          = "iopmem",
+    .parent        = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(IoPmemCtrl),
+    .class_init    = iopmem_class_init,
+};
+
+static void iopmem_register_types(void)
+{
+    type_register_static(&iopmem_info);
+}
+
+type_init(iopmem_register_types)
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index d77ca60..e7046e1 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -113,6 +113,8 @@ 
 
 #define PCI_VENDOR_ID_MARVELL            0x11ab
 
+#define PCI_VENDOR_ID_PMC_SIERRA         0x11f8
+
 #define PCI_VENDOR_ID_ENSONIQ            0x1274
 #define PCI_DEVICE_ID_ENSONIQ_ES1370     0x5000