diff mbox

Support shared memory device PCI device

Message ID 1247610317-22349-1-git-send-email-cam@cs.ualberta.ca (mailing list archive)
State New, archived
Headers show

Commit Message

Cam Macdonell July 14, 2009, 10:25 p.m. UTC
This patch is an updated version of a previous one (http://patchwork.kernel.org/patch/22363/) that supports adding a shared memory PCI device.  To be clear, there is no new functionality in this patch, just a fix for changes to the master branch.

The device's memory is mappable into user-level to provide zero-copy messaging between guest and between guest and host.  Please see previous patch for a more detailed description.

---
 Makefile.target |    3 +
 hw/ivshmem.c    |  421 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/pc.c         |    6 +
 hw/pc.h         |    3 +
 qemu-options.hx |   14 ++
 sysemu.h        |    8 +
 vl.c            |   14 ++
 7 files changed, 469 insertions(+), 0 deletions(-)
 create mode 100644 hw/ivshmem.c
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index 660a855..323a935 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -611,6 +611,9 @@  obj-y += pcnet.o
 obj-y += rtl8139.o
 obj-y += e1000.o
 
+# Inter-VM PCI shared memory
+obj-y += ivshmem.o
+
 # Generic watchdog support and some watchdog devices
 obj-y += wdt_ib700.o wdt_i6300esb.o
 
diff --git a/hw/ivshmem.c b/hw/ivshmem.c
new file mode 100644
index 0000000..8b66d0f
--- /dev/null
+++ b/hw/ivshmem.c
@@ -0,0 +1,421 @@ 
+/*
+ * Inter-VM Shared Memory PCI device.
+ *
+ * Author:
+ *      Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based On: cirrus_vga.c and rtl8139.c
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+
+#include "hw.h"
+#include "console.h"
+#include "pc.h"
+#include "pci.h"
+#include "sysemu.h"
+
+#include "qemu-common.h"
+#include <sys/mman.h>
+
+#define PCI_COMMAND_IOACCESS                0x0001
+#define PCI_COMMAND_MEMACCESS               0x0002
+#define PCI_COMMAND_BUSMASTER               0x0004
+
+//#define DEBUG_IVSHMEM
+
+#ifdef DEBUG_IVSHMEM
+#define IVSHMEM_DPRINTF(fmt, args...)        \
+    do {printf("IVSHMEM: " fmt, ##args); } while (0)
+#else
+#define IVSHMEM_DPRINTF(fmt, args...)
+#endif
+
+typedef struct IVShmemState {
+    uint16_t intrmask;
+    uint16_t intrstatus;
+    uint16_t doorbell;
+    uint8_t *ivshmem_ptr;
+    unsigned long ivshmem_offset;
+    unsigned int ivshmem_size;
+    unsigned long bios_offset;
+    unsigned int bios_size;
+    target_phys_addr_t base_ctrl;
+    int it_shift;
+    PCIDevice *pci_dev;
+    CharDriverState * chr;
+    unsigned long map_addr;
+    unsigned long map_end;
+    int ivshmem_mmio_io_addr;
+} IVShmemState;
+
+typedef struct PCI_IVShmemState {
+    PCIDevice dev;
+    IVShmemState ivshmem_state;
+} PCI_IVShmemState;
+
+typedef struct IVShmemDesc {
+    char name[1024];
+    char * chrdev;
+    int size;
+} IVShmemDesc;
+
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+    IntrMask = 0,
+    IntrStatus = 16,
+    Doorbell = 32
+};
+
+static int num_ivshmem_devices = 0;
+static IVShmemDesc ivshmem_desc;
+
+static void ivshmem_map(PCIDevice *pci_dev, int region_num,
+                    uint32_t addr, uint32_t size, int type)
+{
+    PCI_IVShmemState *d = (PCI_IVShmemState *)pci_dev;
+    IVShmemState *s = &d->ivshmem_state;
+
+    IVSHMEM_DPRINTF("addr = %u size = %u\n", addr, size);
+    cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
+
+}
+
+void ivshmem_init(const char * optarg) {
+
+    char * temp;
+    char * ivshmem_sz;
+    int size;
+
+    num_ivshmem_devices++;
+
+    /* currently we only support 1 device */
+    if (num_ivshmem_devices > MAX_IVSHMEM_DEVICES) {
+        return;
+    }
+
+    temp = strdup(optarg);
+    snprintf(ivshmem_desc.name, 1024, "/%s", strsep(&temp,","));
+    ivshmem_sz=strsep(&temp,",");
+    if (ivshmem_sz != NULL){
+        size = atol(ivshmem_sz);
+    } else {
+        size = -1;
+    }
+
+    ivshmem_desc.chrdev = strsep(&temp,"\0");
+
+    if ( size == -1) {
+        ivshmem_desc.size = TARGET_PAGE_SIZE;
+    } else {
+        ivshmem_desc.size = size*1024*1024;
+    }
+    IVSHMEM_DPRINTF("optarg is %s, name is %s, size is %d, chrdev is %s\n",
+                                        optarg, ivshmem_desc.name,
+                                        ivshmem_desc.size, ivshmem_desc.chrdev);
+}
+
+int ivshmem_get_size(void) {
+    return ivshmem_desc.size;
+}
+
+/* accessing registers - based on rtl8139 */
+static void ivshmem_update_irq(IVShmemState *s)
+{
+    int isr;
+    isr = (s->intrstatus & s->intrmask) & 0xffff;
+
+    /* don't print ISR resets */
+    if (isr) {
+        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
+           isr ? 1 : 0, s->intrstatus, s->intrmask);
+    }
+
+    qemu_set_irq(s->pci_dev->irq[0], (isr != 0));
+}
+
+static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
+                       uint32_t addr, uint32_t size, int type)
+{
+    PCI_IVShmemState *d = (PCI_IVShmemState *)pci_dev;
+    IVShmemState *s = &d->ivshmem_state;
+
+    cpu_register_physical_memory(addr + 0, 0x100, s->ivshmem_mmio_io_addr);
+}
+
+static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
+
+    s->intrmask = val;
+
+    ivshmem_update_irq(s);
+}
+
+static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrmask;
+
+    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
+
+    return ret;
+}
+
+static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
+
+    s->intrstatus = val;
+
+    ivshmem_update_irq(s);
+    return;
+}
+
+static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrstatus;
+
+    /* reading ISR clears all interrupts */
+    s->intrstatus = 0;
+
+    ivshmem_update_irq(s);
+
+    return ret;
+}
+
+static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVShmemState *s = opaque;
+
+    IVSHMEM_DPRINTF("writing 0x%x to 0x%lx\n", addr, (unsigned long) opaque);
+
+    addr &= 0xfe;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ivshmem_IntrMask_write(s, val);
+            break;
+
+        case IntrStatus:
+            ivshmem_IntrStatus_write(s, val);
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("why are we writing 0x%x\n", addr);
+    }
+}
+
+static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing longs\n");
+}
+
+static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVShmemState *s = opaque;
+    uint8_t writebyte = val & 0xff; //write the lower 8-bits of 'val'
+
+    switch (addr)
+    {   // in future, we will probably want to support more types of doorbells
+        case Doorbell:
+            // wake up the other side
+            qemu_chr_write(s->chr, &writebyte, 1);
+            IVSHMEM_DPRINTF("Writing to the other side 0x%x\n", writebyte);
+            break;
+        default:
+            IVSHMEM_DPRINTF("Unhandled write (0x%x)\n", addr);
+    }
+}
+
+static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
+{
+
+    IVShmemState *s = opaque;
+    uint32_t ret;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ret = ivshmem_IntrMask_read(s);
+            break;
+        case IntrStatus:
+            ret = ivshmem_IntrStatus_read(s);
+            break;
+        default:
+            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
+            ret = 0;
+    }
+
+    return ret;
+}
+
+static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading longs\n");
+    return 0;
+}
+
+static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
+
+    return 0;
+}
+
+static void ivshmem_mmio_writeb(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writeb(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writew(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writew(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writel(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writel(opaque, addr & 0xFF, val);
+}
+
+static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    return ivshmem_io_readb(opaque, addr & 0xFF);
+}
+
+static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
+    return val;
+}
+
+static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
+    return val;
+}
+
+static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
+    ivshmem_mmio_readb,
+    ivshmem_mmio_readw,
+    ivshmem_mmio_readl,
+};
+
+static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
+    ivshmem_mmio_writeb,
+    ivshmem_mmio_writew,
+    ivshmem_mmio_writel,
+};
+
+static int ivshmem_can_receive(void * opaque)
+{
+    return 1;
+}
+
+static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
+{
+    IVShmemState *s = opaque;
+
+    ivshmem_IntrStatus_write(s, *buf);
+
+    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
+}
+
+static void ivshmem_event(void *opaque, int event)
+{
+    IVShmemState *s = opaque;
+    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
+}
+
+int pci_ivshmem_init(PCIBus *bus)
+{
+    PCI_IVShmemState *d;
+    IVShmemState *s;
+    uint8_t *pci_conf;
+    int ivshmem_fd;
+
+    IVSHMEM_DPRINTF("shared file is %s\n", ivshmem_desc.name);
+    d = (PCI_IVShmemState *)pci_register_device(bus, "kvm_ivshmem",
+                                           sizeof(PCI_IVShmemState),
+                                           -1, NULL, NULL);
+    if (!d) {
+        return -1;
+    }
+
+    s = &d->ivshmem_state;
+
+    /* allocate shared memory RAM */
+    s->ivshmem_offset = qemu_ram_alloc(ivshmem_desc.size);
+    IVSHMEM_DPRINTF("size is = %d\n", ivshmem_desc.size);
+    IVSHMEM_DPRINTF("ivshmem ram offset = %ld\n", s->ivshmem_offset);
+
+    s->ivshmem_ptr = qemu_get_ram_ptr(s->ivshmem_offset);
+
+    s->pci_dev = &d->dev;
+    s->ivshmem_size = ivshmem_desc.size;
+
+    pci_conf = d->dev.config;
+    pci_conf[0x00] = 0xf4; // Qumranet vendor ID 0x5002
+    pci_conf[0x01] = 0x1a;
+    pci_conf[0x02] = 0x10;
+    pci_conf[0x03] = 0x11;
+    pci_conf[0x04] = PCI_COMMAND_IOACCESS | PCI_COMMAND_MEMACCESS;
+    pci_conf[0x0a] = 0x00; // RAM controller
+    pci_conf[0x0b] = 0x05;
+    pci_conf[0x0e] = 0x00; // header_type
+
+    pci_conf[PCI_INTERRUPT_PIN] = 1; // we are going to support interrupts
+
+    /* XXX: ivshmem_desc.size must be a power of two */
+
+    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
+                                    ivshmem_mmio_write, s);
+
+    /* region for registers*/
+    pci_register_bar(&d->dev, 0, 0x100,
+                           PCI_ADDRESS_SPACE_MEM, ivshmem_mmio_map);
+
+    /* region for shared memory */
+    pci_register_bar(&d->dev, 1, ivshmem_desc.size,
+                           PCI_ADDRESS_SPACE_MEM, ivshmem_map);
+
+    /* open shared memory file  */
+    if ((ivshmem_fd = shm_open(ivshmem_desc.name, O_CREAT|O_RDWR, S_IRWXU)) < 0)
+    {
+        fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
+        exit(-1);
+    }
+
+    ftruncate(ivshmem_fd, ivshmem_desc.size);
+
+    /* mmap onto PCI device's memory */
+    if (mmap(s->ivshmem_ptr, ivshmem_desc.size, PROT_READ|PROT_WRITE,
+                        MAP_SHARED|MAP_FIXED, ivshmem_fd, 0) == MAP_FAILED)
+    {
+        fprintf(stderr, "kvm_ivshmem: could not mmap shared file\n");
+        exit(-1);
+    }
+
+    IVSHMEM_DPRINTF("shared object mapped to 0x%p\n", s->ivshmem_ptr);
+
+    /* setup character device channel */
+
+    if (ivshmem_desc.chrdev != NULL) {
+        char label[32];
+        snprintf(label, 32, "ivshmem_chardev");
+        s->chr = qemu_chr_open(label, ivshmem_desc.chrdev, NULL);
+        if (s->chr == NULL) {
+            fprintf(stderr, "No server listening on %s\n", ivshmem_desc.chrdev);
+            exit(-1);
+        }
+        qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_receive,
+                          ivshmem_event, s);
+    }
+
+    return 0;
+}
+
diff --git a/hw/pc.c b/hw/pc.c
index cf84416..f20dc83 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -91,6 +91,8 @@  static void option_rom_setup_reset(target_phys_addr_t addr, unsigned size)
     qemu_register_reset(option_rom_reset, rrd);
 }
 
+extern int ivshmem_enabled;
+
 static void ioport80_write(void *opaque, uint32_t addr, uint32_t data)
 {
 }
@@ -1289,6 +1291,10 @@  static void pc_init1(ram_addr_t ram_size,
         }
     }
 
+    if (pci_enabled && ivshmem_enabled) {
+        pci_ivshmem_init(pci_bus);
+    }
+
     rtc_state = rtc_init(0x70, i8259[8], 2000);
 
     qemu_register_boot_set(pc_boot_set, rtc_state);
diff --git a/hw/pc.h b/hw/pc.h
index 10bf002..5d19a0f 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -185,4 +185,7 @@  void extboot_init(BlockDriverState *bs, int cmd);
 
 int cpu_is_bsp(CPUState *env);
 
+/* ivshmem.c */
+int pci_ivshmem_init(PCIBus *bus);
+
 #endif
diff --git a/qemu-options.hx b/qemu-options.hx
index 7e98053..2411372 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1309,6 +1309,20 @@  The default device is @code{vc} in graphical mode and @code{stdio} in
 non graphical mode.
 ETEXI
 
+DEF("ivshmem", HAS_ARG, QEMU_OPTION_ivshmem, \
+    "-ivshmem name,size[,unix:path][,server]  creates or opens a shared file 'name' of size \
+    'size' (in MB) and exposes it as a PCI device in the guest\n")
+STEXI
+@item -ivshmem @var{file},@var{size}
+Creates a POSIX shared file named @var{file} of size @var{size} and creates a
+PCI device of the same size that maps the shared file into the device for guests
+to access.  The created file on the host is located in /dev/shm/
+
+@item unix:@var{path}[,server]
+A unix domain socket is used to send and receive interrupts between VMs.  The unix domain socket
+@var{path} is used for connections.
+ETEXI
+
 DEF("pidfile", HAS_ARG, QEMU_OPTION_pidfile, \
     "-pidfile file   write PID to 'file'\n")
 STEXI
diff --git a/sysemu.h b/sysemu.h
index 5582633..24abda1 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -239,6 +239,14 @@  extern CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 
 extern CharDriverState *virtcon_hds[MAX_VIRTIO_CONSOLES];
 
+/* inter-VM shared memory devices */
+
+#define MAX_IVSHMEM_DEVICES 1
+
+extern CharDriverState * ivshmem_chardev;
+void ivshmem_init(const char * optarg);
+int ivshmem_get_size(void);
+
 #define TFR(expr) do { if ((expr) != -1) break; } while (errno == EINTR)
 
 #ifdef NEED_CPU_H
diff --git a/vl.c b/vl.c
index 5d86e69..553cf5c 100644
--- a/vl.c
+++ b/vl.c
@@ -217,6 +217,7 @@  static int rtc_date_offset = -1; /* -1 means no change */
 int cirrus_vga_enabled = 1;
 int std_vga_enabled = 0;
 int vmsvga_enabled = 0;
+int ivshmem_enabled = 0;
 int xenfb_enabled = 0;
 #ifdef TARGET_SPARC
 int graphic_width = 1024;
@@ -235,6 +236,8 @@  int no_quit = 0;
 CharDriverState *serial_hds[MAX_SERIAL_PORTS];
 CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 CharDriverState *virtcon_hds[MAX_VIRTIO_CONSOLES];
+CharDriverState *ivshmem_chardev;
+const char * ivshmem_device;
 #ifdef TARGET_I386
 int win2k_install_hack = 0;
 int rtc_td_hack = 0;
@@ -5139,6 +5142,8 @@  int main(int argc, char **argv, char **envp)
     cyls = heads = secs = 0;
     translation = BIOS_ATA_TRANSLATION_AUTO;
     monitor_device = "vc:80Cx24C";
+    ivshmem_device = NULL;
+    ivshmem_chardev = NULL;
 
     serial_devices[0] = "vc:80Cx24C";
     for(i = 1; i < MAX_SERIAL_PORTS; i++)
@@ -5592,6 +5597,10 @@  int main(int argc, char **argv, char **envp)
                 parallel_devices[parallel_device_index] = optarg;
                 parallel_device_index++;
                 break;
+            case QEMU_OPTION_ivshmem:
+                ivshmem_device = optarg;
+                ivshmem_enabled = 1;
+                break;
 	    case QEMU_OPTION_loadvm:
 		loadvm = optarg;
 		break;
@@ -6049,6 +6058,11 @@  int main(int argc, char **argv, char **envp)
 	    }
     }
 
+    if (ivshmem_enabled) {
+        ivshmem_init(ivshmem_device);
+        ram_size += ivshmem_get_size();
+    }
+
 #ifdef CONFIG_KQEMU
     /* FIXME: This is a nasty hack because kqemu can't cope with dynamic
        guest ram allocation.  It needs to go away.  */