diff mbox

[RFC,v3] qemu: Add virtio pmem device

Message ID 20180713075232.9575-4-pagupta@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Pankaj Gupta July 13, 2018, 7:52 a.m. UTC
This patch adds virtio-pmem Qemu device.

 This device presents memory address range information to guest
 which is backed by file backend type. It acts like persistent
 memory device for KVM guest. Guest can perform read and persistent
 write operations on this memory range with the help of DAX capable
 filesystem.

 Persistent guest writes are assured with the help of virtio based
 flushing interface. When guest userspace space performs fsync on
 file fd on pmem device, a flush command is send to Qemu over VIRTIO
 and host side flush/sync is done on backing image file.

Changes from RFC v2:
- Use aio_worker() to avoid Qemu from hanging with blocking fsync
  call - Stefan
- Use virtio_st*_p() for endianess - Stefan
- Correct indentation in qapi/misc.json - Eric

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
---
 hw/virtio/Makefile.objs                     |   3 +
 hw/virtio/virtio-pci.c                      |  44 +++++
 hw/virtio/virtio-pci.h                      |  14 ++
 hw/virtio/virtio-pmem.c                     | 241 ++++++++++++++++++++++++++++
 include/hw/pci/pci.h                        |   1 +
 include/hw/virtio/virtio-pmem.h             |  42 +++++
 include/standard-headers/linux/virtio_ids.h |   1 +
 qapi/misc.json                              |  26 ++-
 8 files changed, 371 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/virtio-pmem.c
 create mode 100644 include/hw/virtio/virtio-pmem.h

Comments

Luiz Capitulino July 18, 2018, 12:55 p.m. UTC | #1
On Fri, 13 Jul 2018 13:22:32 +0530
Pankaj Gupta <pagupta@redhat.com> wrote:

>  This patch adds virtio-pmem Qemu device.
> 
>  This device presents memory address range information to guest
>  which is backed by file backend type. It acts like persistent
>  memory device for KVM guest. Guest can perform read and persistent
>  write operations on this memory range with the help of DAX capable
>  filesystem.
> 
>  Persistent guest writes are assured with the help of virtio based
>  flushing interface. When guest userspace space performs fsync on
>  file fd on pmem device, a flush command is send to Qemu over VIRTIO
>  and host side flush/sync is done on backing image file.
> 
> Changes from RFC v2:
> - Use aio_worker() to avoid Qemu from hanging with blocking fsync
>   call - Stefan
> - Use virtio_st*_p() for endianess - Stefan
> - Correct indentation in qapi/misc.json - Eric
> 
> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> ---
>  hw/virtio/Makefile.objs                     |   3 +
>  hw/virtio/virtio-pci.c                      |  44 +++++
>  hw/virtio/virtio-pci.h                      |  14 ++
>  hw/virtio/virtio-pmem.c                     | 241 ++++++++++++++++++++++++++++
>  include/hw/pci/pci.h                        |   1 +
>  include/hw/virtio/virtio-pmem.h             |  42 +++++
>  include/standard-headers/linux/virtio_ids.h |   1 +
>  qapi/misc.json                              |  26 ++-
>  8 files changed, 371 insertions(+), 1 deletion(-)
>  create mode 100644 hw/virtio/virtio-pmem.c
>  create mode 100644 include/hw/virtio/virtio-pmem.h
> 
> diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> index 1b2799cfd8..7f914d45d0 100644
> --- a/hw/virtio/Makefile.objs
> +++ b/hw/virtio/Makefile.objs
> @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
>  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-pci.o
>  
>  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> +obj-$(CONFIG_LINUX) += virtio-pmem.o
> +endif
>  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
>  endif
>  
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 3a01fe90f0..93d3fc05c7 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
>      .class_init    = virtio_rng_pci_class_init,
>  };
>  
> +/* virtio-pmem-pci */
> +
> +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> +{
> +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> +
> +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> +}
> +
> +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> +    k->realize = virtio_pmem_pci_realize;
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> +}
> +
> +static void virtio_pmem_pci_instance_init(Object *obj)
> +{
> +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> +
> +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> +                                TYPE_VIRTIO_PMEM);
> +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> +                              &error_abort);
> +}
> +
> +static const TypeInfo virtio_pmem_pci_info = {
> +    .name          = TYPE_VIRTIO_PMEM_PCI,
> +    .parent        = TYPE_VIRTIO_PCI,
> +    .instance_size = sizeof(VirtIOPMEMPCI),
> +    .instance_init = virtio_pmem_pci_instance_init,
> +    .class_init    = virtio_pmem_pci_class_init,
> +};
> +
> +
>  /* virtio-input-pci */
>  
>  static Property virtio_input_pci_properties[] = {
> @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
>      type_register_static(&virtio_balloon_pci_info);
>      type_register_static(&virtio_serial_pci_info);
>      type_register_static(&virtio_net_pci_info);
> +    type_register_static(&virtio_pmem_pci_info);
>  #ifdef CONFIG_VHOST_SCSI
>      type_register_static(&vhost_scsi_pci_info);
>  #endif
> diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> index 813082b0d7..fe74fcad3f 100644
> --- a/hw/virtio/virtio-pci.h
> +++ b/hw/virtio/virtio-pci.h
> @@ -19,6 +19,7 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "hw/virtio/virtio-net.h"
>  #include "hw/virtio/virtio-rng.h"
> +#include "hw/virtio/virtio-pmem.h"
>  #include "hw/virtio/virtio-serial.h"
>  #include "hw/virtio/virtio-scsi.h"
>  #include "hw/virtio/virtio-balloon.h"
> @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
>  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
>  typedef struct VHostVSockPCI VHostVSockPCI;
>  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
>  
>  /* virtio-pci-bus */
>  
> @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
>      VirtIOBlock vdev;
>  };
>  
> +/*
> + * virtio-pmem-pci: This extends VirtioPCIProxy.
> + */
> +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> +#define VIRTIO_PMEM_PCI(obj) \
> +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> +
> +struct VirtIOPMEMPCI {
> +    VirtIOPCIProxy parent_obj;
> +    VirtIOPMEM vdev;
> +};
> +
>  /*
>   * virtio-balloon-pci: This extends VirtioPCIProxy.
>   */
> diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> new file mode 100644
> index 0000000000..08c96d7e80
> --- /dev/null
> +++ b/hw/virtio/virtio-pmem.c
> @@ -0,0 +1,241 @@
> +/*
> + * Virtio pmem device
> + *
> + * Copyright (C) 2018 Red Hat, Inc.
> + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/virtio-pmem.h"
> +#include "hw/mem/memory-device.h"
> +#include "block/aio.h"
> +#include "block/thread-pool.h"
> +
> +typedef struct VirtIOPMEMresp {
> +    int ret;
> +} VirtIOPMEMResp;
> +
> +typedef struct VirtIODeviceRequest {
> +    VirtQueueElement elem;
> +    int fd;
> +    VirtIOPMEM *pmem;
> +    VirtIOPMEMResp resp;
> +} VirtIODeviceRequest;
> +
> +static int worker_cb(void *opaque)
> +{
> +    VirtIODeviceRequest *req = opaque;
> +    int err = 0;
> +
> +    /* flush raw backing image */
> +    err = fsync(req->fd);
> +    if (err != 0) {
> +        err = errno;
> +    }
> +    req->resp.ret = err;

Host question: are you returning the guest errno code to the host?

If yes, I don't think this is right. I think you probably want to
define a constant for error and let the host decide on the errno
code to return to the application.

> +
> +    return 0;
> +}
> +
> +static void done_cb(void *opaque, int ret)
> +{
> +    VirtIODeviceRequest *req = opaque;
> +    int len = iov_from_buf(req->elem.in_sg, req->elem.in_num, 0,
> +                              &req->resp, sizeof(VirtIOPMEMResp));
> +
> +    /* Callbacks are serialized, so no need to use atomic ops.  */
> +    virtqueue_push(req->pmem->rq_vq, &req->elem, len);
> +    virtio_notify((VirtIODevice *)req->pmem, req->pmem->rq_vq);
> +    g_free(req);
> +}
> +
> +static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    VirtIODeviceRequest *req;
> +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> +    HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
> +    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
> +
> +    req = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
> +    if (!req) {
> +        virtio_error(vdev, "virtio-pmem missing request data");
> +        return;
> +    }
> +
> +    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
> +        virtio_error(vdev, "virtio-pmem request not proper");
> +        g_free(req);
> +        return;
> +    }
> +    req->fd = memory_region_get_fd(&backend->mr);
> +    req->pmem = pmem;
> +    thread_pool_submit_aio(pool, worker_cb, req, done_cb, req);
> +}
> +
> +static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
> +{
> +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> +    struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *) config;
> +
> +    virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
> +    virtio_stq_p(vdev, &pmemcfg->size, pmem->size);
> +}
> +
> +static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t features,
> +                                        Error **errp)
> +{
> +    return features;
> +}
> +
> +static void virtio_pmem_realize(DeviceState *dev, Error **errp)
> +{
> +    VirtIODevice   *vdev   = VIRTIO_DEVICE(dev);
> +    VirtIOPMEM     *pmem   = VIRTIO_PMEM(dev);
> +    MachineState   *ms     = MACHINE(qdev_get_machine());
> +    uint64_t align;
> +    Error *local_err = NULL;
> +    MemoryRegion *mr;
> +
> +    if (!pmem->memdev) {
> +        error_setg(errp, "virtio-pmem memdev not set");
> +        return;
> +    }
> +
> +    mr  = host_memory_backend_get_memory(pmem->memdev);
> +    align = memory_region_get_alignment(mr);
> +    pmem->size = QEMU_ALIGN_DOWN(memory_region_size(mr), align);
> +    pmem->start = memory_device_get_free_addr(ms, NULL, align, pmem->size,
> +                                                               &local_err);
> +    if (local_err) {
> +        error_setg(errp, "Can't get free address in mem device");
> +        return;
> +    }
> +    memory_region_init_alias(&pmem->mr, OBJECT(pmem),
> +                             "virtio_pmem-memory", mr, 0, pmem->size);
> +    memory_device_plug_region(ms, &pmem->mr, pmem->start);
> +
> +    host_memory_backend_set_mapped(pmem->memdev, true);
> +    virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
> +                                          sizeof(struct virtio_pmem_config));
> +    pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
> +}
> +
> +static void virtio_mem_check_memdev(Object *obj, const char *name, Object *val,
> +                                    Error **errp)
> +{
> +    if (host_memory_backend_is_mapped(MEMORY_BACKEND(val))) {
> +        char *path = object_get_canonical_path_component(val);
> +        error_setg(errp, "Can't use already busy memdev: %s", path);
> +        g_free(path);
> +        return;
> +    }
> +
> +    qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
> +}
> +
> +static const char *virtio_pmem_get_device_id(VirtIOPMEM *vm)
> +{
> +    Object *obj = OBJECT(vm);
> +    DeviceState *parent_dev;
> +
> +    /* always use the ID of the proxy device */
> +    if (obj->parent && object_dynamic_cast(obj->parent, TYPE_DEVICE)) {
> +        parent_dev = DEVICE(obj->parent);
> +        return parent_dev->id;
> +    }
> +    return NULL;
> +}
> +
> +static void virtio_pmem_md_fill_device_info(const MemoryDeviceState *md,
> +                                           MemoryDeviceInfo *info)
> +{
> +    VirtioPMemDeviceInfo *vi = g_new0(VirtioPMemDeviceInfo, 1);
> +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> +    const char *id = virtio_pmem_get_device_id(vm);
> +
> +    if (id) {
> +        vi->has_id = true;
> +        vi->id = g_strdup(id);
> +    }
> +
> +    vi->start = vm->start;
> +    vi->size = vm->size;
> +    vi->memdev = object_get_canonical_path(OBJECT(vm->memdev));
> +
> +    info->u.virtio_pmem.data = vi;
> +    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
> +}
> +
> +static uint64_t virtio_pmem_md_get_addr(const MemoryDeviceState *md)
> +{
> +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> +
> +    return vm->start;
> +}
> +
> +static uint64_t virtio_pmem_md_get_plugged_size(const MemoryDeviceState *md)
> +{
> +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> +
> +    return vm->size;
> +}
> +
> +static uint64_t virtio_pmem_md_get_region_size(const MemoryDeviceState *md)
> +{
> +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> +
> +    return vm->size;
> +}
> +
> +static void virtio_pmem_instance_init(Object *obj)
> +{
> +    VirtIOPMEM *vm = VIRTIO_PMEM(obj);
> +    object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND,
> +                                (Object **)&vm->memdev,
> +                                (void *) virtio_mem_check_memdev,
> +                                OBJ_PROP_LINK_STRONG,
> +                                &error_abort);
> +}
> +
> +
> +static void virtio_pmem_class_init(ObjectClass *klass, void *data)
> +{
> +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
> +    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
> +
> +    vdc->realize      =  virtio_pmem_realize;
> +    vdc->get_config   =  virtio_pmem_get_config;
> +    vdc->get_features =  virtio_pmem_get_features;
> +
> +    mdc->get_addr         = virtio_pmem_md_get_addr;
> +    mdc->get_plugged_size = virtio_pmem_md_get_plugged_size;
> +    mdc->get_region_size  = virtio_pmem_md_get_region_size;
> +    mdc->fill_device_info = virtio_pmem_md_fill_device_info;
> +}
> +
> +static TypeInfo virtio_pmem_info = {
> +    .name          = TYPE_VIRTIO_PMEM,
> +    .parent        = TYPE_VIRTIO_DEVICE,
> +    .class_init    = virtio_pmem_class_init,
> +    .instance_size = sizeof(VirtIOPMEM),
> +    .instance_init = virtio_pmem_instance_init,
> +    .interfaces = (InterfaceInfo[]) {
> +        { TYPE_MEMORY_DEVICE },
> +        { }
> +  },
> +};
> +
> +static void virtio_register_types(void)
> +{
> +    type_register_static(&virtio_pmem_info);
> +}
> +
> +type_init(virtio_register_types)
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 990d6fcbde..28829b6437 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -85,6 +85,7 @@ extern bool pci_available;
>  #define PCI_DEVICE_ID_VIRTIO_RNG         0x1005
>  #define PCI_DEVICE_ID_VIRTIO_9P          0x1009
>  #define PCI_DEVICE_ID_VIRTIO_VSOCK       0x1012
> +#define PCI_DEVICE_ID_VIRTIO_PMEM        0x1013
>  
>  #define PCI_VENDOR_ID_REDHAT             0x1b36
>  #define PCI_DEVICE_ID_REDHAT_BRIDGE      0x0001
> diff --git a/include/hw/virtio/virtio-pmem.h b/include/hw/virtio/virtio-pmem.h
> new file mode 100644
> index 0000000000..fda3ee691c
> --- /dev/null
> +++ b/include/hw/virtio/virtio-pmem.h
> @@ -0,0 +1,42 @@
> +/*
> + * Virtio pmem Device
> + *
> + * Copyright Red Hat, Inc. 2018
> + * Copyright Pankaj Gupta <pagupta@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * (at your option) any later version.  See the COPYING file in the
> + * top-level directory.
> + */
> +
> +#ifndef QEMU_VIRTIO_PMEM_H
> +#define QEMU_VIRTIO_PMEM_H
> +
> +#include "hw/virtio/virtio.h"
> +#include "exec/memory.h"
> +#include "sysemu/hostmem.h"
> +#include "standard-headers/linux/virtio_ids.h"
> +#include "hw/boards.h"
> +#include "hw/i386/pc.h"
> +
> +#define TYPE_VIRTIO_PMEM "virtio-pmem"
> +
> +#define VIRTIO_PMEM(obj) \
> +        OBJECT_CHECK(VirtIOPMEM, (obj), TYPE_VIRTIO_PMEM)
> +
> +/* VirtIOPMEM device structure */
> +typedef struct VirtIOPMEM {
> +    VirtIODevice parent_obj;
> +
> +    VirtQueue *rq_vq;
> +    uint64_t start;
> +    uint64_t size;
> +    MemoryRegion mr;
> +    HostMemoryBackend *memdev;
> +} VirtIOPMEM;
> +
> +struct virtio_pmem_config {
> +    uint64_t start;
> +    uint64_t size;
> +};
> +#endif
> diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h
> index 6d5c3b2d4f..346389565a 100644
> --- a/include/standard-headers/linux/virtio_ids.h
> +++ b/include/standard-headers/linux/virtio_ids.h
> @@ -43,5 +43,6 @@
>  #define VIRTIO_ID_INPUT        18 /* virtio input */
>  #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
>  #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
> +#define VIRTIO_ID_PMEM         25 /* virtio pmem */
>  
>  #endif /* _LINUX_VIRTIO_IDS_H */
> diff --git a/qapi/misc.json b/qapi/misc.json
> index 29da7856e3..fb85dd6f6c 100644
> --- a/qapi/misc.json
> +++ b/qapi/misc.json
> @@ -2907,6 +2907,29 @@
>            }
>  }
>  
> +##
> +# @VirtioPMemDeviceInfo:
> +#
> +# VirtioPMem state information
> +#
> +# @id: device's ID
> +#
> +# @start: physical address, where device is mapped
> +#
> +# @size: size of memory that the device provides
> +#
> +# @memdev: memory backend linked with device
> +#
> +# Since: 2.13
> +##
> +{ 'struct': 'VirtioPMemDeviceInfo',
> +  'data': { '*id': 'str',
> +            'start': 'size',
> +            'size': 'size',
> +            'memdev': 'str'
> +          }
> +}
> +
>  ##
>  # @MemoryDeviceInfo:
>  #
> @@ -2916,7 +2939,8 @@
>  ##
>  { 'union': 'MemoryDeviceInfo',
>    'data': { 'dimm': 'PCDIMMDeviceInfo',
> -            'nvdimm': 'PCDIMMDeviceInfo'
> +            'nvdimm': 'PCDIMMDeviceInfo',
> +	    'virtio-pmem': 'VirtioPMemDeviceInfo'
>            }
>  }
>
Pankaj Gupta July 19, 2018, 5:48 a.m. UTC | #2
> 
> >  This patch adds virtio-pmem Qemu device.
> > 
> >  This device presents memory address range information to guest
> >  which is backed by file backend type. It acts like persistent
> >  memory device for KVM guest. Guest can perform read and persistent
> >  write operations on this memory range with the help of DAX capable
> >  filesystem.
> > 
> >  Persistent guest writes are assured with the help of virtio based
> >  flushing interface. When guest userspace space performs fsync on
> >  file fd on pmem device, a flush command is send to Qemu over VIRTIO
> >  and host side flush/sync is done on backing image file.
> > 
> > Changes from RFC v2:
> > - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> >   call - Stefan
> > - Use virtio_st*_p() for endianess - Stefan
> > - Correct indentation in qapi/misc.json - Eric
> > 
> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > ---
> >  hw/virtio/Makefile.objs                     |   3 +
> >  hw/virtio/virtio-pci.c                      |  44 +++++
> >  hw/virtio/virtio-pci.h                      |  14 ++
> >  hw/virtio/virtio-pmem.c                     | 241
> >  ++++++++++++++++++++++++++++
> >  include/hw/pci/pci.h                        |   1 +
> >  include/hw/virtio/virtio-pmem.h             |  42 +++++
> >  include/standard-headers/linux/virtio_ids.h |   1 +
> >  qapi/misc.json                              |  26 ++-
> >  8 files changed, 371 insertions(+), 1 deletion(-)
> >  create mode 100644 hw/virtio/virtio-pmem.c
> >  create mode 100644 include/hw/virtio/virtio-pmem.h
> > 
> > diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> > index 1b2799cfd8..7f914d45d0 100644
> > --- a/hw/virtio/Makefile.objs
> > +++ b/hw/virtio/Makefile.objs
> > @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
> >  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
> >  virtio-crypto-pci.o
> >  
> >  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> > +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> > +obj-$(CONFIG_LINUX) += virtio-pmem.o
> > +endif
> >  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
> >  endif
> >  
> > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > index 3a01fe90f0..93d3fc05c7 100644
> > --- a/hw/virtio/virtio-pci.c
> > +++ b/hw/virtio/virtio-pci.c
> > @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
> >      .class_init    = virtio_rng_pci_class_init,
> >  };
> >  
> > +/* virtio-pmem-pci */
> > +
> > +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
> > **errp)
> > +{
> > +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> > +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> > +
> > +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> > +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> > +}
> > +
> > +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> > +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> > +    k->realize = virtio_pmem_pci_realize;
> > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> > +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> > +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> > +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> > +}
> > +
> > +static void virtio_pmem_pci_instance_init(Object *obj)
> > +{
> > +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> > +
> > +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> > +                                TYPE_VIRTIO_PMEM);
> > +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> > +                              &error_abort);
> > +}
> > +
> > +static const TypeInfo virtio_pmem_pci_info = {
> > +    .name          = TYPE_VIRTIO_PMEM_PCI,
> > +    .parent        = TYPE_VIRTIO_PCI,
> > +    .instance_size = sizeof(VirtIOPMEMPCI),
> > +    .instance_init = virtio_pmem_pci_instance_init,
> > +    .class_init    = virtio_pmem_pci_class_init,
> > +};
> > +
> > +
> >  /* virtio-input-pci */
> >  
> >  static Property virtio_input_pci_properties[] = {
> > @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
> >      type_register_static(&virtio_balloon_pci_info);
> >      type_register_static(&virtio_serial_pci_info);
> >      type_register_static(&virtio_net_pci_info);
> > +    type_register_static(&virtio_pmem_pci_info);
> >  #ifdef CONFIG_VHOST_SCSI
> >      type_register_static(&vhost_scsi_pci_info);
> >  #endif
> > diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> > index 813082b0d7..fe74fcad3f 100644
> > --- a/hw/virtio/virtio-pci.h
> > +++ b/hw/virtio/virtio-pci.h
> > @@ -19,6 +19,7 @@
> >  #include "hw/virtio/virtio-blk.h"
> >  #include "hw/virtio/virtio-net.h"
> >  #include "hw/virtio/virtio-rng.h"
> > +#include "hw/virtio/virtio-pmem.h"
> >  #include "hw/virtio/virtio-serial.h"
> >  #include "hw/virtio/virtio-scsi.h"
> >  #include "hw/virtio/virtio-balloon.h"
> > @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
> >  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
> >  typedef struct VHostVSockPCI VHostVSockPCI;
> >  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> > +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
> >  
> >  /* virtio-pci-bus */
> >  
> > @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
> >      VirtIOBlock vdev;
> >  };
> >  
> > +/*
> > + * virtio-pmem-pci: This extends VirtioPCIProxy.
> > + */
> > +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> > +#define VIRTIO_PMEM_PCI(obj) \
> > +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> > +
> > +struct VirtIOPMEMPCI {
> > +    VirtIOPCIProxy parent_obj;
> > +    VirtIOPMEM vdev;
> > +};
> > +
> >  /*
> >   * virtio-balloon-pci: This extends VirtioPCIProxy.
> >   */
> > diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> > new file mode 100644
> > index 0000000000..08c96d7e80
> > --- /dev/null
> > +++ b/hw/virtio/virtio-pmem.c
> > @@ -0,0 +1,241 @@
> > +/*
> > + * Virtio pmem device
> > + *
> > + * Copyright (C) 2018 Red Hat, Inc.
> > + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qapi/error.h"
> > +#include "qemu-common.h"
> > +#include "qemu/error-report.h"
> > +#include "hw/virtio/virtio-access.h"
> > +#include "hw/virtio/virtio-pmem.h"
> > +#include "hw/mem/memory-device.h"
> > +#include "block/aio.h"
> > +#include "block/thread-pool.h"
> > +
> > +typedef struct VirtIOPMEMresp {
> > +    int ret;
> > +} VirtIOPMEMResp;
> > +
> > +typedef struct VirtIODeviceRequest {
> > +    VirtQueueElement elem;
> > +    int fd;
> > +    VirtIOPMEM *pmem;
> > +    VirtIOPMEMResp resp;
> > +} VirtIODeviceRequest;
> > +
> > +static int worker_cb(void *opaque)
> > +{
> > +    VirtIODeviceRequest *req = opaque;
> > +    int err = 0;
> > +
> > +    /* flush raw backing image */
> > +    err = fsync(req->fd);
> > +    if (err != 0) {
> > +        err = errno;
> > +    }
> > +    req->resp.ret = err;
> 
> Host question: are you returning the guest errno code to the host?

No. I am returning error code from the host in-case of host fsync
failure, otherwise returning zero.

Thanks,
Pankaj
> 
> If yes, I don't think this is right. I think you probably want to
> define a constant for error and let the host decide on the errno
> code to return to the application.
> 
> > +
> > +    return 0;
> > +}
> > +
> > +static void done_cb(void *opaque, int ret)
> > +{
> > +    VirtIODeviceRequest *req = opaque;
> > +    int len = iov_from_buf(req->elem.in_sg, req->elem.in_num, 0,
> > +                              &req->resp, sizeof(VirtIOPMEMResp));
> > +
> > +    /* Callbacks are serialized, so no need to use atomic ops.  */
> > +    virtqueue_push(req->pmem->rq_vq, &req->elem, len);
> > +    virtio_notify((VirtIODevice *)req->pmem, req->pmem->rq_vq);
> > +    g_free(req);
> > +}
> > +
> > +static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
> > +{
> > +    VirtIODeviceRequest *req;
> > +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> > +    HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
> > +    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
> > +
> > +    req = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
> > +    if (!req) {
> > +        virtio_error(vdev, "virtio-pmem missing request data");
> > +        return;
> > +    }
> > +
> > +    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
> > +        virtio_error(vdev, "virtio-pmem request not proper");
> > +        g_free(req);
> > +        return;
> > +    }
> > +    req->fd = memory_region_get_fd(&backend->mr);
> > +    req->pmem = pmem;
> > +    thread_pool_submit_aio(pool, worker_cb, req, done_cb, req);
> > +}
> > +
> > +static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
> > +{
> > +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> > +    struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *)
> > config;
> > +
> > +    virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
> > +    virtio_stq_p(vdev, &pmemcfg->size, pmem->size);
> > +}
> > +
> > +static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t
> > features,
> > +                                        Error **errp)
> > +{
> > +    return features;
> > +}
> > +
> > +static void virtio_pmem_realize(DeviceState *dev, Error **errp)
> > +{
> > +    VirtIODevice   *vdev   = VIRTIO_DEVICE(dev);
> > +    VirtIOPMEM     *pmem   = VIRTIO_PMEM(dev);
> > +    MachineState   *ms     = MACHINE(qdev_get_machine());
> > +    uint64_t align;
> > +    Error *local_err = NULL;
> > +    MemoryRegion *mr;
> > +
> > +    if (!pmem->memdev) {
> > +        error_setg(errp, "virtio-pmem memdev not set");
> > +        return;
> > +    }
> > +
> > +    mr  = host_memory_backend_get_memory(pmem->memdev);
> > +    align = memory_region_get_alignment(mr);
> > +    pmem->size = QEMU_ALIGN_DOWN(memory_region_size(mr), align);
> > +    pmem->start = memory_device_get_free_addr(ms, NULL, align, pmem->size,
> > +
> > &local_err);
> > +    if (local_err) {
> > +        error_setg(errp, "Can't get free address in mem device");
> > +        return;
> > +    }
> > +    memory_region_init_alias(&pmem->mr, OBJECT(pmem),
> > +                             "virtio_pmem-memory", mr, 0, pmem->size);
> > +    memory_device_plug_region(ms, &pmem->mr, pmem->start);
> > +
> > +    host_memory_backend_set_mapped(pmem->memdev, true);
> > +    virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
> > +                                          sizeof(struct
> > virtio_pmem_config));
> > +    pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
> > +}
> > +
> > +static void virtio_mem_check_memdev(Object *obj, const char *name, Object
> > *val,
> > +                                    Error **errp)
> > +{
> > +    if (host_memory_backend_is_mapped(MEMORY_BACKEND(val))) {
> > +        char *path = object_get_canonical_path_component(val);
> > +        error_setg(errp, "Can't use already busy memdev: %s", path);
> > +        g_free(path);
> > +        return;
> > +    }
> > +
> > +    qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
> > +}
> > +
> > +static const char *virtio_pmem_get_device_id(VirtIOPMEM *vm)
> > +{
> > +    Object *obj = OBJECT(vm);
> > +    DeviceState *parent_dev;
> > +
> > +    /* always use the ID of the proxy device */
> > +    if (obj->parent && object_dynamic_cast(obj->parent, TYPE_DEVICE)) {
> > +        parent_dev = DEVICE(obj->parent);
> > +        return parent_dev->id;
> > +    }
> > +    return NULL;
> > +}
> > +
> > +static void virtio_pmem_md_fill_device_info(const MemoryDeviceState *md,
> > +                                           MemoryDeviceInfo *info)
> > +{
> > +    VirtioPMemDeviceInfo *vi = g_new0(VirtioPMemDeviceInfo, 1);
> > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > +    const char *id = virtio_pmem_get_device_id(vm);
> > +
> > +    if (id) {
> > +        vi->has_id = true;
> > +        vi->id = g_strdup(id);
> > +    }
> > +
> > +    vi->start = vm->start;
> > +    vi->size = vm->size;
> > +    vi->memdev = object_get_canonical_path(OBJECT(vm->memdev));
> > +
> > +    info->u.virtio_pmem.data = vi;
> > +    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
> > +}
> > +
> > +static uint64_t virtio_pmem_md_get_addr(const MemoryDeviceState *md)
> > +{
> > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > +
> > +    return vm->start;
> > +}
> > +
> > +static uint64_t virtio_pmem_md_get_plugged_size(const MemoryDeviceState
> > *md)
> > +{
> > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > +
> > +    return vm->size;
> > +}
> > +
> > +static uint64_t virtio_pmem_md_get_region_size(const MemoryDeviceState
> > *md)
> > +{
> > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > +
> > +    return vm->size;
> > +}
> > +
> > +static void virtio_pmem_instance_init(Object *obj)
> > +{
> > +    VirtIOPMEM *vm = VIRTIO_PMEM(obj);
> > +    object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND,
> > +                                (Object **)&vm->memdev,
> > +                                (void *) virtio_mem_check_memdev,
> > +                                OBJ_PROP_LINK_STRONG,
> > +                                &error_abort);
> > +}
> > +
> > +
> > +static void virtio_pmem_class_init(ObjectClass *klass, void *data)
> > +{
> > +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
> > +    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
> > +
> > +    vdc->realize      =  virtio_pmem_realize;
> > +    vdc->get_config   =  virtio_pmem_get_config;
> > +    vdc->get_features =  virtio_pmem_get_features;
> > +
> > +    mdc->get_addr         = virtio_pmem_md_get_addr;
> > +    mdc->get_plugged_size = virtio_pmem_md_get_plugged_size;
> > +    mdc->get_region_size  = virtio_pmem_md_get_region_size;
> > +    mdc->fill_device_info = virtio_pmem_md_fill_device_info;
> > +}
> > +
> > +static TypeInfo virtio_pmem_info = {
> > +    .name          = TYPE_VIRTIO_PMEM,
> > +    .parent        = TYPE_VIRTIO_DEVICE,
> > +    .class_init    = virtio_pmem_class_init,
> > +    .instance_size = sizeof(VirtIOPMEM),
> > +    .instance_init = virtio_pmem_instance_init,
> > +    .interfaces = (InterfaceInfo[]) {
> > +        { TYPE_MEMORY_DEVICE },
> > +        { }
> > +  },
> > +};
> > +
> > +static void virtio_register_types(void)
> > +{
> > +    type_register_static(&virtio_pmem_info);
> > +}
> > +
> > +type_init(virtio_register_types)
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index 990d6fcbde..28829b6437 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -85,6 +85,7 @@ extern bool pci_available;
> >  #define PCI_DEVICE_ID_VIRTIO_RNG         0x1005
> >  #define PCI_DEVICE_ID_VIRTIO_9P          0x1009
> >  #define PCI_DEVICE_ID_VIRTIO_VSOCK       0x1012
> > +#define PCI_DEVICE_ID_VIRTIO_PMEM        0x1013
> >  
> >  #define PCI_VENDOR_ID_REDHAT             0x1b36
> >  #define PCI_DEVICE_ID_REDHAT_BRIDGE      0x0001
> > diff --git a/include/hw/virtio/virtio-pmem.h
> > b/include/hw/virtio/virtio-pmem.h
> > new file mode 100644
> > index 0000000000..fda3ee691c
> > --- /dev/null
> > +++ b/include/hw/virtio/virtio-pmem.h
> > @@ -0,0 +1,42 @@
> > +/*
> > + * Virtio pmem Device
> > + *
> > + * Copyright Red Hat, Inc. 2018
> > + * Copyright Pankaj Gupta <pagupta@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> > + * (at your option) any later version.  See the COPYING file in the
> > + * top-level directory.
> > + */
> > +
> > +#ifndef QEMU_VIRTIO_PMEM_H
> > +#define QEMU_VIRTIO_PMEM_H
> > +
> > +#include "hw/virtio/virtio.h"
> > +#include "exec/memory.h"
> > +#include "sysemu/hostmem.h"
> > +#include "standard-headers/linux/virtio_ids.h"
> > +#include "hw/boards.h"
> > +#include "hw/i386/pc.h"
> > +
> > +#define TYPE_VIRTIO_PMEM "virtio-pmem"
> > +
> > +#define VIRTIO_PMEM(obj) \
> > +        OBJECT_CHECK(VirtIOPMEM, (obj), TYPE_VIRTIO_PMEM)
> > +
> > +/* VirtIOPMEM device structure */
> > +typedef struct VirtIOPMEM {
> > +    VirtIODevice parent_obj;
> > +
> > +    VirtQueue *rq_vq;
> > +    uint64_t start;
> > +    uint64_t size;
> > +    MemoryRegion mr;
> > +    HostMemoryBackend *memdev;
> > +} VirtIOPMEM;
> > +
> > +struct virtio_pmem_config {
> > +    uint64_t start;
> > +    uint64_t size;
> > +};
> > +#endif
> > diff --git a/include/standard-headers/linux/virtio_ids.h
> > b/include/standard-headers/linux/virtio_ids.h
> > index 6d5c3b2d4f..346389565a 100644
> > --- a/include/standard-headers/linux/virtio_ids.h
> > +++ b/include/standard-headers/linux/virtio_ids.h
> > @@ -43,5 +43,6 @@
> >  #define VIRTIO_ID_INPUT        18 /* virtio input */
> >  #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
> >  #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
> > +#define VIRTIO_ID_PMEM         25 /* virtio pmem */
> >  
> >  #endif /* _LINUX_VIRTIO_IDS_H */
> > diff --git a/qapi/misc.json b/qapi/misc.json
> > index 29da7856e3..fb85dd6f6c 100644
> > --- a/qapi/misc.json
> > +++ b/qapi/misc.json
> > @@ -2907,6 +2907,29 @@
> >            }
> >  }
> >  
> > +##
> > +# @VirtioPMemDeviceInfo:
> > +#
> > +# VirtioPMem state information
> > +#
> > +# @id: device's ID
> > +#
> > +# @start: physical address, where device is mapped
> > +#
> > +# @size: size of memory that the device provides
> > +#
> > +# @memdev: memory backend linked with device
> > +#
> > +# Since: 2.13
> > +##
> > +{ 'struct': 'VirtioPMemDeviceInfo',
> > +  'data': { '*id': 'str',
> > +            'start': 'size',
> > +            'size': 'size',
> > +            'memdev': 'str'
> > +          }
> > +}
> > +
> >  ##
> >  # @MemoryDeviceInfo:
> >  #
> > @@ -2916,7 +2939,8 @@
> >  ##
> >  { 'union': 'MemoryDeviceInfo',
> >    'data': { 'dimm': 'PCDIMMDeviceInfo',
> > -            'nvdimm': 'PCDIMMDeviceInfo'
> > +            'nvdimm': 'PCDIMMDeviceInfo',
> > +	    'virtio-pmem': 'VirtioPMemDeviceInfo'
> >            }
> >  }
> >  
> 
> 
>
Stefan Hajnoczi July 19, 2018, 12:16 p.m. UTC | #3
On Thu, Jul 19, 2018 at 01:48:13AM -0400, Pankaj Gupta wrote:
> 
> > 
> > >  This patch adds virtio-pmem Qemu device.
> > > 
> > >  This device presents memory address range information to guest
> > >  which is backed by file backend type. It acts like persistent
> > >  memory device for KVM guest. Guest can perform read and persistent
> > >  write operations on this memory range with the help of DAX capable
> > >  filesystem.
> > > 
> > >  Persistent guest writes are assured with the help of virtio based
> > >  flushing interface. When guest userspace space performs fsync on
> > >  file fd on pmem device, a flush command is send to Qemu over VIRTIO
> > >  and host side flush/sync is done on backing image file.
> > > 
> > > Changes from RFC v2:
> > > - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> > >   call - Stefan
> > > - Use virtio_st*_p() for endianess - Stefan
> > > - Correct indentation in qapi/misc.json - Eric
> > > 
> > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > ---
> > >  hw/virtio/Makefile.objs                     |   3 +
> > >  hw/virtio/virtio-pci.c                      |  44 +++++
> > >  hw/virtio/virtio-pci.h                      |  14 ++
> > >  hw/virtio/virtio-pmem.c                     | 241
> > >  ++++++++++++++++++++++++++++
> > >  include/hw/pci/pci.h                        |   1 +
> > >  include/hw/virtio/virtio-pmem.h             |  42 +++++
> > >  include/standard-headers/linux/virtio_ids.h |   1 +
> > >  qapi/misc.json                              |  26 ++-
> > >  8 files changed, 371 insertions(+), 1 deletion(-)
> > >  create mode 100644 hw/virtio/virtio-pmem.c
> > >  create mode 100644 include/hw/virtio/virtio-pmem.h
> > > 
> > > diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> > > index 1b2799cfd8..7f914d45d0 100644
> > > --- a/hw/virtio/Makefile.objs
> > > +++ b/hw/virtio/Makefile.objs
> > > @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
> > >  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
> > >  virtio-crypto-pci.o
> > >  
> > >  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> > > +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> > > +obj-$(CONFIG_LINUX) += virtio-pmem.o
> > > +endif
> > >  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
> > >  endif
> > >  
> > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > index 3a01fe90f0..93d3fc05c7 100644
> > > --- a/hw/virtio/virtio-pci.c
> > > +++ b/hw/virtio/virtio-pci.c
> > > @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
> > >      .class_init    = virtio_rng_pci_class_init,
> > >  };
> > >  
> > > +/* virtio-pmem-pci */
> > > +
> > > +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
> > > **errp)
> > > +{
> > > +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> > > +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> > > +
> > > +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> > > +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> > > +}
> > > +
> > > +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> > > +{
> > > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > > +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> > > +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> > > +    k->realize = virtio_pmem_pci_realize;
> > > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > > +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> > > +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> > > +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> > > +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> > > +}
> > > +
> > > +static void virtio_pmem_pci_instance_init(Object *obj)
> > > +{
> > > +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> > > +
> > > +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> > > +                                TYPE_VIRTIO_PMEM);
> > > +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> > > +                              &error_abort);
> > > +}
> > > +
> > > +static const TypeInfo virtio_pmem_pci_info = {
> > > +    .name          = TYPE_VIRTIO_PMEM_PCI,
> > > +    .parent        = TYPE_VIRTIO_PCI,
> > > +    .instance_size = sizeof(VirtIOPMEMPCI),
> > > +    .instance_init = virtio_pmem_pci_instance_init,
> > > +    .class_init    = virtio_pmem_pci_class_init,
> > > +};
> > > +
> > > +
> > >  /* virtio-input-pci */
> > >  
> > >  static Property virtio_input_pci_properties[] = {
> > > @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
> > >      type_register_static(&virtio_balloon_pci_info);
> > >      type_register_static(&virtio_serial_pci_info);
> > >      type_register_static(&virtio_net_pci_info);
> > > +    type_register_static(&virtio_pmem_pci_info);
> > >  #ifdef CONFIG_VHOST_SCSI
> > >      type_register_static(&vhost_scsi_pci_info);
> > >  #endif
> > > diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> > > index 813082b0d7..fe74fcad3f 100644
> > > --- a/hw/virtio/virtio-pci.h
> > > +++ b/hw/virtio/virtio-pci.h
> > > @@ -19,6 +19,7 @@
> > >  #include "hw/virtio/virtio-blk.h"
> > >  #include "hw/virtio/virtio-net.h"
> > >  #include "hw/virtio/virtio-rng.h"
> > > +#include "hw/virtio/virtio-pmem.h"
> > >  #include "hw/virtio/virtio-serial.h"
> > >  #include "hw/virtio/virtio-scsi.h"
> > >  #include "hw/virtio/virtio-balloon.h"
> > > @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
> > >  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
> > >  typedef struct VHostVSockPCI VHostVSockPCI;
> > >  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> > > +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
> > >  
> > >  /* virtio-pci-bus */
> > >  
> > > @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
> > >      VirtIOBlock vdev;
> > >  };
> > >  
> > > +/*
> > > + * virtio-pmem-pci: This extends VirtioPCIProxy.
> > > + */
> > > +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> > > +#define VIRTIO_PMEM_PCI(obj) \
> > > +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> > > +
> > > +struct VirtIOPMEMPCI {
> > > +    VirtIOPCIProxy parent_obj;
> > > +    VirtIOPMEM vdev;
> > > +};
> > > +
> > >  /*
> > >   * virtio-balloon-pci: This extends VirtioPCIProxy.
> > >   */
> > > diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> > > new file mode 100644
> > > index 0000000000..08c96d7e80
> > > --- /dev/null
> > > +++ b/hw/virtio/virtio-pmem.c
> > > @@ -0,0 +1,241 @@
> > > +/*
> > > + * Virtio pmem device
> > > + *
> > > + * Copyright (C) 2018 Red Hat, Inc.
> > > + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.
> > > + * See the COPYING file in the top-level directory.
> > > + *
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qapi/error.h"
> > > +#include "qemu-common.h"
> > > +#include "qemu/error-report.h"
> > > +#include "hw/virtio/virtio-access.h"
> > > +#include "hw/virtio/virtio-pmem.h"
> > > +#include "hw/mem/memory-device.h"
> > > +#include "block/aio.h"
> > > +#include "block/thread-pool.h"
> > > +
> > > +typedef struct VirtIOPMEMresp {
> > > +    int ret;
> > > +} VirtIOPMEMResp;
> > > +
> > > +typedef struct VirtIODeviceRequest {
> > > +    VirtQueueElement elem;
> > > +    int fd;
> > > +    VirtIOPMEM *pmem;
> > > +    VirtIOPMEMResp resp;
> > > +} VirtIODeviceRequest;
> > > +
> > > +static int worker_cb(void *opaque)
> > > +{
> > > +    VirtIODeviceRequest *req = opaque;
> > > +    int err = 0;
> > > +
> > > +    /* flush raw backing image */
> > > +    err = fsync(req->fd);
> > > +    if (err != 0) {
> > > +        err = errno;
> > > +    }
> > > +    req->resp.ret = err;
> > 
> > Host question: are you returning the guest errno code to the host?
> 
> No. I am returning error code from the host in-case of host fsync
> failure, otherwise returning zero.

I think that's what Luiz meant.  errno constants are not portable
between operating systems and architectures.  Therefore they cannot be
used in external interfaces in software that expects to communicate with
other systems.

It will be necessary to define specific constants for virtio-pmem
instead of passing errno from the host to guest.

Stefan
Luiz Capitulino July 19, 2018, 12:39 p.m. UTC | #4
On Thu, 19 Jul 2018 01:48:13 -0400 (EDT)
Pankaj Gupta <pagupta@redhat.com> wrote:

> >   
> > >  This patch adds virtio-pmem Qemu device.
> > > 
> > >  This device presents memory address range information to guest
> > >  which is backed by file backend type. It acts like persistent
> > >  memory device for KVM guest. Guest can perform read and persistent
> > >  write operations on this memory range with the help of DAX capable
> > >  filesystem.
> > > 
> > >  Persistent guest writes are assured with the help of virtio based
> > >  flushing interface. When guest userspace space performs fsync on
> > >  file fd on pmem device, a flush command is send to Qemu over VIRTIO
> > >  and host side flush/sync is done on backing image file.
> > > 
> > > Changes from RFC v2:
> > > - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> > >   call - Stefan
> > > - Use virtio_st*_p() for endianess - Stefan
> > > - Correct indentation in qapi/misc.json - Eric
> > > 
> > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > ---
> > >  hw/virtio/Makefile.objs                     |   3 +
> > >  hw/virtio/virtio-pci.c                      |  44 +++++
> > >  hw/virtio/virtio-pci.h                      |  14 ++
> > >  hw/virtio/virtio-pmem.c                     | 241
> > >  ++++++++++++++++++++++++++++
> > >  include/hw/pci/pci.h                        |   1 +
> > >  include/hw/virtio/virtio-pmem.h             |  42 +++++
> > >  include/standard-headers/linux/virtio_ids.h |   1 +
> > >  qapi/misc.json                              |  26 ++-
> > >  8 files changed, 371 insertions(+), 1 deletion(-)
> > >  create mode 100644 hw/virtio/virtio-pmem.c
> > >  create mode 100644 include/hw/virtio/virtio-pmem.h
> > > 
> > > diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> > > index 1b2799cfd8..7f914d45d0 100644
> > > --- a/hw/virtio/Makefile.objs
> > > +++ b/hw/virtio/Makefile.objs
> > > @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
> > >  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
> > >  virtio-crypto-pci.o
> > >  
> > >  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> > > +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> > > +obj-$(CONFIG_LINUX) += virtio-pmem.o
> > > +endif
> > >  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
> > >  endif
> > >  
> > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > index 3a01fe90f0..93d3fc05c7 100644
> > > --- a/hw/virtio/virtio-pci.c
> > > +++ b/hw/virtio/virtio-pci.c
> > > @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
> > >      .class_init    = virtio_rng_pci_class_init,
> > >  };
> > >  
> > > +/* virtio-pmem-pci */
> > > +
> > > +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
> > > **errp)
> > > +{
> > > +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> > > +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> > > +
> > > +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> > > +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> > > +}
> > > +
> > > +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> > > +{
> > > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > > +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> > > +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> > > +    k->realize = virtio_pmem_pci_realize;
> > > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > > +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> > > +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> > > +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> > > +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> > > +}
> > > +
> > > +static void virtio_pmem_pci_instance_init(Object *obj)
> > > +{
> > > +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> > > +
> > > +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> > > +                                TYPE_VIRTIO_PMEM);
> > > +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> > > +                              &error_abort);
> > > +}
> > > +
> > > +static const TypeInfo virtio_pmem_pci_info = {
> > > +    .name          = TYPE_VIRTIO_PMEM_PCI,
> > > +    .parent        = TYPE_VIRTIO_PCI,
> > > +    .instance_size = sizeof(VirtIOPMEMPCI),
> > > +    .instance_init = virtio_pmem_pci_instance_init,
> > > +    .class_init    = virtio_pmem_pci_class_init,
> > > +};
> > > +
> > > +
> > >  /* virtio-input-pci */
> > >  
> > >  static Property virtio_input_pci_properties[] = {
> > > @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
> > >      type_register_static(&virtio_balloon_pci_info);
> > >      type_register_static(&virtio_serial_pci_info);
> > >      type_register_static(&virtio_net_pci_info);
> > > +    type_register_static(&virtio_pmem_pci_info);
> > >  #ifdef CONFIG_VHOST_SCSI
> > >      type_register_static(&vhost_scsi_pci_info);
> > >  #endif
> > > diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> > > index 813082b0d7..fe74fcad3f 100644
> > > --- a/hw/virtio/virtio-pci.h
> > > +++ b/hw/virtio/virtio-pci.h
> > > @@ -19,6 +19,7 @@
> > >  #include "hw/virtio/virtio-blk.h"
> > >  #include "hw/virtio/virtio-net.h"
> > >  #include "hw/virtio/virtio-rng.h"
> > > +#include "hw/virtio/virtio-pmem.h"
> > >  #include "hw/virtio/virtio-serial.h"
> > >  #include "hw/virtio/virtio-scsi.h"
> > >  #include "hw/virtio/virtio-balloon.h"
> > > @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
> > >  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
> > >  typedef struct VHostVSockPCI VHostVSockPCI;
> > >  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> > > +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
> > >  
> > >  /* virtio-pci-bus */
> > >  
> > > @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
> > >      VirtIOBlock vdev;
> > >  };
> > >  
> > > +/*
> > > + * virtio-pmem-pci: This extends VirtioPCIProxy.
> > > + */
> > > +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> > > +#define VIRTIO_PMEM_PCI(obj) \
> > > +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> > > +
> > > +struct VirtIOPMEMPCI {
> > > +    VirtIOPCIProxy parent_obj;
> > > +    VirtIOPMEM vdev;
> > > +};
> > > +
> > >  /*
> > >   * virtio-balloon-pci: This extends VirtioPCIProxy.
> > >   */
> > > diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> > > new file mode 100644
> > > index 0000000000..08c96d7e80
> > > --- /dev/null
> > > +++ b/hw/virtio/virtio-pmem.c
> > > @@ -0,0 +1,241 @@
> > > +/*
> > > + * Virtio pmem device
> > > + *
> > > + * Copyright (C) 2018 Red Hat, Inc.
> > > + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.
> > > + * See the COPYING file in the top-level directory.
> > > + *
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qapi/error.h"
> > > +#include "qemu-common.h"
> > > +#include "qemu/error-report.h"
> > > +#include "hw/virtio/virtio-access.h"
> > > +#include "hw/virtio/virtio-pmem.h"
> > > +#include "hw/mem/memory-device.h"
> > > +#include "block/aio.h"
> > > +#include "block/thread-pool.h"
> > > +
> > > +typedef struct VirtIOPMEMresp {
> > > +    int ret;
> > > +} VirtIOPMEMResp;
> > > +
> > > +typedef struct VirtIODeviceRequest {
> > > +    VirtQueueElement elem;
> > > +    int fd;
> > > +    VirtIOPMEM *pmem;
> > > +    VirtIOPMEMResp resp;
> > > +} VirtIODeviceRequest;
> > > +
> > > +static int worker_cb(void *opaque)
> > > +{
> > > +    VirtIODeviceRequest *req = opaque;
> > > +    int err = 0;
> > > +
> > > +    /* flush raw backing image */
> > > +    err = fsync(req->fd);
> > > +    if (err != 0) {
> > > +        err = errno;
> > > +    }
> > > +    req->resp.ret = err;  
> > 
> > Host question: are you returning the guest errno code to the host?  
> 
> No. I am returning error code from the host in-case of host fsync
> failure, otherwise returning zero.

Sorry, that's what I meant. I exchanged host and guest but what I
said still applies if you do s/host/guest

> 
> Thanks,
> Pankaj
> > 
> > If yes, I don't think this is right. I think you probably want to
> > define a constant for error and let the host decide on the errno
> > code to return to the application.
> >   
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static void done_cb(void *opaque, int ret)
> > > +{
> > > +    VirtIODeviceRequest *req = opaque;
> > > +    int len = iov_from_buf(req->elem.in_sg, req->elem.in_num, 0,
> > > +                              &req->resp, sizeof(VirtIOPMEMResp));
> > > +
> > > +    /* Callbacks are serialized, so no need to use atomic ops.  */
> > > +    virtqueue_push(req->pmem->rq_vq, &req->elem, len);
> > > +    virtio_notify((VirtIODevice *)req->pmem, req->pmem->rq_vq);
> > > +    g_free(req);
> > > +}
> > > +
> > > +static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
> > > +{
> > > +    VirtIODeviceRequest *req;
> > > +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> > > +    HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
> > > +    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
> > > +
> > > +    req = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
> > > +    if (!req) {
> > > +        virtio_error(vdev, "virtio-pmem missing request data");
> > > +        return;
> > > +    }
> > > +
> > > +    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
> > > +        virtio_error(vdev, "virtio-pmem request not proper");
> > > +        g_free(req);
> > > +        return;
> > > +    }
> > > +    req->fd = memory_region_get_fd(&backend->mr);
> > > +    req->pmem = pmem;
> > > +    thread_pool_submit_aio(pool, worker_cb, req, done_cb, req);
> > > +}
> > > +
> > > +static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
> > > +{
> > > +    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
> > > +    struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *)
> > > config;
> > > +
> > > +    virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
> > > +    virtio_stq_p(vdev, &pmemcfg->size, pmem->size);
> > > +}
> > > +
> > > +static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t
> > > features,
> > > +                                        Error **errp)
> > > +{
> > > +    return features;
> > > +}
> > > +
> > > +static void virtio_pmem_realize(DeviceState *dev, Error **errp)
> > > +{
> > > +    VirtIODevice   *vdev   = VIRTIO_DEVICE(dev);
> > > +    VirtIOPMEM     *pmem   = VIRTIO_PMEM(dev);
> > > +    MachineState   *ms     = MACHINE(qdev_get_machine());
> > > +    uint64_t align;
> > > +    Error *local_err = NULL;
> > > +    MemoryRegion *mr;
> > > +
> > > +    if (!pmem->memdev) {
> > > +        error_setg(errp, "virtio-pmem memdev not set");
> > > +        return;
> > > +    }
> > > +
> > > +    mr  = host_memory_backend_get_memory(pmem->memdev);
> > > +    align = memory_region_get_alignment(mr);
> > > +    pmem->size = QEMU_ALIGN_DOWN(memory_region_size(mr), align);
> > > +    pmem->start = memory_device_get_free_addr(ms, NULL, align, pmem->size,
> > > +
> > > &local_err);
> > > +    if (local_err) {
> > > +        error_setg(errp, "Can't get free address in mem device");
> > > +        return;
> > > +    }
> > > +    memory_region_init_alias(&pmem->mr, OBJECT(pmem),
> > > +                             "virtio_pmem-memory", mr, 0, pmem->size);
> > > +    memory_device_plug_region(ms, &pmem->mr, pmem->start);
> > > +
> > > +    host_memory_backend_set_mapped(pmem->memdev, true);
> > > +    virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
> > > +                                          sizeof(struct
> > > virtio_pmem_config));
> > > +    pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
> > > +}
> > > +
> > > +static void virtio_mem_check_memdev(Object *obj, const char *name, Object
> > > *val,
> > > +                                    Error **errp)
> > > +{
> > > +    if (host_memory_backend_is_mapped(MEMORY_BACKEND(val))) {
> > > +        char *path = object_get_canonical_path_component(val);
> > > +        error_setg(errp, "Can't use already busy memdev: %s", path);
> > > +        g_free(path);
> > > +        return;
> > > +    }
> > > +
> > > +    qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
> > > +}
> > > +
> > > +static const char *virtio_pmem_get_device_id(VirtIOPMEM *vm)
> > > +{
> > > +    Object *obj = OBJECT(vm);
> > > +    DeviceState *parent_dev;
> > > +
> > > +    /* always use the ID of the proxy device */
> > > +    if (obj->parent && object_dynamic_cast(obj->parent, TYPE_DEVICE)) {
> > > +        parent_dev = DEVICE(obj->parent);
> > > +        return parent_dev->id;
> > > +    }
> > > +    return NULL;
> > > +}
> > > +
> > > +static void virtio_pmem_md_fill_device_info(const MemoryDeviceState *md,
> > > +                                           MemoryDeviceInfo *info)
> > > +{
> > > +    VirtioPMemDeviceInfo *vi = g_new0(VirtioPMemDeviceInfo, 1);
> > > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > > +    const char *id = virtio_pmem_get_device_id(vm);
> > > +
> > > +    if (id) {
> > > +        vi->has_id = true;
> > > +        vi->id = g_strdup(id);
> > > +    }
> > > +
> > > +    vi->start = vm->start;
> > > +    vi->size = vm->size;
> > > +    vi->memdev = object_get_canonical_path(OBJECT(vm->memdev));
> > > +
> > > +    info->u.virtio_pmem.data = vi;
> > > +    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
> > > +}
> > > +
> > > +static uint64_t virtio_pmem_md_get_addr(const MemoryDeviceState *md)
> > > +{
> > > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > > +
> > > +    return vm->start;
> > > +}
> > > +
> > > +static uint64_t virtio_pmem_md_get_plugged_size(const MemoryDeviceState
> > > *md)
> > > +{
> > > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > > +
> > > +    return vm->size;
> > > +}
> > > +
> > > +static uint64_t virtio_pmem_md_get_region_size(const MemoryDeviceState
> > > *md)
> > > +{
> > > +    VirtIOPMEM *vm = VIRTIO_PMEM(md);
> > > +
> > > +    return vm->size;
> > > +}
> > > +
> > > +static void virtio_pmem_instance_init(Object *obj)
> > > +{
> > > +    VirtIOPMEM *vm = VIRTIO_PMEM(obj);
> > > +    object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND,
> > > +                                (Object **)&vm->memdev,
> > > +                                (void *) virtio_mem_check_memdev,
> > > +                                OBJ_PROP_LINK_STRONG,
> > > +                                &error_abort);
> > > +}
> > > +
> > > +
> > > +static void virtio_pmem_class_init(ObjectClass *klass, void *data)
> > > +{
> > > +    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
> > > +    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
> > > +
> > > +    vdc->realize      =  virtio_pmem_realize;
> > > +    vdc->get_config   =  virtio_pmem_get_config;
> > > +    vdc->get_features =  virtio_pmem_get_features;
> > > +
> > > +    mdc->get_addr         = virtio_pmem_md_get_addr;
> > > +    mdc->get_plugged_size = virtio_pmem_md_get_plugged_size;
> > > +    mdc->get_region_size  = virtio_pmem_md_get_region_size;
> > > +    mdc->fill_device_info = virtio_pmem_md_fill_device_info;
> > > +}
> > > +
> > > +static TypeInfo virtio_pmem_info = {
> > > +    .name          = TYPE_VIRTIO_PMEM,
> > > +    .parent        = TYPE_VIRTIO_DEVICE,
> > > +    .class_init    = virtio_pmem_class_init,
> > > +    .instance_size = sizeof(VirtIOPMEM),
> > > +    .instance_init = virtio_pmem_instance_init,
> > > +    .interfaces = (InterfaceInfo[]) {
> > > +        { TYPE_MEMORY_DEVICE },
> > > +        { }
> > > +  },
> > > +};
> > > +
> > > +static void virtio_register_types(void)
> > > +{
> > > +    type_register_static(&virtio_pmem_info);
> > > +}
> > > +
> > > +type_init(virtio_register_types)
> > > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > > index 990d6fcbde..28829b6437 100644
> > > --- a/include/hw/pci/pci.h
> > > +++ b/include/hw/pci/pci.h
> > > @@ -85,6 +85,7 @@ extern bool pci_available;
> > >  #define PCI_DEVICE_ID_VIRTIO_RNG         0x1005
> > >  #define PCI_DEVICE_ID_VIRTIO_9P          0x1009
> > >  #define PCI_DEVICE_ID_VIRTIO_VSOCK       0x1012
> > > +#define PCI_DEVICE_ID_VIRTIO_PMEM        0x1013
> > >  
> > >  #define PCI_VENDOR_ID_REDHAT             0x1b36
> > >  #define PCI_DEVICE_ID_REDHAT_BRIDGE      0x0001
> > > diff --git a/include/hw/virtio/virtio-pmem.h
> > > b/include/hw/virtio/virtio-pmem.h
> > > new file mode 100644
> > > index 0000000000..fda3ee691c
> > > --- /dev/null
> > > +++ b/include/hw/virtio/virtio-pmem.h
> > > @@ -0,0 +1,42 @@
> > > +/*
> > > + * Virtio pmem Device
> > > + *
> > > + * Copyright Red Hat, Inc. 2018
> > > + * Copyright Pankaj Gupta <pagupta@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or
> > > + * (at your option) any later version.  See the COPYING file in the
> > > + * top-level directory.
> > > + */
> > > +
> > > +#ifndef QEMU_VIRTIO_PMEM_H
> > > +#define QEMU_VIRTIO_PMEM_H
> > > +
> > > +#include "hw/virtio/virtio.h"
> > > +#include "exec/memory.h"
> > > +#include "sysemu/hostmem.h"
> > > +#include "standard-headers/linux/virtio_ids.h"
> > > +#include "hw/boards.h"
> > > +#include "hw/i386/pc.h"
> > > +
> > > +#define TYPE_VIRTIO_PMEM "virtio-pmem"
> > > +
> > > +#define VIRTIO_PMEM(obj) \
> > > +        OBJECT_CHECK(VirtIOPMEM, (obj), TYPE_VIRTIO_PMEM)
> > > +
> > > +/* VirtIOPMEM device structure */
> > > +typedef struct VirtIOPMEM {
> > > +    VirtIODevice parent_obj;
> > > +
> > > +    VirtQueue *rq_vq;
> > > +    uint64_t start;
> > > +    uint64_t size;
> > > +    MemoryRegion mr;
> > > +    HostMemoryBackend *memdev;
> > > +} VirtIOPMEM;
> > > +
> > > +struct virtio_pmem_config {
> > > +    uint64_t start;
> > > +    uint64_t size;
> > > +};
> > > +#endif
> > > diff --git a/include/standard-headers/linux/virtio_ids.h
> > > b/include/standard-headers/linux/virtio_ids.h
> > > index 6d5c3b2d4f..346389565a 100644
> > > --- a/include/standard-headers/linux/virtio_ids.h
> > > +++ b/include/standard-headers/linux/virtio_ids.h
> > > @@ -43,5 +43,6 @@
> > >  #define VIRTIO_ID_INPUT        18 /* virtio input */
> > >  #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
> > >  #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
> > > +#define VIRTIO_ID_PMEM         25 /* virtio pmem */
> > >  
> > >  #endif /* _LINUX_VIRTIO_IDS_H */
> > > diff --git a/qapi/misc.json b/qapi/misc.json
> > > index 29da7856e3..fb85dd6f6c 100644
> > > --- a/qapi/misc.json
> > > +++ b/qapi/misc.json
> > > @@ -2907,6 +2907,29 @@
> > >            }
> > >  }
> > >  
> > > +##
> > > +# @VirtioPMemDeviceInfo:
> > > +#
> > > +# VirtioPMem state information
> > > +#
> > > +# @id: device's ID
> > > +#
> > > +# @start: physical address, where device is mapped
> > > +#
> > > +# @size: size of memory that the device provides
> > > +#
> > > +# @memdev: memory backend linked with device
> > > +#
> > > +# Since: 2.13
> > > +##
> > > +{ 'struct': 'VirtioPMemDeviceInfo',
> > > +  'data': { '*id': 'str',
> > > +            'start': 'size',
> > > +            'size': 'size',
> > > +            'memdev': 'str'
> > > +          }
> > > +}
> > > +
> > >  ##
> > >  # @MemoryDeviceInfo:
> > >  #
> > > @@ -2916,7 +2939,8 @@
> > >  ##
> > >  { 'union': 'MemoryDeviceInfo',
> > >    'data': { 'dimm': 'PCDIMMDeviceInfo',
> > > -            'nvdimm': 'PCDIMMDeviceInfo'
> > > +            'nvdimm': 'PCDIMMDeviceInfo',
> > > +	    'virtio-pmem': 'VirtioPMemDeviceInfo'
> > >            }
> > >  }
> > >    
> > 
> > 
> >   
>
Luiz Capitulino July 19, 2018, 12:48 p.m. UTC | #5
On Thu, 19 Jul 2018 13:16:35 +0100
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Thu, Jul 19, 2018 at 01:48:13AM -0400, Pankaj Gupta wrote:
> >   
> > >   
> > > >  This patch adds virtio-pmem Qemu device.
> > > > 
> > > >  This device presents memory address range information to guest
> > > >  which is backed by file backend type. It acts like persistent
> > > >  memory device for KVM guest. Guest can perform read and persistent
> > > >  write operations on this memory range with the help of DAX capable
> > > >  filesystem.
> > > > 
> > > >  Persistent guest writes are assured with the help of virtio based
> > > >  flushing interface. When guest userspace space performs fsync on
> > > >  file fd on pmem device, a flush command is send to Qemu over VIRTIO
> > > >  and host side flush/sync is done on backing image file.
> > > > 
> > > > Changes from RFC v2:
> > > > - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> > > >   call - Stefan
> > > > - Use virtio_st*_p() for endianess - Stefan
> > > > - Correct indentation in qapi/misc.json - Eric
> > > > 
> > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > > ---
> > > >  hw/virtio/Makefile.objs                     |   3 +
> > > >  hw/virtio/virtio-pci.c                      |  44 +++++
> > > >  hw/virtio/virtio-pci.h                      |  14 ++
> > > >  hw/virtio/virtio-pmem.c                     | 241
> > > >  ++++++++++++++++++++++++++++
> > > >  include/hw/pci/pci.h                        |   1 +
> > > >  include/hw/virtio/virtio-pmem.h             |  42 +++++
> > > >  include/standard-headers/linux/virtio_ids.h |   1 +
> > > >  qapi/misc.json                              |  26 ++-
> > > >  8 files changed, 371 insertions(+), 1 deletion(-)
> > > >  create mode 100644 hw/virtio/virtio-pmem.c
> > > >  create mode 100644 include/hw/virtio/virtio-pmem.h
> > > > 
> > > > diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> > > > index 1b2799cfd8..7f914d45d0 100644
> > > > --- a/hw/virtio/Makefile.objs
> > > > +++ b/hw/virtio/Makefile.objs
> > > > @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
> > > >  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
> > > >  virtio-crypto-pci.o
> > > >  
> > > >  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> > > > +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> > > > +obj-$(CONFIG_LINUX) += virtio-pmem.o
> > > > +endif
> > > >  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
> > > >  endif
> > > >  
> > > > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> > > > index 3a01fe90f0..93d3fc05c7 100644
> > > > --- a/hw/virtio/virtio-pci.c
> > > > +++ b/hw/virtio/virtio-pci.c
> > > > @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
> > > >      .class_init    = virtio_rng_pci_class_init,
> > > >  };
> > > >  
> > > > +/* virtio-pmem-pci */
> > > > +
> > > > +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
> > > > **errp)
> > > > +{
> > > > +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> > > > +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> > > > +
> > > > +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> > > > +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> > > > +}
> > > > +
> > > > +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> > > > +{
> > > > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > > > +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> > > > +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> > > > +    k->realize = virtio_pmem_pci_realize;
> > > > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > > > +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> > > > +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> > > > +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> > > > +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> > > > +}
> > > > +
> > > > +static void virtio_pmem_pci_instance_init(Object *obj)
> > > > +{
> > > > +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> > > > +
> > > > +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> > > > +                                TYPE_VIRTIO_PMEM);
> > > > +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> > > > +                              &error_abort);
> > > > +}
> > > > +
> > > > +static const TypeInfo virtio_pmem_pci_info = {
> > > > +    .name          = TYPE_VIRTIO_PMEM_PCI,
> > > > +    .parent        = TYPE_VIRTIO_PCI,
> > > > +    .instance_size = sizeof(VirtIOPMEMPCI),
> > > > +    .instance_init = virtio_pmem_pci_instance_init,
> > > > +    .class_init    = virtio_pmem_pci_class_init,
> > > > +};
> > > > +
> > > > +
> > > >  /* virtio-input-pci */
> > > >  
> > > >  static Property virtio_input_pci_properties[] = {
> > > > @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
> > > >      type_register_static(&virtio_balloon_pci_info);
> > > >      type_register_static(&virtio_serial_pci_info);
> > > >      type_register_static(&virtio_net_pci_info);
> > > > +    type_register_static(&virtio_pmem_pci_info);
> > > >  #ifdef CONFIG_VHOST_SCSI
> > > >      type_register_static(&vhost_scsi_pci_info);
> > > >  #endif
> > > > diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> > > > index 813082b0d7..fe74fcad3f 100644
> > > > --- a/hw/virtio/virtio-pci.h
> > > > +++ b/hw/virtio/virtio-pci.h
> > > > @@ -19,6 +19,7 @@
> > > >  #include "hw/virtio/virtio-blk.h"
> > > >  #include "hw/virtio/virtio-net.h"
> > > >  #include "hw/virtio/virtio-rng.h"
> > > > +#include "hw/virtio/virtio-pmem.h"
> > > >  #include "hw/virtio/virtio-serial.h"
> > > >  #include "hw/virtio/virtio-scsi.h"
> > > >  #include "hw/virtio/virtio-balloon.h"
> > > > @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
> > > >  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
> > > >  typedef struct VHostVSockPCI VHostVSockPCI;
> > > >  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> > > > +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
> > > >  
> > > >  /* virtio-pci-bus */
> > > >  
> > > > @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
> > > >      VirtIOBlock vdev;
> > > >  };
> > > >  
> > > > +/*
> > > > + * virtio-pmem-pci: This extends VirtioPCIProxy.
> > > > + */
> > > > +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> > > > +#define VIRTIO_PMEM_PCI(obj) \
> > > > +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> > > > +
> > > > +struct VirtIOPMEMPCI {
> > > > +    VirtIOPCIProxy parent_obj;
> > > > +    VirtIOPMEM vdev;
> > > > +};
> > > > +
> > > >  /*
> > > >   * virtio-balloon-pci: This extends VirtioPCIProxy.
> > > >   */
> > > > diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> > > > new file mode 100644
> > > > index 0000000000..08c96d7e80
> > > > --- /dev/null
> > > > +++ b/hw/virtio/virtio-pmem.c
> > > > @@ -0,0 +1,241 @@
> > > > +/*
> > > > + * Virtio pmem device
> > > > + *
> > > > + * Copyright (C) 2018 Red Hat, Inc.
> > > > + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2.
> > > > + * See the COPYING file in the top-level directory.
> > > > + *
> > > > + */
> > > > +
> > > > +#include "qemu/osdep.h"
> > > > +#include "qapi/error.h"
> > > > +#include "qemu-common.h"
> > > > +#include "qemu/error-report.h"
> > > > +#include "hw/virtio/virtio-access.h"
> > > > +#include "hw/virtio/virtio-pmem.h"
> > > > +#include "hw/mem/memory-device.h"
> > > > +#include "block/aio.h"
> > > > +#include "block/thread-pool.h"
> > > > +
> > > > +typedef struct VirtIOPMEMresp {
> > > > +    int ret;
> > > > +} VirtIOPMEMResp;
> > > > +
> > > > +typedef struct VirtIODeviceRequest {
> > > > +    VirtQueueElement elem;
> > > > +    int fd;
> > > > +    VirtIOPMEM *pmem;
> > > > +    VirtIOPMEMResp resp;
> > > > +} VirtIODeviceRequest;
> > > > +
> > > > +static int worker_cb(void *opaque)
> > > > +{
> > > > +    VirtIODeviceRequest *req = opaque;
> > > > +    int err = 0;
> > > > +
> > > > +    /* flush raw backing image */
> > > > +    err = fsync(req->fd);
> > > > +    if (err != 0) {
> > > > +        err = errno;
> > > > +    }
> > > > +    req->resp.ret = err;  
> > > 
> > > Host question: are you returning the guest errno code to the host?  
> > 
> > No. I am returning error code from the host in-case of host fsync
> > failure, otherwise returning zero.  
> 
> I think that's what Luiz meant.  errno constants are not portable
> between operating systems and architectures.  Therefore they cannot be
> used in external interfaces in software that expects to communicate with
> other systems.

Oh, thanks. Only saw this email now.

> It will be necessary to define specific constants for virtio-pmem
> instead of passing errno from the host to guest.

Yes, defining your own constants work. But I think the only fsync()
error that will make sense for the guest is EIO. The other errors
only make sense for the host.
Luiz Capitulino July 19, 2018, 12:57 p.m. UTC | #6
On Thu, 19 Jul 2018 08:48:19 -0400
Luiz Capitulino <lcapitulino@redhat.com> wrote:

> > It will be necessary to define specific constants for virtio-pmem
> > instead of passing errno from the host to guest.  
> 
> Yes, defining your own constants work. But I think the only fsync()
> error that will make sense for the guest is EIO. The other errors
> only make sense for the host.

Just to clarify: of course you'll return an error to guest on any
fsync() error. But maybe you should always return EIO even if the error
was EBADF for example. Or just signal the error with some constant,
and let the guest implementation pick any errno it prefers (this was
my first suggestion).
David Hildenbrand July 19, 2018, 1:58 p.m. UTC | #7
On 19.07.2018 14:16, Stefan Hajnoczi wrote:
> On Thu, Jul 19, 2018 at 01:48:13AM -0400, Pankaj Gupta wrote:
>>
>>>
>>>>  This patch adds virtio-pmem Qemu device.
>>>>
>>>>  This device presents memory address range information to guest
>>>>  which is backed by file backend type. It acts like persistent
>>>>  memory device for KVM guest. Guest can perform read and persistent
>>>>  write operations on this memory range with the help of DAX capable
>>>>  filesystem.
>>>>
>>>>  Persistent guest writes are assured with the help of virtio based
>>>>  flushing interface. When guest userspace space performs fsync on
>>>>  file fd on pmem device, a flush command is send to Qemu over VIRTIO
>>>>  and host side flush/sync is done on backing image file.
>>>>
>>>> Changes from RFC v2:
>>>> - Use aio_worker() to avoid Qemu from hanging with blocking fsync
>>>>   call - Stefan
>>>> - Use virtio_st*_p() for endianess - Stefan
>>>> - Correct indentation in qapi/misc.json - Eric
>>>>
>>>> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
>>>> ---
>>>>  hw/virtio/Makefile.objs                     |   3 +
>>>>  hw/virtio/virtio-pci.c                      |  44 +++++
>>>>  hw/virtio/virtio-pci.h                      |  14 ++
>>>>  hw/virtio/virtio-pmem.c                     | 241
>>>>  ++++++++++++++++++++++++++++
>>>>  include/hw/pci/pci.h                        |   1 +
>>>>  include/hw/virtio/virtio-pmem.h             |  42 +++++
>>>>  include/standard-headers/linux/virtio_ids.h |   1 +
>>>>  qapi/misc.json                              |  26 ++-
>>>>  8 files changed, 371 insertions(+), 1 deletion(-)
>>>>  create mode 100644 hw/virtio/virtio-pmem.c
>>>>  create mode 100644 include/hw/virtio/virtio-pmem.h
>>>>
>>>> diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
>>>> index 1b2799cfd8..7f914d45d0 100644
>>>> --- a/hw/virtio/Makefile.objs
>>>> +++ b/hw/virtio/Makefile.objs
>>>> @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
>>>>  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
>>>>  virtio-crypto-pci.o
>>>>  
>>>>  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
>>>> +ifeq ($(CONFIG_MEM_HOTPLUG),y)
>>>> +obj-$(CONFIG_LINUX) += virtio-pmem.o
>>>> +endif
>>>>  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
>>>>  endif
>>>>  
>>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
>>>> index 3a01fe90f0..93d3fc05c7 100644
>>>> --- a/hw/virtio/virtio-pci.c
>>>> +++ b/hw/virtio/virtio-pci.c
>>>> @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
>>>>      .class_init    = virtio_rng_pci_class_init,
>>>>  };
>>>>  
>>>> +/* virtio-pmem-pci */
>>>> +
>>>> +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
>>>> **errp)
>>>> +{
>>>> +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
>>>> +    DeviceState *vdev = DEVICE(&vpmem->vdev);
>>>> +
>>>> +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
>>>> +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
>>>> +}
>>>> +
>>>> +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
>>>> +{
>>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>>>> +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
>>>> +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
>>>> +    k->realize = virtio_pmem_pci_realize;
>>>> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>>>> +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
>>>> +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
>>>> +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
>>>> +    pcidev_k->class_id = PCI_CLASS_OTHERS;
>>>> +}
>>>> +
>>>> +static void virtio_pmem_pci_instance_init(Object *obj)
>>>> +{
>>>> +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
>>>> +
>>>> +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
>>>> +                                TYPE_VIRTIO_PMEM);
>>>> +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
>>>> +                              &error_abort);
>>>> +}
>>>> +
>>>> +static const TypeInfo virtio_pmem_pci_info = {
>>>> +    .name          = TYPE_VIRTIO_PMEM_PCI,
>>>> +    .parent        = TYPE_VIRTIO_PCI,
>>>> +    .instance_size = sizeof(VirtIOPMEMPCI),
>>>> +    .instance_init = virtio_pmem_pci_instance_init,
>>>> +    .class_init    = virtio_pmem_pci_class_init,
>>>> +};
>>>> +
>>>> +
>>>>  /* virtio-input-pci */
>>>>  
>>>>  static Property virtio_input_pci_properties[] = {
>>>> @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
>>>>      type_register_static(&virtio_balloon_pci_info);
>>>>      type_register_static(&virtio_serial_pci_info);
>>>>      type_register_static(&virtio_net_pci_info);
>>>> +    type_register_static(&virtio_pmem_pci_info);
>>>>  #ifdef CONFIG_VHOST_SCSI
>>>>      type_register_static(&vhost_scsi_pci_info);
>>>>  #endif
>>>> diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
>>>> index 813082b0d7..fe74fcad3f 100644
>>>> --- a/hw/virtio/virtio-pci.h
>>>> +++ b/hw/virtio/virtio-pci.h
>>>> @@ -19,6 +19,7 @@
>>>>  #include "hw/virtio/virtio-blk.h"
>>>>  #include "hw/virtio/virtio-net.h"
>>>>  #include "hw/virtio/virtio-rng.h"
>>>> +#include "hw/virtio/virtio-pmem.h"
>>>>  #include "hw/virtio/virtio-serial.h"
>>>>  #include "hw/virtio/virtio-scsi.h"
>>>>  #include "hw/virtio/virtio-balloon.h"
>>>> @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
>>>>  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
>>>>  typedef struct VHostVSockPCI VHostVSockPCI;
>>>>  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
>>>> +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
>>>>  
>>>>  /* virtio-pci-bus */
>>>>  
>>>> @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
>>>>      VirtIOBlock vdev;
>>>>  };
>>>>  
>>>> +/*
>>>> + * virtio-pmem-pci: This extends VirtioPCIProxy.
>>>> + */
>>>> +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
>>>> +#define VIRTIO_PMEM_PCI(obj) \
>>>> +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
>>>> +
>>>> +struct VirtIOPMEMPCI {
>>>> +    VirtIOPCIProxy parent_obj;
>>>> +    VirtIOPMEM vdev;
>>>> +};
>>>> +
>>>>  /*
>>>>   * virtio-balloon-pci: This extends VirtioPCIProxy.
>>>>   */
>>>> diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
>>>> new file mode 100644
>>>> index 0000000000..08c96d7e80
>>>> --- /dev/null
>>>> +++ b/hw/virtio/virtio-pmem.c
>>>> @@ -0,0 +1,241 @@
>>>> +/*
>>>> + * Virtio pmem device
>>>> + *
>>>> + * Copyright (C) 2018 Red Hat, Inc.
>>>> + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2.
>>>> + * See the COPYING file in the top-level directory.
>>>> + *
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qapi/error.h"
>>>> +#include "qemu-common.h"
>>>> +#include "qemu/error-report.h"
>>>> +#include "hw/virtio/virtio-access.h"
>>>> +#include "hw/virtio/virtio-pmem.h"
>>>> +#include "hw/mem/memory-device.h"
>>>> +#include "block/aio.h"
>>>> +#include "block/thread-pool.h"
>>>> +
>>>> +typedef struct VirtIOPMEMresp {
>>>> +    int ret;
>>>> +} VirtIOPMEMResp;
>>>> +
>>>> +typedef struct VirtIODeviceRequest {
>>>> +    VirtQueueElement elem;
>>>> +    int fd;
>>>> +    VirtIOPMEM *pmem;
>>>> +    VirtIOPMEMResp resp;
>>>> +} VirtIODeviceRequest;
>>>> +
>>>> +static int worker_cb(void *opaque)
>>>> +{
>>>> +    VirtIODeviceRequest *req = opaque;
>>>> +    int err = 0;
>>>> +
>>>> +    /* flush raw backing image */
>>>> +    err = fsync(req->fd);
>>>> +    if (err != 0) {
>>>> +        err = errno;
>>>> +    }
>>>> +    req->resp.ret = err;
>>>
>>> Host question: are you returning the guest errno code to the host?
>>
>> No. I am returning error code from the host in-case of host fsync
>> failure, otherwise returning zero.
> 
> I think that's what Luiz meant.  errno constants are not portable
> between operating systems and architectures.  Therefore they cannot be
> used in external interfaces in software that expects to communicate with
> other systems.
> 
> It will be necessary to define specific constants for virtio-pmem
> instead of passing errno from the host to guest.
> 

In general, I wonder if we should report errors at all or rather *kill*
the guest. That might sound harsh, but think about the following scenario:

fsync() fails due to some block that cannot e.g. be written (e.g.
network connection failed). What happens if our guest tries to
read/write that mmaped block? (e.g. network connection failed).

I assume we'll get a signal an get killed? So we are trying to optimize
one special case (fsync()) although every read/write is prone to kill
the guest. And as soon as the guest will try to access the block that
made fsync fail, we will crash the guest either way.

I assume the main problem is that we are trying to take a file (with all
the errors that can happen during read/write/fsync) and make it look
like memory (dax). On ordinary block access, we can forward errors, but
not if it's memory (maybe using MCE, but it's complicated and
architecture specific).

So I wonder if we should rather assume that our backend file is placed
on some stable storage that cannot easily fail.

(we might have the same problem with NVDIMM right now, at least the
memory reading/writing part)

It's complicated and I am not a block level expert :)

> Stefan
>
Luiz Capitulino July 19, 2018, 3:48 p.m. UTC | #8
On Thu, 19 Jul 2018 15:58:20 +0200
David Hildenbrand <david@redhat.com> wrote:

> On 19.07.2018 14:16, Stefan Hajnoczi wrote:
> > On Thu, Jul 19, 2018 at 01:48:13AM -0400, Pankaj Gupta wrote:  
> >>  
> >>>  
> >>>>  This patch adds virtio-pmem Qemu device.
> >>>>
> >>>>  This device presents memory address range information to guest
> >>>>  which is backed by file backend type. It acts like persistent
> >>>>  memory device for KVM guest. Guest can perform read and persistent
> >>>>  write operations on this memory range with the help of DAX capable
> >>>>  filesystem.
> >>>>
> >>>>  Persistent guest writes are assured with the help of virtio based
> >>>>  flushing interface. When guest userspace space performs fsync on
> >>>>  file fd on pmem device, a flush command is send to Qemu over VIRTIO
> >>>>  and host side flush/sync is done on backing image file.
> >>>>
> >>>> Changes from RFC v2:
> >>>> - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> >>>>   call - Stefan
> >>>> - Use virtio_st*_p() for endianess - Stefan
> >>>> - Correct indentation in qapi/misc.json - Eric
> >>>>
> >>>> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> >>>> ---
> >>>>  hw/virtio/Makefile.objs                     |   3 +
> >>>>  hw/virtio/virtio-pci.c                      |  44 +++++
> >>>>  hw/virtio/virtio-pci.h                      |  14 ++
> >>>>  hw/virtio/virtio-pmem.c                     | 241
> >>>>  ++++++++++++++++++++++++++++
> >>>>  include/hw/pci/pci.h                        |   1 +
> >>>>  include/hw/virtio/virtio-pmem.h             |  42 +++++
> >>>>  include/standard-headers/linux/virtio_ids.h |   1 +
> >>>>  qapi/misc.json                              |  26 ++-
> >>>>  8 files changed, 371 insertions(+), 1 deletion(-)
> >>>>  create mode 100644 hw/virtio/virtio-pmem.c
> >>>>  create mode 100644 include/hw/virtio/virtio-pmem.h
> >>>>
> >>>> diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> >>>> index 1b2799cfd8..7f914d45d0 100644
> >>>> --- a/hw/virtio/Makefile.objs
> >>>> +++ b/hw/virtio/Makefile.objs
> >>>> @@ -10,6 +10,9 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
> >>>>  obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) +=
> >>>>  virtio-crypto-pci.o
> >>>>  
> >>>>  obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
> >>>> +ifeq ($(CONFIG_MEM_HOTPLUG),y)
> >>>> +obj-$(CONFIG_LINUX) += virtio-pmem.o
> >>>> +endif
> >>>>  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
> >>>>  endif
> >>>>  
> >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> >>>> index 3a01fe90f0..93d3fc05c7 100644
> >>>> --- a/hw/virtio/virtio-pci.c
> >>>> +++ b/hw/virtio/virtio-pci.c
> >>>> @@ -2521,6 +2521,49 @@ static const TypeInfo virtio_rng_pci_info = {
> >>>>      .class_init    = virtio_rng_pci_class_init,
> >>>>  };
> >>>>  
> >>>> +/* virtio-pmem-pci */
> >>>> +
> >>>> +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error
> >>>> **errp)
> >>>> +{
> >>>> +    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
> >>>> +    DeviceState *vdev = DEVICE(&vpmem->vdev);
> >>>> +
> >>>> +    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
> >>>> +    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> >>>> +}
> >>>> +
> >>>> +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
> >>>> +{
> >>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
> >>>> +    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> >>>> +    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> >>>> +    k->realize = virtio_pmem_pci_realize;
> >>>> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> >>>> +    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> >>>> +    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
> >>>> +    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> >>>> +    pcidev_k->class_id = PCI_CLASS_OTHERS;
> >>>> +}
> >>>> +
> >>>> +static void virtio_pmem_pci_instance_init(Object *obj)
> >>>> +{
> >>>> +    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
> >>>> +
> >>>> +    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
> >>>> +                                TYPE_VIRTIO_PMEM);
> >>>> +    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
> >>>> +                              &error_abort);
> >>>> +}
> >>>> +
> >>>> +static const TypeInfo virtio_pmem_pci_info = {
> >>>> +    .name          = TYPE_VIRTIO_PMEM_PCI,
> >>>> +    .parent        = TYPE_VIRTIO_PCI,
> >>>> +    .instance_size = sizeof(VirtIOPMEMPCI),
> >>>> +    .instance_init = virtio_pmem_pci_instance_init,
> >>>> +    .class_init    = virtio_pmem_pci_class_init,
> >>>> +};
> >>>> +
> >>>> +
> >>>>  /* virtio-input-pci */
> >>>>  
> >>>>  static Property virtio_input_pci_properties[] = {
> >>>> @@ -2714,6 +2757,7 @@ static void virtio_pci_register_types(void)
> >>>>      type_register_static(&virtio_balloon_pci_info);
> >>>>      type_register_static(&virtio_serial_pci_info);
> >>>>      type_register_static(&virtio_net_pci_info);
> >>>> +    type_register_static(&virtio_pmem_pci_info);
> >>>>  #ifdef CONFIG_VHOST_SCSI
> >>>>      type_register_static(&vhost_scsi_pci_info);
> >>>>  #endif
> >>>> diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
> >>>> index 813082b0d7..fe74fcad3f 100644
> >>>> --- a/hw/virtio/virtio-pci.h
> >>>> +++ b/hw/virtio/virtio-pci.h
> >>>> @@ -19,6 +19,7 @@
> >>>>  #include "hw/virtio/virtio-blk.h"
> >>>>  #include "hw/virtio/virtio-net.h"
> >>>>  #include "hw/virtio/virtio-rng.h"
> >>>> +#include "hw/virtio/virtio-pmem.h"
> >>>>  #include "hw/virtio/virtio-serial.h"
> >>>>  #include "hw/virtio/virtio-scsi.h"
> >>>>  #include "hw/virtio/virtio-balloon.h"
> >>>> @@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
> >>>>  typedef struct VirtIOGPUPCI VirtIOGPUPCI;
> >>>>  typedef struct VHostVSockPCI VHostVSockPCI;
> >>>>  typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
> >>>> +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
> >>>>  
> >>>>  /* virtio-pci-bus */
> >>>>  
> >>>> @@ -274,6 +276,18 @@ struct VirtIOBlkPCI {
> >>>>      VirtIOBlock vdev;
> >>>>  };
> >>>>  
> >>>> +/*
> >>>> + * virtio-pmem-pci: This extends VirtioPCIProxy.
> >>>> + */
> >>>> +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
> >>>> +#define VIRTIO_PMEM_PCI(obj) \
> >>>> +        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
> >>>> +
> >>>> +struct VirtIOPMEMPCI {
> >>>> +    VirtIOPCIProxy parent_obj;
> >>>> +    VirtIOPMEM vdev;
> >>>> +};
> >>>> +
> >>>>  /*
> >>>>   * virtio-balloon-pci: This extends VirtioPCIProxy.
> >>>>   */
> >>>> diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> >>>> new file mode 100644
> >>>> index 0000000000..08c96d7e80
> >>>> --- /dev/null
> >>>> +++ b/hw/virtio/virtio-pmem.c
> >>>> @@ -0,0 +1,241 @@
> >>>> +/*
> >>>> + * Virtio pmem device
> >>>> + *
> >>>> + * Copyright (C) 2018 Red Hat, Inc.
> >>>> + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> >>>> + *
> >>>> + * This work is licensed under the terms of the GNU GPL, version 2.
> >>>> + * See the COPYING file in the top-level directory.
> >>>> + *
> >>>> + */
> >>>> +
> >>>> +#include "qemu/osdep.h"
> >>>> +#include "qapi/error.h"
> >>>> +#include "qemu-common.h"
> >>>> +#include "qemu/error-report.h"
> >>>> +#include "hw/virtio/virtio-access.h"
> >>>> +#include "hw/virtio/virtio-pmem.h"
> >>>> +#include "hw/mem/memory-device.h"
> >>>> +#include "block/aio.h"
> >>>> +#include "block/thread-pool.h"
> >>>> +
> >>>> +typedef struct VirtIOPMEMresp {
> >>>> +    int ret;
> >>>> +} VirtIOPMEMResp;
> >>>> +
> >>>> +typedef struct VirtIODeviceRequest {
> >>>> +    VirtQueueElement elem;
> >>>> +    int fd;
> >>>> +    VirtIOPMEM *pmem;
> >>>> +    VirtIOPMEMResp resp;
> >>>> +} VirtIODeviceRequest;
> >>>> +
> >>>> +static int worker_cb(void *opaque)
> >>>> +{
> >>>> +    VirtIODeviceRequest *req = opaque;
> >>>> +    int err = 0;
> >>>> +
> >>>> +    /* flush raw backing image */
> >>>> +    err = fsync(req->fd);
> >>>> +    if (err != 0) {
> >>>> +        err = errno;
> >>>> +    }
> >>>> +    req->resp.ret = err;  
> >>>
> >>> Host question: are you returning the guest errno code to the host?  
> >>
> >> No. I am returning error code from the host in-case of host fsync
> >> failure, otherwise returning zero.  
> > 
> > I think that's what Luiz meant.  errno constants are not portable
> > between operating systems and architectures.  Therefore they cannot be
> > used in external interfaces in software that expects to communicate with
> > other systems.
> > 
> > It will be necessary to define specific constants for virtio-pmem
> > instead of passing errno from the host to guest.
> >   
> 
> In general, I wonder if we should report errors at all or rather *kill*
> the guest. That might sound harsh, but think about the following scenario:

I almost sure that I read in the nvdimm spec that real hardware will
cause a memory error on sync or read/write error. If we truly want
to emulate this, then I guess QEMU should be able to inject a memory
error for the entire region instead of returning the fsync() error
to the guest.

> fsync() fails due to some block that cannot e.g. be written (e.g.
> network connection failed). What happens if our guest tries to
> read/write that mmaped block? (e.g. network connection failed).

I think it gets a SIGBUS? Btw, I think that QEMU already has the
machinery to turn a SIGBUS into an memory error in the guest.

> I assume we'll get a signal an get killed? So we are trying to optimize
> one special case (fsync()) although every read/write is prone to kill
> the guest. And as soon as the guest will try to access the block that
> made fsync fail, we will crash the guest either way.

I think you have a point.

> 
> I assume the main problem is that we are trying to take a file (with all
> the errors that can happen during read/write/fsync) and make it look
> like memory (dax). On ordinary block access, we can forward errors, but
> not if it's memory (maybe using MCE, but it's complicated and
> architecture specific).
> 
> So I wonder if we should rather assume that our backend file is placed
> on some stable storage that cannot easily fail.
> 
> (we might have the same problem with NVDIMM right now, at least the
> memory reading/writing part)
> 
> It's complicated and I am not a block level expert :)
> 
> > Stefan
> >   
> 
>
Pankaj Gupta July 20, 2018, 1:02 p.m. UTC | #9
> >>>>  /*
> >>>>   * virtio-balloon-pci: This extends VirtioPCIProxy.
> >>>>   */
> >>>> diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> >>>> new file mode 100644
> >>>> index 0000000000..08c96d7e80
> >>>> --- /dev/null
> >>>> +++ b/hw/virtio/virtio-pmem.c
> >>>> @@ -0,0 +1,241 @@
> >>>> +/*
> >>>> + * Virtio pmem device
> >>>> + *
> >>>> + * Copyright (C) 2018 Red Hat, Inc.
> >>>> + * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
> >>>> + *
> >>>> + * This work is licensed under the terms of the GNU GPL, version 2.
> >>>> + * See the COPYING file in the top-level directory.
> >>>> + *
> >>>> + */
> >>>> +
> >>>> +#include "qemu/osdep.h"
> >>>> +#include "qapi/error.h"
> >>>> +#include "qemu-common.h"
> >>>> +#include "qemu/error-report.h"
> >>>> +#include "hw/virtio/virtio-access.h"
> >>>> +#include "hw/virtio/virtio-pmem.h"
> >>>> +#include "hw/mem/memory-device.h"
> >>>> +#include "block/aio.h"
> >>>> +#include "block/thread-pool.h"
> >>>> +
> >>>> +typedef struct VirtIOPMEMresp {
> >>>> +    int ret;
> >>>> +} VirtIOPMEMResp;
> >>>> +
> >>>> +typedef struct VirtIODeviceRequest {
> >>>> +    VirtQueueElement elem;
> >>>> +    int fd;
> >>>> +    VirtIOPMEM *pmem;
> >>>> +    VirtIOPMEMResp resp;
> >>>> +} VirtIODeviceRequest;
> >>>> +
> >>>> +static int worker_cb(void *opaque)
> >>>> +{
> >>>> +    VirtIODeviceRequest *req = opaque;
> >>>> +    int err = 0;
> >>>> +
> >>>> +    /* flush raw backing image */
> >>>> +    err = fsync(req->fd);
> >>>> +    if (err != 0) {
> >>>> +        err = errno;
> >>>> +    }
> >>>> +    req->resp.ret = err;
> >>>
> >>> Host question: are you returning the guest errno code to the host?
> >>
> >> No. I am returning error code from the host in-case of host fsync
> >> failure, otherwise returning zero.
> > 
> > I think that's what Luiz meant.  errno constants are not portable
> > between operating systems and architectures.  Therefore they cannot be
> > used in external interfaces in software that expects to communicate with
> > other systems.
> > 
> > It will be necessary to define specific constants for virtio-pmem
> > instead of passing errno from the host to guest.
> > 
> 
> In general, I wonder if we should report errors at all or rather *kill*
> the guest. That might sound harsh, but think about the following scenario:
> 
> fsync() fails due to some block that cannot e.g. be written (e.g.
> network connection failed). What happens if our guest tries to
> read/write that mmaped block? (e.g. network connection failed).
> 
> I assume we'll get a signal an get killed? So we are trying to optimize
> one special case (fsync()) although every read/write is prone to kill
> the guest. And as soon as the guest will try to access the block that
> made fsync fail, we will crash the guest either way.
> 
> I assume the main problem is that we are trying to take a file (with all
> the errors that can happen during read/write/fsync) and make it look
> like memory (dax). On ordinary block access, we can forward errors, but
> not if it's memory (maybe using MCE, but it's complicated and
> architecture specific).

There are two points which you highlighted:

1] Memory hardware errors:
These type of errors will be notified by MCA. If mce is non-recoverable, KVM gets 
SIG_BUS when hardware detects such error and injects mce in guest vCPU. If guest 
does not recoverable it can decide to kill the user-space process. 

Default option for mce is '1':
1: panic or SIGBUS on uncorrected errors, log corrected errors

2] read/write/fsync failure because of (network connection failure):
I assume you are talking about something like NFS mount where read/write/fsync
responsibility is taken care by NFS. This scenario can happen for any application
accessing a network filesystem and return appropriate error or wait. Until 'fsync' 
is not performed there is no guarantee ram data is backed. I think its
the responsibility of application to perform fsync after write operation or 
a transaction.
 
> 
> So I wonder if we should rather assume that our backend file is placed
> on some stable storage that cannot easily fail.
> 
> (we might have the same problem with NVDIMM right now, at least the
> memory reading/writing part)

NVDIMM NFIT handles this handler and checks if any SPA falls in the range
of mce:address. It creates a list of bad blocks(corresponding to nd_region) and handle 
in function 'pmem_do_bvec' used by 'pmem_mem_request' & 'pmem_read_write'.

void nfit_mce_register(void)
{
        mce_register_decode_chain(&nfit_mce_dec);
}

In 'fake DAX', we bypass NFIT ACPI and using virtio & nvdimm_bus way of registering
memory region. By default it should kill the userspace process or at worst cause guest reboot.
I am thinking how we can integrate the NFIT bad block handling with mce handler approach
for fake DAX. I think we can do this. But I want inputs from NVDIMM guys?

Thanks,
Pankaj

> 
> It's complicated and I am not a block level expert :)
Pankaj Gupta July 20, 2018, 1:04 p.m. UTC | #10
> > > > > +
> > > > > +typedef struct VirtIOPMEMresp {
> > > > > +    int ret;
> > > > > +} VirtIOPMEMResp;
> > > > > +
> > > > > +typedef struct VirtIODeviceRequest {
> > > > > +    VirtQueueElement elem;
> > > > > +    int fd;
> > > > > +    VirtIOPMEM *pmem;
> > > > > +    VirtIOPMEMResp resp;
> > > > > +} VirtIODeviceRequest;
> > > > > +
> > > > > +static int worker_cb(void *opaque)
> > > > > +{
> > > > > +    VirtIODeviceRequest *req = opaque;
> > > > > +    int err = 0;
> > > > > +
> > > > > +    /* flush raw backing image */
> > > > > +    err = fsync(req->fd);
> > > > > +    if (err != 0) {
> > > > > +        err = errno;
> > > > > +    }
> > > > > +    req->resp.ret = err;
> > > > 
> > > > Host question: are you returning the guest errno code to the host?
> > > 
> > > No. I am returning error code from the host in-case of host fsync
> > > failure, otherwise returning zero.
> > 
> > I think that's what Luiz meant.  errno constants are not portable
> > between operating systems and architectures.  Therefore they cannot be
> > used in external interfaces in software that expects to communicate with
> > other systems.
> 
> Oh, thanks. Only saw this email now.
> 
> > It will be necessary to define specific constants for virtio-pmem
> > instead of passing errno from the host to guest.
> 
> Yes, defining your own constants work. But I think the only fsync()
> error that will make sense for the guest is EIO. The other errors
> only make sense for the host.

Agree.

Thanks,
Pankaj
Eric Blake July 24, 2018, 4:13 p.m. UTC | #11
On 07/13/2018 02:52 AM, Pankaj Gupta wrote:
>   This patch adds virtio-pmem Qemu device.
> 
>   This device presents memory address range information to guest
>   which is backed by file backend type. It acts like persistent
>   memory device for KVM guest. Guest can perform read and persistent
>   write operations on this memory range with the help of DAX capable
>   filesystem.
> 
>   Persistent guest writes are assured with the help of virtio based
>   flushing interface. When guest userspace space performs fsync on
>   file fd on pmem device, a flush command is send to Qemu over VIRTIO
>   and host side flush/sync is done on backing image file.
> 
> Changes from RFC v2:

This patch has no n/M in the subject line; but is included in a thread 
that also has a 0/2 cover letter, as well as 1/2 and 2/2 patches in 
separate mails.  Is that intentional?

When sending revision notes on a specific patch, it's best to place them...

> - Use aio_worker() to avoid Qemu from hanging with blocking fsync
>    call - Stefan
> - Use virtio_st*_p() for endianess - Stefan
> - Correct indentation in qapi/misc.json - Eric
> 
> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> ---

...here, after the --- separator. They are useful to reviewers on the 
list, but are stripped by 'git am' as they don't need to be part of the 
git history (a year from now, we won't care how many iterations the 
patch went through during review, only what actually landed).


> +++ b/qapi/misc.json
> @@ -2907,6 +2907,29 @@
>             }
>   }
>   
> +##
> +# @VirtioPMemDeviceInfo:
> +#
> +# VirtioPMem state information
> +#
> +# @id: device's ID
> +#
> +# @start: physical address, where device is mapped
> +#
> +# @size: size of memory that the device provides
> +#
> +# @memdev: memory backend linked with device
> +#
> +# Since: 2.13

There is no 2.13 release, and you've missed the 3.0 window.  Please 
update this and any other version reference to 3.1.
Pankaj Gupta July 25, 2018, 5:01 a.m. UTC | #12
Hi Eric,

> 
> On 07/13/2018 02:52 AM, Pankaj Gupta wrote:
> >   This patch adds virtio-pmem Qemu device.
> > 
> >   This device presents memory address range information to guest
> >   which is backed by file backend type. It acts like persistent
> >   memory device for KVM guest. Guest can perform read and persistent
> >   write operations on this memory range with the help of DAX capable
> >   filesystem.
> > 
> >   Persistent guest writes are assured with the help of virtio based
> >   flushing interface. When guest userspace space performs fsync on
> >   file fd on pmem device, a flush command is send to Qemu over VIRTIO
> >   and host side flush/sync is done on backing image file.
> > 
> > Changes from RFC v2:
> 
> This patch has no n/M in the subject line; but is included in a thread
> that also has a 0/2 cover letter, as well as 1/2 and 2/2 patches in
> separate mails.  Is that intentional?

Yes, kernel series has 0-2 patches and Qemu has this one. I thought its
good to keep separate numbering for both the sets.  
> 
> When sending revision notes on a specific patch, it's best to place them...

Sure.
> 
> > - Use aio_worker() to avoid Qemu from hanging with blocking fsync
> >    call - Stefan
> > - Use virtio_st*_p() for endianess - Stefan
> > - Correct indentation in qapi/misc.json - Eric
> > 
> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > ---
> 
> ...here, after the --- separator. They are useful to reviewers on the
> list, but are stripped by 'git am' as they don't need to be part of the
> git history (a year from now, we won't care how many iterations the
> patch went through during review, only what actually landed).
> 
> 
> > +++ b/qapi/misc.json
> > @@ -2907,6 +2907,29 @@
> >             }
> >   }
> >   
> > +##
> > +# @VirtioPMemDeviceInfo:
> > +#
> > +# VirtioPMem state information
> > +#
> > +# @id: device's ID
> > +#
> > +# @start: physical address, where device is mapped
> > +#
> > +# @size: size of memory that the device provides
> > +#
> > +# @memdev: memory backend linked with device
> > +#
> > +# Since: 2.13
> 
> There is no 2.13 release, and you've missed the 3.0 window.  Please
> update this and any other version reference to 3.1.

okay.

Thanks,
Pankaj
Eric Blake July 25, 2018, 12:19 p.m. UTC | #13
On 07/25/2018 12:01 AM, Pankaj Gupta wrote:

>>
>> This patch has no n/M in the subject line; but is included in a thread
>> that also has a 0/2 cover letter, as well as 1/2 and 2/2 patches in
>> separate mails.  Is that intentional?
> 
> Yes, kernel series has 0-2 patches and Qemu has this one. I thought its
> good to keep separate numbering for both the sets.

Ah, that makes sense. The cover letter didn't make it obvious to me that 
this was two separate series to two projects, but related enough that 
both series have to be incorporated for the feature to work and thus 
cross-posted under a single cover letter.
Pankaj Gupta July 25, 2018, 12:47 p.m. UTC | #14
> 
> >>
> >> This patch has no n/M in the subject line; but is included in a thread
> >> that also has a 0/2 cover letter, as well as 1/2 and 2/2 patches in
> >> separate mails.  Is that intentional?
> > 
> > Yes, kernel series has 0-2 patches and Qemu has this one. I thought its
> > good to keep separate numbering for both the sets.
> 
> Ah, that makes sense. The cover letter didn't make it obvious to me that
> this was two separate series to two projects, but related enough that
> both series have to be incorporated for the feature to work and thus
> cross-posted under a single cover letter.

Will try to make it more clear in next posting.

Thanks,
Pankaj
diff mbox

Patch

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 1b2799cfd8..7f914d45d0 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -10,6 +10,9 @@  obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-pci.o
 
 obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
+ifeq ($(CONFIG_MEM_HOTPLUG),y)
+obj-$(CONFIG_LINUX) += virtio-pmem.o
+endif
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 endif
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 3a01fe90f0..93d3fc05c7 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -2521,6 +2521,49 @@  static const TypeInfo virtio_rng_pci_info = {
     .class_init    = virtio_rng_pci_class_init,
 };
 
+/* virtio-pmem-pci */
+
+static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VirtIOPMEMPCI *vpmem = VIRTIO_PMEM_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&vpmem->vdev);
+
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+    k->realize = virtio_pmem_pci_realize;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void virtio_pmem_pci_instance_init(Object *obj)
+{
+    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_PMEM);
+    object_property_add_alias(obj, "memdev", OBJECT(&dev->vdev), "memdev",
+                              &error_abort);
+}
+
+static const TypeInfo virtio_pmem_pci_info = {
+    .name          = TYPE_VIRTIO_PMEM_PCI,
+    .parent        = TYPE_VIRTIO_PCI,
+    .instance_size = sizeof(VirtIOPMEMPCI),
+    .instance_init = virtio_pmem_pci_instance_init,
+    .class_init    = virtio_pmem_pci_class_init,
+};
+
+
 /* virtio-input-pci */
 
 static Property virtio_input_pci_properties[] = {
@@ -2714,6 +2757,7 @@  static void virtio_pci_register_types(void)
     type_register_static(&virtio_balloon_pci_info);
     type_register_static(&virtio_serial_pci_info);
     type_register_static(&virtio_net_pci_info);
+    type_register_static(&virtio_pmem_pci_info);
 #ifdef CONFIG_VHOST_SCSI
     type_register_static(&vhost_scsi_pci_info);
 #endif
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 813082b0d7..fe74fcad3f 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -19,6 +19,7 @@ 
 #include "hw/virtio/virtio-blk.h"
 #include "hw/virtio/virtio-net.h"
 #include "hw/virtio/virtio-rng.h"
+#include "hw/virtio/virtio-pmem.h"
 #include "hw/virtio/virtio-serial.h"
 #include "hw/virtio/virtio-scsi.h"
 #include "hw/virtio/virtio-balloon.h"
@@ -57,6 +58,7 @@  typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
 typedef struct VirtIOGPUPCI VirtIOGPUPCI;
 typedef struct VHostVSockPCI VHostVSockPCI;
 typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
+typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
 
 /* virtio-pci-bus */
 
@@ -274,6 +276,18 @@  struct VirtIOBlkPCI {
     VirtIOBlock vdev;
 };
 
+/*
+ * virtio-pmem-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci"
+#define VIRTIO_PMEM_PCI(obj) \
+        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
+
+struct VirtIOPMEMPCI {
+    VirtIOPCIProxy parent_obj;
+    VirtIOPMEM vdev;
+};
+
 /*
  * virtio-balloon-pci: This extends VirtioPCIProxy.
  */
diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
new file mode 100644
index 0000000000..08c96d7e80
--- /dev/null
+++ b/hw/virtio/virtio-pmem.c
@@ -0,0 +1,241 @@ 
+/*
+ * Virtio pmem device
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ * Copyright (C) 2018 Pankaj Gupta <pagupta@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-pmem.h"
+#include "hw/mem/memory-device.h"
+#include "block/aio.h"
+#include "block/thread-pool.h"
+
+typedef struct VirtIOPMEMresp {
+    int ret;
+} VirtIOPMEMResp;
+
+typedef struct VirtIODeviceRequest {
+    VirtQueueElement elem;
+    int fd;
+    VirtIOPMEM *pmem;
+    VirtIOPMEMResp resp;
+} VirtIODeviceRequest;
+
+static int worker_cb(void *opaque)
+{
+    VirtIODeviceRequest *req = opaque;
+    int err = 0;
+
+    /* flush raw backing image */
+    err = fsync(req->fd);
+    if (err != 0) {
+        err = errno;
+    }
+    req->resp.ret = err;
+
+    return 0;
+}
+
+static void done_cb(void *opaque, int ret)
+{
+    VirtIODeviceRequest *req = opaque;
+    int len = iov_from_buf(req->elem.in_sg, req->elem.in_num, 0,
+                              &req->resp, sizeof(VirtIOPMEMResp));
+
+    /* Callbacks are serialized, so no need to use atomic ops.  */
+    virtqueue_push(req->pmem->rq_vq, &req->elem, len);
+    virtio_notify((VirtIODevice *)req->pmem, req->pmem->rq_vq);
+    g_free(req);
+}
+
+static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIODeviceRequest *req;
+    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+    HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
+    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+
+    req = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
+    if (!req) {
+        virtio_error(vdev, "virtio-pmem missing request data");
+        return;
+    }
+
+    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
+        virtio_error(vdev, "virtio-pmem request not proper");
+        g_free(req);
+        return;
+    }
+    req->fd = memory_region_get_fd(&backend->mr);
+    req->pmem = pmem;
+    thread_pool_submit_aio(pool, worker_cb, req, done_cb, req);
+}
+
+static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+    struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *) config;
+
+    virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
+    virtio_stq_p(vdev, &pmemcfg->size, pmem->size);
+}
+
+static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t features,
+                                        Error **errp)
+{
+    return features;
+}
+
+static void virtio_pmem_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice   *vdev   = VIRTIO_DEVICE(dev);
+    VirtIOPMEM     *pmem   = VIRTIO_PMEM(dev);
+    MachineState   *ms     = MACHINE(qdev_get_machine());
+    uint64_t align;
+    Error *local_err = NULL;
+    MemoryRegion *mr;
+
+    if (!pmem->memdev) {
+        error_setg(errp, "virtio-pmem memdev not set");
+        return;
+    }
+
+    mr  = host_memory_backend_get_memory(pmem->memdev);
+    align = memory_region_get_alignment(mr);
+    pmem->size = QEMU_ALIGN_DOWN(memory_region_size(mr), align);
+    pmem->start = memory_device_get_free_addr(ms, NULL, align, pmem->size,
+                                                               &local_err);
+    if (local_err) {
+        error_setg(errp, "Can't get free address in mem device");
+        return;
+    }
+    memory_region_init_alias(&pmem->mr, OBJECT(pmem),
+                             "virtio_pmem-memory", mr, 0, pmem->size);
+    memory_device_plug_region(ms, &pmem->mr, pmem->start);
+
+    host_memory_backend_set_mapped(pmem->memdev, true);
+    virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
+                                          sizeof(struct virtio_pmem_config));
+    pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
+}
+
+static void virtio_mem_check_memdev(Object *obj, const char *name, Object *val,
+                                    Error **errp)
+{
+    if (host_memory_backend_is_mapped(MEMORY_BACKEND(val))) {
+        char *path = object_get_canonical_path_component(val);
+        error_setg(errp, "Can't use already busy memdev: %s", path);
+        g_free(path);
+        return;
+    }
+
+    qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
+}
+
+static const char *virtio_pmem_get_device_id(VirtIOPMEM *vm)
+{
+    Object *obj = OBJECT(vm);
+    DeviceState *parent_dev;
+
+    /* always use the ID of the proxy device */
+    if (obj->parent && object_dynamic_cast(obj->parent, TYPE_DEVICE)) {
+        parent_dev = DEVICE(obj->parent);
+        return parent_dev->id;
+    }
+    return NULL;
+}
+
+static void virtio_pmem_md_fill_device_info(const MemoryDeviceState *md,
+                                           MemoryDeviceInfo *info)
+{
+    VirtioPMemDeviceInfo *vi = g_new0(VirtioPMemDeviceInfo, 1);
+    VirtIOPMEM *vm = VIRTIO_PMEM(md);
+    const char *id = virtio_pmem_get_device_id(vm);
+
+    if (id) {
+        vi->has_id = true;
+        vi->id = g_strdup(id);
+    }
+
+    vi->start = vm->start;
+    vi->size = vm->size;
+    vi->memdev = object_get_canonical_path(OBJECT(vm->memdev));
+
+    info->u.virtio_pmem.data = vi;
+    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
+}
+
+static uint64_t virtio_pmem_md_get_addr(const MemoryDeviceState *md)
+{
+    VirtIOPMEM *vm = VIRTIO_PMEM(md);
+
+    return vm->start;
+}
+
+static uint64_t virtio_pmem_md_get_plugged_size(const MemoryDeviceState *md)
+{
+    VirtIOPMEM *vm = VIRTIO_PMEM(md);
+
+    return vm->size;
+}
+
+static uint64_t virtio_pmem_md_get_region_size(const MemoryDeviceState *md)
+{
+    VirtIOPMEM *vm = VIRTIO_PMEM(md);
+
+    return vm->size;
+}
+
+static void virtio_pmem_instance_init(Object *obj)
+{
+    VirtIOPMEM *vm = VIRTIO_PMEM(obj);
+    object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND,
+                                (Object **)&vm->memdev,
+                                (void *) virtio_mem_check_memdev,
+                                OBJ_PROP_LINK_STRONG,
+                                &error_abort);
+}
+
+
+static void virtio_pmem_class_init(ObjectClass *klass, void *data)
+{
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+    vdc->realize      =  virtio_pmem_realize;
+    vdc->get_config   =  virtio_pmem_get_config;
+    vdc->get_features =  virtio_pmem_get_features;
+
+    mdc->get_addr         = virtio_pmem_md_get_addr;
+    mdc->get_plugged_size = virtio_pmem_md_get_plugged_size;
+    mdc->get_region_size  = virtio_pmem_md_get_region_size;
+    mdc->fill_device_info = virtio_pmem_md_fill_device_info;
+}
+
+static TypeInfo virtio_pmem_info = {
+    .name          = TYPE_VIRTIO_PMEM,
+    .parent        = TYPE_VIRTIO_DEVICE,
+    .class_init    = virtio_pmem_class_init,
+    .instance_size = sizeof(VirtIOPMEM),
+    .instance_init = virtio_pmem_instance_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_MEMORY_DEVICE },
+        { }
+  },
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&virtio_pmem_info);
+}
+
+type_init(virtio_register_types)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 990d6fcbde..28829b6437 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -85,6 +85,7 @@  extern bool pci_available;
 #define PCI_DEVICE_ID_VIRTIO_RNG         0x1005
 #define PCI_DEVICE_ID_VIRTIO_9P          0x1009
 #define PCI_DEVICE_ID_VIRTIO_VSOCK       0x1012
+#define PCI_DEVICE_ID_VIRTIO_PMEM        0x1013
 
 #define PCI_VENDOR_ID_REDHAT             0x1b36
 #define PCI_DEVICE_ID_REDHAT_BRIDGE      0x0001
diff --git a/include/hw/virtio/virtio-pmem.h b/include/hw/virtio/virtio-pmem.h
new file mode 100644
index 0000000000..fda3ee691c
--- /dev/null
+++ b/include/hw/virtio/virtio-pmem.h
@@ -0,0 +1,42 @@ 
+/*
+ * Virtio pmem Device
+ *
+ * Copyright Red Hat, Inc. 2018
+ * Copyright Pankaj Gupta <pagupta@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef QEMU_VIRTIO_PMEM_H
+#define QEMU_VIRTIO_PMEM_H
+
+#include "hw/virtio/virtio.h"
+#include "exec/memory.h"
+#include "sysemu/hostmem.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "hw/boards.h"
+#include "hw/i386/pc.h"
+
+#define TYPE_VIRTIO_PMEM "virtio-pmem"
+
+#define VIRTIO_PMEM(obj) \
+        OBJECT_CHECK(VirtIOPMEM, (obj), TYPE_VIRTIO_PMEM)
+
+/* VirtIOPMEM device structure */
+typedef struct VirtIOPMEM {
+    VirtIODevice parent_obj;
+
+    VirtQueue *rq_vq;
+    uint64_t start;
+    uint64_t size;
+    MemoryRegion mr;
+    HostMemoryBackend *memdev;
+} VirtIOPMEM;
+
+struct virtio_pmem_config {
+    uint64_t start;
+    uint64_t size;
+};
+#endif
diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h
index 6d5c3b2d4f..346389565a 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -43,5 +43,6 @@ 
 #define VIRTIO_ID_INPUT        18 /* virtio input */
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
+#define VIRTIO_ID_PMEM         25 /* virtio pmem */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/qapi/misc.json b/qapi/misc.json
index 29da7856e3..fb85dd6f6c 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -2907,6 +2907,29 @@ 
           }
 }
 
+##
+# @VirtioPMemDeviceInfo:
+#
+# VirtioPMem state information
+#
+# @id: device's ID
+#
+# @start: physical address, where device is mapped
+#
+# @size: size of memory that the device provides
+#
+# @memdev: memory backend linked with device
+#
+# Since: 2.13
+##
+{ 'struct': 'VirtioPMemDeviceInfo',
+  'data': { '*id': 'str',
+            'start': 'size',
+            'size': 'size',
+            'memdev': 'str'
+          }
+}
+
 ##
 # @MemoryDeviceInfo:
 #
@@ -2916,7 +2939,8 @@ 
 ##
 { 'union': 'MemoryDeviceInfo',
   'data': { 'dimm': 'PCDIMMDeviceInfo',
-            'nvdimm': 'PCDIMMDeviceInfo'
+            'nvdimm': 'PCDIMMDeviceInfo',
+	    'virtio-pmem': 'VirtioPMemDeviceInfo'
           }
 }