Message ID | 20171012155027.3277-3-pagupta@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Oct 12, 2017 at 8:50 AM, Pankaj Gupta <pagupta@redhat.com> wrote: > This patch adds virtio-pmem driver for KVM guest. > Guest reads the persistent memory range information > over virtio bus from Qemu and reserves the range > as persistent memory. Guest also allocates a block > device corresponding to the pmem range which later > can be accessed with DAX compatible file systems. > Idea is to use the virtio channel between guest and > host to perform the block device flush for guest pmem > DAX device. > > There is work to do including DAX file system support > and other advanced features. > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > --- > drivers/virtio/Kconfig | 10 ++ > drivers/virtio/Makefile | 1 + > drivers/virtio/virtio_pmem.c | 322 +++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/virtio_pmem.h | 55 +++++++ > 4 files changed, 388 insertions(+) > create mode 100644 drivers/virtio/virtio_pmem.c > create mode 100644 include/uapi/linux/virtio_pmem.h > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > index cff773f15b7e..0192c4bda54b 100644 > --- a/drivers/virtio/Kconfig > +++ b/drivers/virtio/Kconfig > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > If unsure, say Y. > > +config VIRTIO_PMEM > + tristate "Virtio pmem driver" > + depends on VIRTIO > + ---help--- > + This driver adds persistent memory range within a KVM guest. I think we need to call this something other than persistent memory to make it clear that this not memory where the persistence can be managed from userspace. The persistence point always requires a driver call, so this is something distinctly different than "persistent memory". For example, it's a bug if this memory range ends up backing a device-dax range in the guest where there is no such thing as a driver callback to perform the flushing. How does this solution protect against that scenario? > + It also associates a block device corresponding to the pmem > + range. > + > + If unsure, say M. 
> + > config VIRTIO_BALLOON > tristate "Virtio balloon driver" > depends on VIRTIO > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > index 41e30e3dc842..032ade725cc2 100644 > --- a/drivers/virtio/Makefile > +++ b/drivers/virtio/Makefile > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o > obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o > obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c > new file mode 100644 > index 000000000000..74e47cae0e24 > --- /dev/null > +++ b/drivers/virtio/virtio_pmem.c > @@ -0,0 +1,322 @@ > +/* > + * virtio-pmem driver > + */ > + > +#include <linux/virtio.h> > +#include <linux/swap.h> > +#include <linux/workqueue.h> > +#include <linux/delay.h> > +#include <linux/slab.h> > +#include <linux/module.h> > +#include <linux/oom.h> > +#include <linux/wait.h> > +#include <linux/mm.h> > +#include <linux/mount.h> > +#include <linux/magic.h> > +#include <linux/virtio_pmem.h> > + > +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr) > +{ > + devm_memunmap(dev, addr); > + devm_release_mem_region(dev, res->start, resource_size(res)); > +} > + > +static void pmem_flush_done(struct virtqueue *vq) > +{ > + return; > +}; > + > +static void virtio_pmem_release_queue(void *q) > +{ > + blk_cleanup_queue(q); > +} > + > +static void virtio_pmem_freeze_queue(void *q) > +{ > + blk_freeze_queue_start(q); > +} > + > +static void virtio_pmem_release_disk(void *__pmem) > +{ > + struct virtio_pmem *pmem = __pmem; > + > + del_gendisk(pmem->disk); > + put_disk(pmem->disk); > +} This code seems identical to the base pmem case, it should move to the shared code object. 
> + > +static int init_vq(struct virtio_pmem *vpmem) > +{ > + struct virtqueue *vq; > + > + /* single vq */ > + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue"); > + > + if (IS_ERR(vq)) > + return PTR_ERR(vq); > + > + return 0; > +} > + > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, > + struct resource *res, struct vmem_altmap *altmap) > +{ > + u32 start_pad = 0, end_trunc = 0; > + resource_size_t start, size; > + unsigned long npfns; > + phys_addr_t offset; > + > + size = resource_size(res); > + start = PHYS_SECTION_ALIGN_DOWN(res->start); > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + start = res->start; > + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; > + } > + start = res->start; > + size = PHYS_SECTION_ALIGN_UP(start + size) - start; > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + size = resource_size(res); > + end_trunc = start + size - > + PHYS_SECTION_ALIGN_DOWN(start + size); > + } > + > + start += start_pad; > + size = resource_size(res); > + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) > + / PAGE_SIZE); > + > + /* > + * vmemmap_populate_hugepages() allocates the memmap array in > + * HPAGE_SIZE chunks. 
> + */ > + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; > + vpmem->data_offset = offset; > + > + struct vmem_altmap __altmap = { > + .base_pfn = init_altmap_base(start+start_pad), > + .reserve = init_altmap_reserve(start+start_pad), > + }; > + > + res->start += start_pad; > + res->end -= end_trunc; > + memcpy(altmap, &__altmap, sizeof(*altmap)); > + altmap->free = PHYS_PFN(offset - SZ_8K); > + altmap->alloc = 0; > + > + return altmap; > +} > + > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page, > + unsigned int len, unsigned int off, bool is_write, > + sector_t sector) > +{ > + blk_status_t rc = BLK_STS_OK; > + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > + void *pmem_addr = pmem->virt_addr + pmem_off; > + > + if (!is_write) { > + rc = read_pmem(page, off, pmem_addr, len); > + flush_dcache_page(page); > + } else { > + flush_dcache_page(page); > + write_pmem(pmem_addr, page, off, len); > + } > + > + return rc; > +} > + > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, > + struct page *page, bool is_write) > +{ > + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; > + blk_status_t rc; > + > + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, > + 0, is_write, sector); > + > + if (rc == 0) > + page_endio(page, is_write, 0); > + > + return blk_status_to_errno(rc); > +} > + > +#ifndef REQ_FLUSH > +#define REQ_FLUSH REQ_PREFLUSH > +#endif > + > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > + struct bio *bio) > +{ > + blk_status_t rc = 0; > + struct bio_vec bvec; > + struct bvec_iter iter; > + struct virtio_pmem *pmem = q->queuedata; > + > + if (bio->bi_opf & REQ_FLUSH) > + //todo host flush command > + > + bio_for_each_segment(bvec, bio, iter) { > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > + bvec.bv_offset, op_is_write(bio_op(bio)), > + iter.bi_sector); > + if (rc) { > + bio->bi_status = rc; > + break; > + } > + } > + > + bio_endio(bio); 
> + return BLK_QC_T_NONE; > +} Again, the above could be shared by both drivers. > + > +static const struct block_device_operations pmem_fops = { > + .owner = THIS_MODULE, > + .rw_page = vpmem_rw_page, > + //.revalidate_disk = nvdimm_revalidate_disk, > +}; > + > +static int virtio_pmem_probe(struct virtio_device *vdev) > +{ > + struct virtio_pmem *vpmem; > + int err = 0; > + void *addr; > + struct resource *res, res_pfn; > + struct request_queue *q; > + struct vmem_altmap __altmap, *altmap = NULL; > + struct gendisk *disk; > + struct device *gendev; > + int nid = dev_to_node(&vdev->dev); > + > + if (!vdev->config->get) { > + dev_err(&vdev->dev, "%s failure: config disabled\n", > + __func__); > + return -EINVAL; > + } > + > + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), > + GFP_KERNEL); > + > + if (!vpmem) { > + err = -ENOMEM; > + goto out; > + } > + > + dev_set_drvdata(&vdev->dev, vpmem); > + > + vpmem->vdev = vdev; > + err = init_vq(vpmem); > + if (err) > + goto out; > + > + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { > + dev_err(&vdev->dev, "%s : pmem not supported\n", > + __func__); > + goto out; > + } > + > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + start, &vpmem->start); > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + size, &vpmem->size); > + > + res_pfn.start = vpmem->start; > + res_pfn.end = vpmem->start + vpmem->size-1; > + > + /* used for allocating memmap in the pmem device */ > + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); > + > + res = devm_request_mem_region(&vdev->dev, > + res_pfn.start, resource_size(&res_pfn), "virtio-pmem"); > + > + if (!res) { > + dev_warn(&vdev->dev, "could not reserve region "); > + return -EBUSY; > + } > + > + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); > + > + if (!q) > + return -ENOMEM; > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_queue, q)) > + return -ENOMEM; > + > + vpmem->pfn_flags = PFN_DEV; > + > + /* allocate 
memap in pmem device itself */ > + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { > + > + addr = devm_memremap_pages(&vdev->dev, res, > + &q->q_usage_counter, altmap); > + vpmem->pfn_flags |= PFN_MAP; > + } else > + addr = devm_memremap(&vdev->dev, vpmem->start, > + vpmem->size, ARCH_MEMREMAP_PMEM); > + > + /* > + * At release time the queue must be frozen before > + * devm_memremap_pages is unwound > + */ > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_freeze_queue, q)) > + return -ENOMEM; > + > + if (IS_ERR(addr)) > + return PTR_ERR(addr); > + > + vpmem->virt_addr = addr; > + blk_queue_write_cache(q, 0, 0); > + blk_queue_make_request(q, virtio_pmem_make_request); > + blk_queue_physical_block_size(q, PAGE_SIZE); > + blk_queue_logical_block_size(q, 512); > + blk_queue_max_hw_sectors(q, UINT_MAX); > + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); > + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); > + q->queuedata = vpmem; > + > + disk = alloc_disk_node(0, nid); > + if (!disk) > + return -ENOMEM; > + vpmem->disk = disk; > + > + disk->fops = &pmem_fops; > + disk->queue = q; > + disk->flags = GENHD_FL_EXT_DEVT; > + strcpy(disk->disk_name, "vpmem"); > + set_capacity(disk, vpmem->size/512); > + gendev = disk_to_dev(disk); > + > + virtio_device_ready(vdev); > + device_add_disk(&vdev->dev, disk); > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_disk, vpmem)) > + return -ENOMEM; > + > + revalidate_disk(disk); > + return 0; > +out: > + vdev->config->del_vqs(vdev); > + return err; > +} Here we have a mix of code that is common and some that is virtio specific, the shared code should be factored out into a common helper that both drivers call.
> > This patch adds virtio-pmem driver for KVM guest. > > Guest reads the persistent memory range information > > over virtio bus from Qemu and reserves the range > > as persistent memory. Guest also allocates a block > > device corresponding to the pmem range which later > > can be accessed with DAX compatible file systems. > > Idea is to use the virtio channel between guest and > > host to perform the block device flush for guest pmem > > DAX device. > > > > There is work to do including DAX file system support > > and other advanced features. > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > > --- > > drivers/virtio/Kconfig | 10 ++ > > drivers/virtio/Makefile | 1 + > > drivers/virtio/virtio_pmem.c | 322 > > +++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/virtio_pmem.h | 55 +++++++ > > 4 files changed, 388 insertions(+) > > create mode 100644 drivers/virtio/virtio_pmem.c > > create mode 100644 include/uapi/linux/virtio_pmem.h > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > > index cff773f15b7e..0192c4bda54b 100644 > > --- a/drivers/virtio/Kconfig > > +++ b/drivers/virtio/Kconfig > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > > > If unsure, say Y. > > > > +config VIRTIO_PMEM > > + tristate "Virtio pmem driver" > > + depends on VIRTIO > > + ---help--- > > + This driver adds persistent memory range within a KVM guest. > > I think we need to call this something other than persistent memory to > make it clear that this not memory where the persistence can be > managed from userspace. The persistence point always requires a driver > call, so this is something distinctly different than "persistent > memory". For example, it's a bug if this memory range ends up backing > a device-dax range in the guest where there is no such thing as a > driver callback to perform the flushing. How does this solution > protect against that scenario? 
yes, you are right we are not providing device_dax in this case so it should be clear from name. Any suggestion for name? > > > + It also associates a block device corresponding to the pmem > > + range. > > + > > + If unsure, say M. > > + > > config VIRTIO_BALLOON > > tristate "Virtio balloon driver" > > depends on VIRTIO > > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > > index 41e30e3dc842..032ade725cc2 100644 > > --- a/drivers/virtio/Makefile > > +++ b/drivers/virtio/Makefile > > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > > virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o > > obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o > > obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o > > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o > > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c > > new file mode 100644 > > index 000000000000..74e47cae0e24 > > --- /dev/null > > +++ b/drivers/virtio/virtio_pmem.c > > @@ -0,0 +1,322 @@ > > +/* > > + * virtio-pmem driver > > + */ > > + > > +#include <linux/virtio.h> > > +#include <linux/swap.h> > > +#include <linux/workqueue.h> > > +#include <linux/delay.h> > > +#include <linux/slab.h> > > +#include <linux/module.h> > > +#include <linux/oom.h> > > +#include <linux/wait.h> > > +#include <linux/mm.h> > > +#include <linux/mount.h> > > +#include <linux/magic.h> > > +#include <linux/virtio_pmem.h> > > + > > +void devm_vpmem_disable(struct device *dev, struct resource *res, void > > *addr) > > +{ > > + devm_memunmap(dev, addr); > > + devm_release_mem_region(dev, res->start, resource_size(res)); > > +} > > + > > +static void pmem_flush_done(struct virtqueue *vq) > > +{ > > + return; > > +}; > > + > > +static void virtio_pmem_release_queue(void *q) > > +{ > > + blk_cleanup_queue(q); > > +} > > + > > +static void virtio_pmem_freeze_queue(void *q) > > +{ > > + blk_freeze_queue_start(q); > > +} > > + > > +static void virtio_pmem_release_disk(void *__pmem) > > +{ > 
> + struct virtio_pmem *pmem = __pmem; > > + > > + del_gendisk(pmem->disk); > > + put_disk(pmem->disk); > > +} > > This code seems identical to the base pmem case, it should move to the > shared code object. Sure! > > > + > > +static int init_vq(struct virtio_pmem *vpmem) > > +{ > > + struct virtqueue *vq; > > + > > + /* single vq */ > > + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, > > "flush_queue"); > > + > > + if (IS_ERR(vq)) > > + return PTR_ERR(vq); > > + > > + return 0; > > +} > > + > > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, > > + struct resource *res, struct vmem_altmap *altmap) > > +{ > > + u32 start_pad = 0, end_trunc = 0; > > + resource_size_t start, size; > > + unsigned long npfns; > > + phys_addr_t offset; > > + > > + size = resource_size(res); > > + start = PHYS_SECTION_ALIGN_DOWN(res->start); > > + > > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > > + IORES_DESC_NONE) == REGION_MIXED) { > > + > > + start = res->start; > > + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; > > + } > > + start = res->start; > > + size = PHYS_SECTION_ALIGN_UP(start + size) - start; > > + > > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > > + IORES_DESC_NONE) == REGION_MIXED) { > > + > > + size = resource_size(res); > > + end_trunc = start + size - > > + PHYS_SECTION_ALIGN_DOWN(start + size); > > + } > > + > > + start += start_pad; > > + size = resource_size(res); > > + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) > > + / PAGE_SIZE); > > + > > + /* > > + * vmemmap_populate_hugepages() allocates the memmap array in > > + * HPAGE_SIZE chunks. 
> > + */ > > + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; > > + vpmem->data_offset = offset; > > + > > + struct vmem_altmap __altmap = { > > + .base_pfn = init_altmap_base(start+start_pad), > > + .reserve = init_altmap_reserve(start+start_pad), > > + }; > > + > > + res->start += start_pad; > > + res->end -= end_trunc; > > + memcpy(altmap, &__altmap, sizeof(*altmap)); > > + altmap->free = PHYS_PFN(offset - SZ_8K); > > + altmap->alloc = 0; > > + > > + return altmap; > > +} > > + > > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page > > *page, > > + unsigned int len, unsigned int off, bool is_write, > > + sector_t sector) > > +{ > > + blk_status_t rc = BLK_STS_OK; > > + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > > + void *pmem_addr = pmem->virt_addr + pmem_off; > > + > > + if (!is_write) { > > + rc = read_pmem(page, off, pmem_addr, len); > > + flush_dcache_page(page); > > + } else { > > + flush_dcache_page(page); > > + write_pmem(pmem_addr, page, off, len); > > + } > > + > > + return rc; > > +} > > + > > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, > > + struct page *page, bool is_write) > > +{ > > + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; > > + blk_status_t rc; > > + > > + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, > > + 0, is_write, sector); > > + > > + if (rc == 0) > > + page_endio(page, is_write, 0); > > + > > + return blk_status_to_errno(rc); > > +} > > + > > +#ifndef REQ_FLUSH > > +#define REQ_FLUSH REQ_PREFLUSH > > +#endif > > + > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > > + struct bio *bio) > > +{ > > + blk_status_t rc = 0; > > + struct bio_vec bvec; > > + struct bvec_iter iter; > > + struct virtio_pmem *pmem = q->queuedata; > > + > > + if (bio->bi_opf & REQ_FLUSH) > > + //todo host flush command > > + > > + bio_for_each_segment(bvec, bio, iter) { > > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > > + 
bvec.bv_offset, op_is_write(bio_op(bio)), > > + iter.bi_sector); > > + if (rc) { > > + bio->bi_status = rc; > > + break; > > + } > > + } > > + > > + bio_endio(bio); > > + return BLK_QC_T_NONE; > > +} > > Again, the above could be shared by both drivers. yes, I will do that. > > > + > > +static const struct block_device_operations pmem_fops = { > > + .owner = THIS_MODULE, > > + .rw_page = vpmem_rw_page, > > + //.revalidate_disk = nvdimm_revalidate_disk, > > +}; > > + > > +static int virtio_pmem_probe(struct virtio_device *vdev) > > +{ > > + struct virtio_pmem *vpmem; > > + int err = 0; > > + void *addr; > > + struct resource *res, res_pfn; > > + struct request_queue *q; > > + struct vmem_altmap __altmap, *altmap = NULL; > > + struct gendisk *disk; > > + struct device *gendev; > > + int nid = dev_to_node(&vdev->dev); > > + > > + if (!vdev->config->get) { > > + dev_err(&vdev->dev, "%s failure: config disabled\n", > > + __func__); > > + return -EINVAL; > > + } > > + > > + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), > > + GFP_KERNEL); > > + > > + if (!vpmem) { > > + err = -ENOMEM; > > + goto out; > > + } > > + > > + dev_set_drvdata(&vdev->dev, vpmem); > > + > > + vpmem->vdev = vdev; > > + err = init_vq(vpmem); > > + if (err) > > + goto out; > > + > > + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { > > + dev_err(&vdev->dev, "%s : pmem not supported\n", > > + __func__); > > + goto out; > > + } > > + > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > > + start, &vpmem->start); > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > > + size, &vpmem->size); > > + > > + res_pfn.start = vpmem->start; > > + res_pfn.end = vpmem->start + vpmem->size-1; > > + > > + /* used for allocating memmap in the pmem device */ > > + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); > > + > > + res = devm_request_mem_region(&vdev->dev, > > + res_pfn.start, resource_size(&res_pfn), > > "virtio-pmem"); > > + > > + if (!res) { > > + dev_warn(&vdev->dev, 
"could not reserve region "); > > + return -EBUSY; > > + } > > + > > + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); > > + > > + if (!q) > > + return -ENOMEM; > > + > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_release_queue, q)) > > + return -ENOMEM; > > + > > + vpmem->pfn_flags = PFN_DEV; > > + > > + /* allocate memap in pmem device itself */ > > + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { > > + > > + addr = devm_memremap_pages(&vdev->dev, res, > > + &q->q_usage_counter, altmap); > > + vpmem->pfn_flags |= PFN_MAP; > > + } else > > + addr = devm_memremap(&vdev->dev, vpmem->start, > > + vpmem->size, ARCH_MEMREMAP_PMEM); > > + > > + /* > > + * At release time the queue must be frozen before > > + * devm_memremap_pages is unwound > > + */ > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_freeze_queue, q)) > > + return -ENOMEM; > > + > > + if (IS_ERR(addr)) > > + return PTR_ERR(addr); > > + > > + vpmem->virt_addr = addr; > > + blk_queue_write_cache(q, 0, 0); > > + blk_queue_make_request(q, virtio_pmem_make_request); > > + blk_queue_physical_block_size(q, PAGE_SIZE); > > + blk_queue_logical_block_size(q, 512); > > + blk_queue_max_hw_sectors(q, UINT_MAX); > > + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); > > + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); > > + q->queuedata = vpmem; > > + > > + disk = alloc_disk_node(0, nid); > > + if (!disk) > > + return -ENOMEM; > > + vpmem->disk = disk; > > + > > + disk->fops = &pmem_fops; > > + disk->queue = q; > > + disk->flags = GENHD_FL_EXT_DEVT; > > + strcpy(disk->disk_name, "vpmem"); > > + set_capacity(disk, vpmem->size/512); > > + gendev = disk_to_dev(disk); > > + > > + virtio_device_ready(vdev); > > + device_add_disk(&vdev->dev, disk); > > + > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_release_disk, vpmem)) > > + return -ENOMEM; > > + > > + revalidate_disk(disk); > > + return 0; > > +out: > > + vdev->config->del_vqs(vdev); > > + return err; > > +} > > Here we 
have a mix of code that is common and some that is virtio > specific, the shared code should be factored out into a common helper > that both drivers call. yes, i will factor this as well. > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> >
On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com> wrote: > >> > This patch adds virtio-pmem driver for KVM guest. >> > Guest reads the persistent memory range information >> > over virtio bus from Qemu and reserves the range >> > as persistent memory. Guest also allocates a block >> > device corresponding to the pmem range which later >> > can be accessed with DAX compatible file systems. >> > Idea is to use the virtio channel between guest and >> > host to perform the block device flush for guest pmem >> > DAX device. >> > >> > There is work to do including DAX file system support >> > and other advanced features. >> > >> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> >> > --- >> > drivers/virtio/Kconfig | 10 ++ >> > drivers/virtio/Makefile | 1 + >> > drivers/virtio/virtio_pmem.c | 322 >> > +++++++++++++++++++++++++++++++++++++++ >> > include/uapi/linux/virtio_pmem.h | 55 +++++++ >> > 4 files changed, 388 insertions(+) >> > create mode 100644 drivers/virtio/virtio_pmem.c >> > create mode 100644 include/uapi/linux/virtio_pmem.h >> > >> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig >> > index cff773f15b7e..0192c4bda54b 100644 >> > --- a/drivers/virtio/Kconfig >> > +++ b/drivers/virtio/Kconfig >> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY >> > >> > If unsure, say Y. >> > >> > +config VIRTIO_PMEM >> > + tristate "Virtio pmem driver" >> > + depends on VIRTIO >> > + ---help--- >> > + This driver adds persistent memory range within a KVM guest. >> >> I think we need to call this something other than persistent memory to >> make it clear that this not memory where the persistence can be >> managed from userspace. The persistence point always requires a driver >> call, so this is something distinctly different than "persistent >> memory". For example, it's a bug if this memory range ends up backing >> a device-dax range in the guest where there is no such thing as a >> driver callback to perform the flushing. 
How does this solution >> protect against that scenario? > > yes, you are right we are not providing device_dax in this case so it should > be clear from name. Any suggestion for name? So currently /proc/iomem in a guest with a pmem device attached to a namespace looks like this: c00000000-13bfffffff : Persistent Memory c00000000-13bfffffff : namespace2.0 Can we call it "Virtio Shared Memory" to make it clear it is a different beast than typical "Persistent Memory"? You can likely inject your own name into the resource tree the same way we do in the NFIT driver. See acpi_nfit_insert_resource().
> > On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com> wrote: > > > >> > This patch adds virtio-pmem driver for KVM guest. > >> > Guest reads the persistent memory range information > >> > over virtio bus from Qemu and reserves the range > >> > as persistent memory. Guest also allocates a block > >> > device corresponding to the pmem range which later > >> > can be accessed with DAX compatible file systems. > >> > Idea is to use the virtio channel between guest and > >> > host to perform the block device flush for guest pmem > >> > DAX device. > >> > > >> > There is work to do including DAX file system support > >> > and other advanced features. > >> > > >> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > >> > --- > >> > drivers/virtio/Kconfig | 10 ++ > >> > drivers/virtio/Makefile | 1 + > >> > drivers/virtio/virtio_pmem.c | 322 > >> > +++++++++++++++++++++++++++++++++++++++ > >> > include/uapi/linux/virtio_pmem.h | 55 +++++++ > >> > 4 files changed, 388 insertions(+) > >> > create mode 100644 drivers/virtio/virtio_pmem.c > >> > create mode 100644 include/uapi/linux/virtio_pmem.h > >> > > >> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > >> > index cff773f15b7e..0192c4bda54b 100644 > >> > --- a/drivers/virtio/Kconfig > >> > +++ b/drivers/virtio/Kconfig > >> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > >> > > >> > If unsure, say Y. > >> > > >> > +config VIRTIO_PMEM > >> > + tristate "Virtio pmem driver" > >> > + depends on VIRTIO > >> > + ---help--- > >> > + This driver adds persistent memory range within a KVM guest. > >> > >> I think we need to call this something other than persistent memory to > >> make it clear that this not memory where the persistence can be > >> managed from userspace. The persistence point always requires a driver > >> call, so this is something distinctly different than "persistent > >> memory". 
For example, it's a bug if this memory range ends up backing > >> a device-dax range in the guest where there is no such thing as a > >> driver callback to perform the flushing. How does this solution > >> protect against that scenario? > > > > yes, you are right we are not providing device_dax in this case so it > > should > > be clear from name. Any suggestion for name? > > So currently /proc/iomem in a guest with a pmem device attached to a > namespace looks like this: > > c00000000-13bfffffff : Persistent Memory > c00000000-13bfffffff : namespace2.0 > > Can we call it "Virtio Shared Memory" to make it clear it is a > different beast than typical "Persistent Memory"? You can likely I think somewhere we need persistent keyword 'Virtio Persistent Memory' or so. > inject your own name into the resource tree the same way we do in the > NFIT driver. See acpi_nfit_insert_resource(). Sure! thank you.
On Thu, 2017-10-12 at 18:18 -0400, Pankaj Gupta wrote: > > > > On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com> > > wrote: > > > > > > > >  This patch adds virtio-pmem driver for KVM guest. > > > > >  Guest reads the persistent memory range information > > > > >  over virtio bus from Qemu and reserves the range > > > > >  as persistent memory. Guest also allocates a block > > > > >  device corresponding to the pmem range which later > > > > >  can be accessed with DAX compatible file systems. > > > > >  Idea is to use the virtio channel between guest and > > > > >  host to perform the block device flush for guest pmem > > > > >  DAX device. > > > > > > > > > >  There is work to do including DAX file system support > > > > >  and other advanced features. > > > > > > > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > > > > > --- > > > > >  drivers/virtio/Kconfig           |  10 ++ > > > > >  drivers/virtio/Makefile          |   1 + > > > > >  drivers/virtio/virtio_pmem.c     | 322 > > > > >  +++++++++++++++++++++++++++++++++++++++ > > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++ > > > > >  4 files changed, 388 insertions(+) > > > > >  create mode 100644 drivers/virtio/virtio_pmem.c > > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > > > > > index cff773f15b7e..0192c4bda54b 100644 > > > > > --- a/drivers/virtio/Kconfig > > > > > +++ b/drivers/virtio/Kconfig > > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > > > > > > > > >           If unsure, say Y. > > > > > > > > > > +config VIRTIO_PMEM > > > > > +       tristate "Virtio pmem driver" > > > > > +       depends on VIRTIO > > > > > +       ---help--- > > > > > +        This driver adds persistent memory range within a > > > > > KVM guest. 
With "Virtio Block Backed Pmem" we could name the config option VIRTIO_BLOCK_PMEM The documentation text could make it clear to people that the image shows up as a disk image on the host, but as a pmem memory range in the guest. > > > > I think we need to call this something other than persistent > > > > memory to > > > > make it clear that this not memory where the persistence can be > > > > managed from userspace. The persistence point always requires > > > > > > So currently /proc/iomem in a guest with a pmem device attached to > > a > > namespace looks like this: > > > >     c00000000-13bfffffff : Persistent Memory > >        c00000000-13bfffffff : namespace2.0 > > > > Can we call it "Virtio Shared Memory" to make it clear it is a > > different beast than typical "Persistent Memory"?  You can likely > > I think somewhere we need persistent keyword 'Virtio Persistent > Memory' or > so. Still hoping for better ideas than "Virtio Block Backed Pmem" :)
> > > > > > > > > >  This patch adds virtio-pmem driver for KVM guest. > > > > > >  Guest reads the persistent memory range information > > > > > >  over virtio bus from Qemu and reserves the range > > > > > >  as persistent memory. Guest also allocates a block > > > > > >  device corresponding to the pmem range which later > > > > > >  can be accessed with DAX compatible file systems. > > > > > >  Idea is to use the virtio channel between guest and > > > > > >  host to perform the block device flush for guest pmem > > > > > >  DAX device. > > > > > > > > > > > >  There is work to do including DAX file system support > > > > > >  and other advanced features. > > > > > > > > > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > > > > > > --- > > > > > >  drivers/virtio/Kconfig           |  10 ++ > > > > > >  drivers/virtio/Makefile          |   1 + > > > > > >  drivers/virtio/virtio_pmem.c     | 322 > > > > > >  +++++++++++++++++++++++++++++++++++++++ > > > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++ > > > > > >  4 files changed, 388 insertions(+) > > > > > >  create mode 100644 drivers/virtio/virtio_pmem.c > > > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h > > > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > > > > > > index cff773f15b7e..0192c4bda54b 100644 > > > > > > --- a/drivers/virtio/Kconfig > > > > > > +++ b/drivers/virtio/Kconfig > > > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > > > > > > > > > > >           If unsure, say Y. > > > > > > > > > > > > +config VIRTIO_PMEM > > > > > > +       tristate "Virtio pmem driver" > > > > > > +       depends on VIRTIO > > > > > > +       ---help--- > > > > > > +        This driver adds persistent memory range within a > > > > > > KVM guest. 
> > With "Virtio Block Backed Pmem" we could name the config > option VIRTIO_BLOCK_PMEM > > The documentation text could make it clear to people that the > image shows up as a disk image on the host, but as a pmem > memory range in the guest. yes, this looks better. thank you. > > > > > > I think we need to call this something other than persistent > > > > > memory to > > > > > make it clear that this not memory where the persistence can be > > > > > managed from userspace. The persistence point always requires > > > > > > > > So currently /proc/iomem in a guest with a pmem device attached to > > > a > > > namespace looks like this: > > > > > >     c00000000-13bfffffff : Persistent Memory > > >        c00000000-13bfffffff : namespace2.0 > > > > > > Can we call it "Virtio Shared Memory" to make it clear it is a > > > different beast than typical "Persistent Memory"?  You can likely > > > > I think somewhere we need persistent keyword 'Virtio Persistent > > Memory' or > > so. > > Still hoping for better ideas than "Virtio Block Backed Pmem" :) :-) >
> > > wrote: > > > > > > > > > >  This patch adds virtio-pmem driver for KVM guest. > > > > > >  Guest reads the persistent memory range information > > > > > >  over virtio bus from Qemu and reserves the range > > > > > >  as persistent memory. Guest also allocates a block > > > > > >  device corresponding to the pmem range which later > > > > > >  can be accessed with DAX compatible file systems. > > > > > >  Idea is to use the virtio channel between guest and > > > > > >  host to perform the block device flush for guest pmem > > > > > >  DAX device. > > > > > > > > > > > >  There is work to do including DAX file system support > > > > > >  and other advanced features. > > > > > > > > > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > > > > > > --- > > > > > >  drivers/virtio/Kconfig           |  10 ++ > > > > > >  drivers/virtio/Makefile          |   1 + > > > > > >  drivers/virtio/virtio_pmem.c     | 322 > > > > > >  +++++++++++++++++++++++++++++++++++++++ > > > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++ > > > > > >  4 files changed, 388 insertions(+) > > > > > >  create mode 100644 drivers/virtio/virtio_pmem.c > > > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h > > > > > > > > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > > > > > > index cff773f15b7e..0192c4bda54b 100644 > > > > > > --- a/drivers/virtio/Kconfig > > > > > > +++ b/drivers/virtio/Kconfig > > > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > > > > > > > > > > >           If unsure, say Y. > > > > > > > > > > > > +config VIRTIO_PMEM > > > > > > +       tristate "Virtio pmem driver" > > > > > > +       depends on VIRTIO > > > > > > +       ---help--- > > > > > > +        This driver adds persistent memory range within a > > > > > > KVM guest. 
> > With "Virtio Block Backed Pmem" we could name the config > option VIRTIO_BLOCK_PMEM > > The documentation text could make it clear to people that the > image shows up as a disk image on the host, but as a pmem > memory range in the guest. > > > > > > I think we need to call this something other than persistent > > > > > memory to > > > > > make it clear that this not memory where the persistence can be > > > > > managed from userspace. The persistence point always requires > > > > > > > > So currently /proc/iomem in a guest with a pmem device attached to > > > a > > > namespace looks like this: > > > > > >     c00000000-13bfffffff : Persistent Memory > > >        c00000000-13bfffffff : namespace2.0 > > > > > > Can we call it "Virtio Shared Memory" to make it clear it is a > > > different beast than typical "Persistent Memory"?  You can likely > > > > I think somewhere we need persistent keyword 'Virtio Persistent > > Memory' or > > so. > > Still hoping for better ideas than "Virtio Block Backed Pmem" :) > Dan, I have a query regarding below patch [*]. My assumption is its halted because of memory hotplug restructuring work? Anything I am missing here? [*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html
On Thu, Oct 12, 2017 at 3:52 PM, Pankaj Gupta <pagupta@redhat.com> wrote: > Dan, > > I have a query regarding below patch [*]. My assumption is its halted > because of memory hotplug restructuring work? Anything I am missing > here? > > [*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html It's fallen to the back of my queue since the original driving need of platform firmware changing offsets from boot-to-boot is no longer an issue. However, it does mean that you need to arrange for 128MB aligned devm_memremap_pages() ranges for the foreseeable future.
> > Dan, > > > > I have a query regarding below patch [*]. My assumption is its halted > > because of memory hotplug restructuring work? Anything I am missing > > here? > > > > [*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html > > It's fallen to the back of my queue since the original driving need of > platform firmware changing offsets from boot-to-boot is no longer an > issue. However, it does mean that you need to arrange for 128MB > aligned devm_memremap_pages() ranges for the foreseeable future. o.k I will provide 128MB aligned pages to devm_memremap_pages() function. Thanks, Pankaj
On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: > This patch adds virtio-pmem driver for KVM guest. > Guest reads the persistent memory range information > over virtio bus from Qemu and reserves the range > as persistent memory. Guest also allocates a block > device corresponding to the pmem range which later > can be accessed with DAX compatible file systems. > Idea is to use the virtio channel between guest and > host to perform the block device flush for guest pmem > DAX device. > > There is work to do including DAX file system support > and other advanced features. > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > --- > drivers/virtio/Kconfig | 10 ++ > drivers/virtio/Makefile | 1 + > drivers/virtio/virtio_pmem.c | 322 +++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/virtio_pmem.h | 55 +++++++ > 4 files changed, 388 insertions(+) > create mode 100644 drivers/virtio/virtio_pmem.c > create mode 100644 include/uapi/linux/virtio_pmem.h > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > index cff773f15b7e..0192c4bda54b 100644 > --- a/drivers/virtio/Kconfig > +++ b/drivers/virtio/Kconfig > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > If unsure, say Y. > > +config VIRTIO_PMEM > + tristate "Virtio pmem driver" > + depends on VIRTIO > + ---help--- > + This driver adds persistent memory range within a KVM guest. > + It also associates a block device corresponding to the pmem > + range. > + > + If unsure, say M. 
> + > config VIRTIO_BALLOON > tristate "Virtio balloon driver" > depends on VIRTIO > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > index 41e30e3dc842..032ade725cc2 100644 > --- a/drivers/virtio/Makefile > +++ b/drivers/virtio/Makefile > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o > obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o > obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c > new file mode 100644 > index 000000000000..74e47cae0e24 > --- /dev/null > +++ b/drivers/virtio/virtio_pmem.c > @@ -0,0 +1,322 @@ > +/* > + * virtio-pmem driver > + */ > + > +#include <linux/virtio.h> > +#include <linux/swap.h> > +#include <linux/workqueue.h> > +#include <linux/delay.h> > +#include <linux/slab.h> > +#include <linux/module.h> > +#include <linux/oom.h> > +#include <linux/wait.h> > +#include <linux/mm.h> > +#include <linux/mount.h> > +#include <linux/magic.h> > +#include <linux/virtio_pmem.h> > + > +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr) > +{ > + devm_memunmap(dev, addr); > + devm_release_mem_region(dev, res->start, resource_size(res)); > +} > + > +static void pmem_flush_done(struct virtqueue *vq) > +{ > + return; > +}; > + > +static void virtio_pmem_release_queue(void *q) > +{ > + blk_cleanup_queue(q); > +} > + > +static void virtio_pmem_freeze_queue(void *q) > +{ > + blk_freeze_queue_start(q); > +} > + > +static void virtio_pmem_release_disk(void *__pmem) > +{ > + struct virtio_pmem *pmem = __pmem; > + > + del_gendisk(pmem->disk); > + put_disk(pmem->disk); > +} > + > +static int init_vq(struct virtio_pmem *vpmem) > +{ > + struct virtqueue *vq; > + > + /* single vq */ > + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue"); > + > + if (IS_ERR(vq)) > + return PTR_ERR(vq); > + > + return 0; > 
+} > + > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, > + struct resource *res, struct vmem_altmap *altmap) > +{ > + u32 start_pad = 0, end_trunc = 0; > + resource_size_t start, size; > + unsigned long npfns; > + phys_addr_t offset; > + > + size = resource_size(res); > + start = PHYS_SECTION_ALIGN_DOWN(res->start); > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + start = res->start; > + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; > + } > + start = res->start; > + size = PHYS_SECTION_ALIGN_UP(start + size) - start; > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + size = resource_size(res); > + end_trunc = start + size - > + PHYS_SECTION_ALIGN_DOWN(start + size); > + } > + > + start += start_pad; > + size = resource_size(res); > + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) > + / PAGE_SIZE); > + > + /* > + * vmemmap_populate_hugepages() allocates the memmap array in > + * HPAGE_SIZE chunks. > + */ > + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; > + vpmem->data_offset = offset; > + > + struct vmem_altmap __altmap = { > + .base_pfn = init_altmap_base(start+start_pad), > + .reserve = init_altmap_reserve(start+start_pad), > + }; > + > + res->start += start_pad; > + res->end -= end_trunc; > + memcpy(altmap, &__altmap, sizeof(*altmap)); > + altmap->free = PHYS_PFN(offset - SZ_8K); > + altmap->alloc = 0; The __altmap part can be rewritten using compound literal syntax: *altmap = (struct vmem_altmap) { .base_pfn = init_altmap_base(start+start_pad), .reserve = init_altmap_reserve(start+start_pad), .free = PHYS_PFN(offset - SZ_8K), }; This eliminates the temporary variable, memcpy, and explicit alloc = 0 initialization. 
> + > + return altmap; > +} > + > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page, > + unsigned int len, unsigned int off, bool is_write, > + sector_t sector) > +{ > + blk_status_t rc = BLK_STS_OK; > + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > + void *pmem_addr = pmem->virt_addr + pmem_off; > + > + if (!is_write) { > + rc = read_pmem(page, off, pmem_addr, len); > + flush_dcache_page(page); > + } else { > + flush_dcache_page(page); What are the semantics of this device? Why flush the dcache here if an explicit flush command needs to be sent over the virtqueue? > + write_pmem(pmem_addr, page, off, len); > + } > + > + return rc; > +} > + > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, > + struct page *page, bool is_write) > +{ > + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; > + blk_status_t rc; > + > + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, > + 0, is_write, sector); > + > + if (rc == 0) > + page_endio(page, is_write, 0); > + > + return blk_status_to_errno(rc); > +} > + > +#ifndef REQ_FLUSH > +#define REQ_FLUSH REQ_PREFLUSH > +#endif Is this out-of-tree kernel module compatibility stuff that can be removed? > + > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > + struct bio *bio) > +{ > + blk_status_t rc = 0; > + struct bio_vec bvec; > + struct bvec_iter iter; > + struct virtio_pmem *pmem = q->queuedata; > + > + if (bio->bi_opf & REQ_FLUSH) > + //todo host flush command This detail is critical to the device design. What is the plan? 
> + > + bio_for_each_segment(bvec, bio, iter) { > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > + bvec.bv_offset, op_is_write(bio_op(bio)), > + iter.bi_sector); > + if (rc) { > + bio->bi_status = rc; > + break; > + } > + } > + > + bio_endio(bio); > + return BLK_QC_T_NONE; > +} > + > +static const struct block_device_operations pmem_fops = { > + .owner = THIS_MODULE, > + .rw_page = vpmem_rw_page, > + //.revalidate_disk = nvdimm_revalidate_disk, > +}; > + > +static int virtio_pmem_probe(struct virtio_device *vdev) > +{ > + struct virtio_pmem *vpmem; > + int err = 0; > + void *addr; > + struct resource *res, res_pfn; > + struct request_queue *q; > + struct vmem_altmap __altmap, *altmap = NULL; > + struct gendisk *disk; > + struct device *gendev; > + int nid = dev_to_node(&vdev->dev); > + > + if (!vdev->config->get) { > + dev_err(&vdev->dev, "%s failure: config disabled\n", > + __func__); > + return -EINVAL; > + } > + > + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), > + GFP_KERNEL); > + > + if (!vpmem) { > + err = -ENOMEM; > + goto out; > + } > + > + dev_set_drvdata(&vdev->dev, vpmem); > + > + vpmem->vdev = vdev; > + err = init_vq(vpmem); > + if (err) > + goto out; > + > + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { > + dev_err(&vdev->dev, "%s : pmem not supported\n", > + __func__); > + goto out; > + } Why is VIRTIO_PMEM_PLUG optional for this new device type if it's already required by the first version of the driver? err is not set when the error path is taken. > + > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + start, &vpmem->start); > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + size, &vpmem->size); The struct resource start and size fields can vary between architectures. Virtio device configuration space layout must not vary between architectures. Please use u64. 
> + > + res_pfn.start = vpmem->start; > + res_pfn.end = vpmem->start + vpmem->size-1; > + > + /* used for allocating memmap in the pmem device */ > + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); > + > + res = devm_request_mem_region(&vdev->dev, > + res_pfn.start, resource_size(&res_pfn), "virtio-pmem"); > + > + if (!res) { > + dev_warn(&vdev->dev, "could not reserve region "); > + return -EBUSY; > + } > + > + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); > + > + if (!q) > + return -ENOMEM; > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_queue, q)) > + return -ENOMEM; > + > + vpmem->pfn_flags = PFN_DEV; > + > + /* allocate memap in pmem device itself */ > + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { > + > + addr = devm_memremap_pages(&vdev->dev, res, > + &q->q_usage_counter, altmap); > + vpmem->pfn_flags |= PFN_MAP; > + } else > + addr = devm_memremap(&vdev->dev, vpmem->start, > + vpmem->size, ARCH_MEMREMAP_PMEM); > + > + /* > + * At release time the queue must be frozen before > + * devm_memremap_pages is unwound > + */ > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_freeze_queue, q)) > + return -ENOMEM; > + > + if (IS_ERR(addr)) > + return PTR_ERR(addr); > + > + vpmem->virt_addr = addr; > + blk_queue_write_cache(q, 0, 0); > + blk_queue_make_request(q, virtio_pmem_make_request); > + blk_queue_physical_block_size(q, PAGE_SIZE); > + blk_queue_logical_block_size(q, 512); > + blk_queue_max_hw_sectors(q, UINT_MAX); > + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); > + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); > + q->queuedata = vpmem; > + > + disk = alloc_disk_node(0, nid); > + if (!disk) > + return -ENOMEM; The return statements in this function look suspicious. There probably needs to be a cleanup to avoid memory leaks. 
> + vpmem->disk = disk; > + > + disk->fops = &pmem_fops; > + disk->queue = q; > + disk->flags = GENHD_FL_EXT_DEVT; > + strcpy(disk->disk_name, "vpmem"); > + set_capacity(disk, vpmem->size/512); > + gendev = disk_to_dev(disk); > + > + virtio_device_ready(vdev); > + device_add_disk(&vdev->dev, disk); > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_disk, vpmem)) > + return -ENOMEM; > + > + revalidate_disk(disk); > + return 0; > +out: > + vdev->config->del_vqs(vdev); > + return err; > +} > + > +static struct virtio_driver virtio_pmem_driver = { > + .feature_table = features, > + .feature_table_size = ARRAY_SIZE(features), > + .driver.name = KBUILD_MODNAME, > + .driver.owner = THIS_MODULE, > + .id_table = id_table, > + .probe = virtio_pmem_probe, > + //.remove = virtio_pmem_remove, > +}; > + > +module_virtio_driver(virtio_pmem_driver); > +MODULE_DEVICE_TABLE(virtio, id_table); > +MODULE_DESCRIPTION("Virtio pmem driver"); > +MODULE_LICENSE("GPL"); > diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h > new file mode 100644 > index 000000000000..ec0c728c79ba > --- /dev/null > +++ b/include/uapi/linux/virtio_pmem.h > @@ -0,0 +1,55 @@ > +/* > + * Virtio pmem Device > + * > + * > + */ > + > +#ifndef _LINUX_VIRTIO_PMEM_H > +#define _LINUX_VIRTIO_PMEM_H > + > +#include <linux/types.h> > +#include <linux/virtio_types.h> > +#include <linux/virtio_ids.h> > +#include <linux/virtio_config.h> > +#include <linux/pfn_t.h> > +#include <linux/fs.h> > +#include <linux/blk-mq.h> > +#include <linux/pmem_common.h> include/uapi/ headers must compile when included from userspace applications. The header should define userspace APIs only. If you want to define kernel-internal APIs, please add them to include/linux/ or similar. This also means that include/uapi/ headers cannot include include/linux/pmem_common.h or other kernel headers outside #ifdef __KERNEL__. 
This header file should only contain struct virtio_pmem_config, VIRTIO_PMEM_PLUG, and other virtio device definitions. > + > +bool pmem_should_map_pages(struct device *dev); > + > +#define VIRTIO_PMEM_PLUG 0 > + > +struct virtio_pmem_config { > + > +uint64_t start; > +uint64_t size; > +uint64_t align; > +uint64_t data_offset; > +}; > + > +struct virtio_pmem { > + > + struct virtio_device *vdev; > + struct virtqueue *req_vq; > + > + uint64_t start; > + uint64_t size; > + uint64_t align; > + uint64_t data_offset; > + u64 pfn_flags; > + void *virt_addr; > + struct gendisk *disk; > +} __packed; > + > +static struct virtio_device_id id_table[] = { > + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, > + { 0 }, > +}; > + > +static unsigned int features[] = { > + VIRTIO_PMEM_PLUG, > +}; > + > +#endif > + > -- > 2.13.0 >
> > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: > > This patch adds virtio-pmem driver for KVM guest. > > Guest reads the persistent memory range information > > over virtio bus from Qemu and reserves the range > > as persistent memory. Guest also allocates a block > > device corresponding to the pmem range which later > > can be accessed with DAX compatible file systems. > > Idea is to use the virtio channel between guest and > > host to perform the block device flush for guest pmem > > DAX device. > > > > There is work to do including DAX file system support > > and other advanced features. > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com> > > --- > > drivers/virtio/Kconfig | 10 ++ > > drivers/virtio/Makefile | 1 + > > drivers/virtio/virtio_pmem.c | 322 > > +++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/virtio_pmem.h | 55 +++++++ > > 4 files changed, 388 insertions(+) > > create mode 100644 drivers/virtio/virtio_pmem.c > > create mode 100644 include/uapi/linux/virtio_pmem.h > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > > index cff773f15b7e..0192c4bda54b 100644 > > --- a/drivers/virtio/Kconfig > > +++ b/drivers/virtio/Kconfig > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > > > If unsure, say Y. > > > > +config VIRTIO_PMEM > > + tristate "Virtio pmem driver" > > + depends on VIRTIO > > + ---help--- > > + This driver adds persistent memory range within a KVM guest. > > + It also associates a block device corresponding to the pmem > > + range. > > + > > + If unsure, say M. 
> > + > > config VIRTIO_BALLOON > > tristate "Virtio balloon driver" > > depends on VIRTIO > > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > > index 41e30e3dc842..032ade725cc2 100644 > > --- a/drivers/virtio/Makefile > > +++ b/drivers/virtio/Makefile > > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > > virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o > > obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o > > obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o > > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o > > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c > > new file mode 100644 > > index 000000000000..74e47cae0e24 > > --- /dev/null > > +++ b/drivers/virtio/virtio_pmem.c > > @@ -0,0 +1,322 @@ > > +/* > > + * virtio-pmem driver > > + */ > > + > > +#include <linux/virtio.h> > > +#include <linux/swap.h> > > +#include <linux/workqueue.h> > > +#include <linux/delay.h> > > +#include <linux/slab.h> > > +#include <linux/module.h> > > +#include <linux/oom.h> > > +#include <linux/wait.h> > > +#include <linux/mm.h> > > +#include <linux/mount.h> > > +#include <linux/magic.h> > > +#include <linux/virtio_pmem.h> > > + > > +void devm_vpmem_disable(struct device *dev, struct resource *res, void > > *addr) > > +{ > > + devm_memunmap(dev, addr); > > + devm_release_mem_region(dev, res->start, resource_size(res)); > > +} > > + > > +static void pmem_flush_done(struct virtqueue *vq) > > +{ > > + return; > > +}; > > + > > +static void virtio_pmem_release_queue(void *q) > > +{ > > + blk_cleanup_queue(q); > > +} > > + > > +static void virtio_pmem_freeze_queue(void *q) > > +{ > > + blk_freeze_queue_start(q); > > +} > > + > > +static void virtio_pmem_release_disk(void *__pmem) > > +{ > > + struct virtio_pmem *pmem = __pmem; > > + > > + del_gendisk(pmem->disk); > > + put_disk(pmem->disk); > > +} > > + > > +static int init_vq(struct virtio_pmem *vpmem) > > +{ > > + struct virtqueue *vq; > > + > > + /* single vq */ > 
> + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue"); > > + > > + if (IS_ERR(vq)) > > + return PTR_ERR(vq); > > + > > + return 0; > > +} > > + > > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, > > + struct resource *res, struct vmem_altmap *altmap) > > +{ > > + u32 start_pad = 0, end_trunc = 0; > > + resource_size_t start, size; > > + unsigned long npfns; > > + phys_addr_t offset; > > + > > + size = resource_size(res); > > + start = PHYS_SECTION_ALIGN_DOWN(res->start); > > + > > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > > + IORES_DESC_NONE) == REGION_MIXED) { > > + > > + start = res->start; > > + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; > > + } > > + start = res->start; > > + size = PHYS_SECTION_ALIGN_UP(start + size) - start; > > + > > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > > + IORES_DESC_NONE) == REGION_MIXED) { > > + > > + size = resource_size(res); > > + end_trunc = start + size - > > + PHYS_SECTION_ALIGN_DOWN(start + size); > > + } > > + > > + start += start_pad; > > + size = resource_size(res); > > + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) > > + / PAGE_SIZE); > > + > > + /* > > + * vmemmap_populate_hugepages() allocates the memmap array in > > + * HPAGE_SIZE chunks. 
> > + */ > > + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; > > + vpmem->data_offset = offset; > > + > > + struct vmem_altmap __altmap = { > > + .base_pfn = init_altmap_base(start+start_pad), > > + .reserve = init_altmap_reserve(start+start_pad), > > + }; > > + > > + res->start += start_pad; > > + res->end -= end_trunc; > > + memcpy(altmap, &__altmap, sizeof(*altmap)); > > + altmap->free = PHYS_PFN(offset - SZ_8K); > > + altmap->alloc = 0; > > The __altmap part can be rewritten using compound literal syntax: > > *altmap = (struct vmem_altmap) { > .base_pfn = init_altmap_base(start+start_pad), > .reserve = init_altmap_reserve(start+start_pad), > .free = PHYS_PFN(offset - SZ_8K), > }; > > This eliminates the temporary variable, memcpy, and explicit alloc = 0 > initialization. o.k will use it. > > > + > > + return altmap; > > +} > > + > > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page > > *page, > > + unsigned int len, unsigned int off, bool is_write, > > + sector_t sector) > > +{ > > + blk_status_t rc = BLK_STS_OK; > > + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > > + void *pmem_addr = pmem->virt_addr + pmem_off; > > + > > + if (!is_write) { > > + rc = read_pmem(page, off, pmem_addr, len); > > + flush_dcache_page(page); > > + } else { > > + flush_dcache_page(page); > > What are the semantics of this device? Why flush the dcache here if an > explicit flush command needs to be sent over the virtqueue? yes, we don't need it in this case. 
I yet have to think more about flushing with block and file-system level support :) > > > + write_pmem(pmem_addr, page, off, len); > > + } > > + > > + return rc; > > +} > > + > > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, > > + struct page *page, bool is_write) > > +{ > > + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; > > + blk_status_t rc; > > + > > + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, > > + 0, is_write, sector); > > + > > + if (rc == 0) > > + page_endio(page, is_write, 0); > > + > > + return blk_status_to_errno(rc); > > +} > > + > > +#ifndef REQ_FLUSH > > +#define REQ_FLUSH REQ_PREFLUSH > > +#endif > > Is this out-of-tree kernel module compatibility stuff that can be > removed? yes, will check what filesystem expects for flush if not used then remove. > > > + > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > > + struct bio *bio) > > +{ > > + blk_status_t rc = 0; > > + struct bio_vec bvec; > > + struct bvec_iter iter; > > + struct virtio_pmem *pmem = q->queuedata; > > + > > + if (bio->bi_opf & REQ_FLUSH) > > + //todo host flush command > > This detail is critical to the device design. What is the plan? yes, this is good point. was thinking of guest sending a flush command to Qemu which will do a fsync on file fd. If we do a async flush and move the task to wait queue till we receive flush complete reply from host we can allow other tasks to execute in current cpu. Any suggestions you have or anything I am not foreseeing here? 
> > > + > > + bio_for_each_segment(bvec, bio, iter) { > > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > > + bvec.bv_offset, op_is_write(bio_op(bio)), > > + iter.bi_sector); > > + if (rc) { > > + bio->bi_status = rc; > > + break; > > + } > > + } > > + > > + bio_endio(bio); > > + return BLK_QC_T_NONE; > > +} > > + > > +static const struct block_device_operations pmem_fops = { > > + .owner = THIS_MODULE, > > + .rw_page = vpmem_rw_page, > > + //.revalidate_disk = nvdimm_revalidate_disk, > > +}; > > + > > +static int virtio_pmem_probe(struct virtio_device *vdev) > > +{ > > + struct virtio_pmem *vpmem; > > + int err = 0; > > + void *addr; > > + struct resource *res, res_pfn; > > + struct request_queue *q; > > + struct vmem_altmap __altmap, *altmap = NULL; > > + struct gendisk *disk; > > + struct device *gendev; > > + int nid = dev_to_node(&vdev->dev); > > + > > + if (!vdev->config->get) { > > + dev_err(&vdev->dev, "%s failure: config disabled\n", > > + __func__); > > + return -EINVAL; > > + } > > + > > + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), > > + GFP_KERNEL); > > + > > + if (!vpmem) { > > + err = -ENOMEM; > > + goto out; > > + } > > + > > + dev_set_drvdata(&vdev->dev, vpmem); > > + > > + vpmem->vdev = vdev; > > + err = init_vq(vpmem); > > + if (err) > > + goto out; > > + > > + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { > > + dev_err(&vdev->dev, "%s : pmem not supported\n", > > + __func__); > > + goto out; > > + } > > Why is VIRTIO_PMEM_PLUG optional for this new device type if it's > already required by the first version of the driver? I just added it, initially kept it a place holder for any feature bit use. I will remove it if we are not using it. > > err is not set when the error path is taken. 
> > > + > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > > + start, &vpmem->start); > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > > + size, &vpmem->size); > > The struct resource start and size fields can vary between > architectures. Virtio device configuration space layout must not vary > between architectures. Please use u64. sure. > > > + > > + res_pfn.start = vpmem->start; > > + res_pfn.end = vpmem->start + vpmem->size-1; > > + > > + /* used for allocating memmap in the pmem device */ > > + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); > > + > > + res = devm_request_mem_region(&vdev->dev, > > + res_pfn.start, resource_size(&res_pfn), "virtio-pmem"); > > + > > + if (!res) { > > + dev_warn(&vdev->dev, "could not reserve region "); > > + return -EBUSY; > > + } > > + > > + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); > > + > > + if (!q) > > + return -ENOMEM; > > + > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_release_queue, q)) > > + return -ENOMEM; > > + > > + vpmem->pfn_flags = PFN_DEV; > > + > > + /* allocate memap in pmem device itself */ > > + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { > > + > > + addr = devm_memremap_pages(&vdev->dev, res, > > + &q->q_usage_counter, altmap); > > + vpmem->pfn_flags |= PFN_MAP; > > + } else > > + addr = devm_memremap(&vdev->dev, vpmem->start, > > + vpmem->size, ARCH_MEMREMAP_PMEM); > > + > > + /* > > + * At release time the queue must be frozen before > > + * devm_memremap_pages is unwound > > + */ > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_freeze_queue, q)) > > + return -ENOMEM; > > + > > + if (IS_ERR(addr)) > > + return PTR_ERR(addr); > > + > > + vpmem->virt_addr = addr; > > + blk_queue_write_cache(q, 0, 0); > > + blk_queue_make_request(q, virtio_pmem_make_request); > > + blk_queue_physical_block_size(q, PAGE_SIZE); > > + blk_queue_logical_block_size(q, 512); > > + blk_queue_max_hw_sectors(q, UINT_MAX); > > + 
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); > > + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); > > + q->queuedata = vpmem; > > + > > + disk = alloc_disk_node(0, nid); > > + if (!disk) > > + return -ENOMEM; > > The return statements in this function look suspicious. There probably > needs to be a cleanup to avoid memory leaks. Sure! valid point. > > > + vpmem->disk = disk; > > + > > + disk->fops = &pmem_fops; > > + disk->queue = q; > > + disk->flags = GENHD_FL_EXT_DEVT; > > + strcpy(disk->disk_name, "vpmem"); > > + set_capacity(disk, vpmem->size/512); > > + gendev = disk_to_dev(disk); > > + > > + virtio_device_ready(vdev); > > + device_add_disk(&vdev->dev, disk); > > + > > + if (devm_add_action_or_reset(&vdev->dev, > > + virtio_pmem_release_disk, vpmem)) > > + return -ENOMEM; > > + > > + revalidate_disk(disk); > > + return 0; > > +out: > > + vdev->config->del_vqs(vdev); > > + return err; > > +} > > + > > +static struct virtio_driver virtio_pmem_driver = { > > + .feature_table = features, > > + .feature_table_size = ARRAY_SIZE(features), > > + .driver.name = KBUILD_MODNAME, > > + .driver.owner = THIS_MODULE, > > + .id_table = id_table, > > + .probe = virtio_pmem_probe, > > + //.remove = virtio_pmem_remove, > > +}; > > + > > +module_virtio_driver(virtio_pmem_driver); > > +MODULE_DEVICE_TABLE(virtio, id_table); > > +MODULE_DESCRIPTION("Virtio pmem driver"); > > +MODULE_LICENSE("GPL"); > > diff --git a/include/uapi/linux/virtio_pmem.h > > b/include/uapi/linux/virtio_pmem.h > > new file mode 100644 > > index 000000000000..ec0c728c79ba > > --- /dev/null > > +++ b/include/uapi/linux/virtio_pmem.h > > @@ -0,0 +1,55 @@ > > +/* > > + * Virtio pmem Device > > + * > > + * > > + */ > > + > > +#ifndef _LINUX_VIRTIO_PMEM_H > > +#define _LINUX_VIRTIO_PMEM_H > > + > > +#include <linux/types.h> > > +#include <linux/virtio_types.h> > > +#include <linux/virtio_ids.h> > > +#include <linux/virtio_config.h> > > +#include <linux/pfn_t.h> > > +#include <linux/fs.h> > > +#include 
<linux/blk-mq.h> > > +#include <linux/pmem_common.h> > > include/uapi/ headers must compile when included from userspace > applications. The header should define userspace APIs only. > > If you want to define kernel-internal APIs, please add them to > include/linux/ or similar. > > This also means that include/uapi/ headers cannot include > include/linux/pmem_common.h or other kernel headers outside #ifdef > __KERNEL__. o.k > > This header file should only contain struct virtio_pmem_config, > VIRTIO_PMEM_PLUG, and other virtio device definitions. Sure! > > > + > > +bool pmem_should_map_pages(struct device *dev); > > + > > +#define VIRTIO_PMEM_PLUG 0 > > + > > +struct virtio_pmem_config { > > + > > +uint64_t start; > > +uint64_t size; > > +uint64_t align; > > +uint64_t data_offset; > > +}; > > + > > +struct virtio_pmem { > > + > > + struct virtio_device *vdev; > > + struct virtqueue *req_vq; > > + > > + uint64_t start; > > + uint64_t size; > > + uint64_t align; > > + uint64_t data_offset; > > + u64 pfn_flags; > > + void *virt_addr; > > + struct gendisk *disk; > > +} __packed; > > + > > +static struct virtio_device_id id_table[] = { > > + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, > > + { 0 }, > > +}; > > + > > +static unsigned int features[] = { > > + VIRTIO_PMEM_PLUG, > > +}; > > + > > +#endif > > + > > -- > > 2.13.0 > > > Thanks, Pankaj
On Fri, Oct 13, 2017 at 2:44 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: [..] >> +#ifndef REQ_FLUSH >> +#define REQ_FLUSH REQ_PREFLUSH >> +#endif > > Is this out-of-tree kernel module compatibility stuff that can be > removed? Yes, this was copied from the pmem driver where it can also be removed, it was used to workaround a merge order problem in linux-next when these definitions were changed several kernel releases back.
On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote: > > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: > > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > > > + struct bio *bio) > > > +{ > > > + blk_status_t rc = 0; > > > + struct bio_vec bvec; > > > + struct bvec_iter iter; > > > + struct virtio_pmem *pmem = q->queuedata; > > > + > > > + if (bio->bi_opf & REQ_FLUSH) > > > + //todo host flush command > > > > This detail is critical to the device design. What is the plan? > > yes, this is good point. > > was thinking of guest sending a flush command to Qemu which > will do a fsync on file fd. Previously there was discussion about fsyncing a specific file range instead of the whole file. This could perform better in cases where only a subset of dirty pages need to be flushed. One possibility is to design the virtio interface to communicate ranges but the emulation code simply fsyncs the fd for the time being. Later on, if the necessary kernel and userspace interfaces are added, we can make use of the interface. > If we do a async flush and move the task to wait queue till we receive > flush complete reply from host we can allow other tasks to execute > in current cpu. > > Any suggestions you have or anything I am not foreseeing here? My main thought about this patch series is whether pmem should be a virtio-blk feature bit instead of a whole new device. There is quite a bit of overlap between the two. Stefan
On Mon, Oct 16, 2017 at 7:47 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote: >> > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: >> > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, >> > > + struct bio *bio) >> > > +{ >> > > + blk_status_t rc = 0; >> > > + struct bio_vec bvec; >> > > + struct bvec_iter iter; >> > > + struct virtio_pmem *pmem = q->queuedata; >> > > + >> > > + if (bio->bi_opf & REQ_FLUSH) >> > > + //todo host flush command >> > >> > This detail is critical to the device design. What is the plan? >> >> yes, this is good point. >> >> was thinking of guest sending a flush command to Qemu which >> will do a fsync on file fd. > > Previously there was discussion about fsyncing a specific file range > instead of the whole file. This could perform better in cases where > only a subset of dirty pages need to be flushed. > > One possibility is to design the virtio interface to communicate ranges > but the emulation code simply fsyncs the fd for the time being. Later > on, if the necessary kernel and userspace interfaces are added, we can > make use of the interface. Range based is not a natural storage cache management mechanism. All that is it available typically is a full write-cache-flush mechanism and upper layers would need to customized for range-based flushing. >> If we do a async flush and move the task to wait queue till we receive >> flush complete reply from host we can allow other tasks to execute >> in current cpu. >> >> Any suggestions you have or anything I am not foreseeing here? > > My main thought about this patch series is whether pmem should be a > virtio-blk feature bit instead of a whole new device. There is quite a > bit of overlap between the two. I'd be open to that... there's already provisions in the pmem driver for platforms where cpu caches are flushed on power-loss, a virtio mode for this shared-memory case seems reasonable.
> > On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote: > > > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: > > > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > > > > + struct bio *bio) > > > > +{ > > > > + blk_status_t rc = 0; > > > > + struct bio_vec bvec; > > > > + struct bvec_iter iter; > > > > + struct virtio_pmem *pmem = q->queuedata; > > > > + > > > > + if (bio->bi_opf & REQ_FLUSH) > > > > + //todo host flush command > > > > > > This detail is critical to the device design. What is the plan? > > > > yes, this is good point. > > > > was thinking of guest sending a flush command to Qemu which > > will do a fsync on file fd. > > Previously there was discussion about fsyncing a specific file range > instead of the whole file. This could perform better in cases where > only a subset of dirty pages need to be flushed. yes, We had discussion about this and decided to do entire block flush then to range level flush. > > One possibility is to design the virtio interface to communicate ranges > but the emulation code simply fsyncs the fd for the time being. Later > on, if the necessary kernel and userspace interfaces are added, we can > make use of the interface. > > > If we do a async flush and move the task to wait queue till we receive > > flush complete reply from host we can allow other tasks to execute > > in current cpu. > > > > Any suggestions you have or anything I am not foreseeing here? > > My main thought about this patch series is whether pmem should be a > virtio-blk feature bit instead of a whole new device. There is quite a > bit of overlap between the two. Exposing options with existing virtio-blk device to be used as persistent memory range at high level would require additional below features: - Use a persistent memory range with an option to allocate memmap array in the device itself for . - Block operations for DAX and persistent memory range. 
- Bifurcation at filesystem level based on type of virtio-blk device selected. - Bifurcation of flushing interface and communication channel between guest & host. But yes, these features can be dynamically configured based on the type of device added? What if we have a virtio-blk:virtio-pmem (m:n) devices ratio? And the scale involved? If I understand correctly, virtio-blk is a high performance interface with multiqueue support and additional features at the host side like data-plane mode etc. If we bloat it with additional stuff (even when we need them) and provide locking with additional features both at guest as well as host side, we will get a hit in performance? Also, as the requirements of both the interfaces grow, it will be more difficult to maintain? I would prefer simpler interfaces with defined functionality, but yes, common code can be shared and used using well defined wrappers. > > Stefan >
I think this driver is at entirely the wrong level. If you want to expose pmem to a guest with flushing assist do it as pmem, and not a block driver.
> > I think this driver is at entirely the wrong level. > > If you want to expose pmem to a guest with flushing assist do it > as pmem, and not a block driver. Are you saying do it as existing i.e. ACPI pmem-like interface? The reason we have created this new driver is the existing pmem driver does not define proper semantics for guest flushing requests. Regarding block support of driver, we want to achieve DAX support to bypass guest page cache. Also, we want to utilize existing DAX capable file-system interfaces (e.g. fsync) from userspace file API's to trigger the host side flush request. Below link has details of previous discussion: https://marc.info/?l=kvm&m=150091133700361&w=2 Thanks, Pankaj
On Tue, Oct 17, 2017 at 03:40:56AM -0400, Pankaj Gupta wrote: > Are you saying do it as existing i.e ACPI pmem like interface? > The reason we have created this new driver is exiting pmem driver > does not define proper semantics for guest flushing requests. At this point I'm caring about the Linux-internal interface, and for that it should be integrated into the nvdimm subsystem and not a block driver. How the host <-> guest interface looks is a different idea. > > Regarding block support of driver, we want to achieve DAX support > to bypass guest page cache. Also, we want to utilize existing DAX > capable file-system interfaces(e.g fsync) from userspace file API's > to trigger the host side flush request. Well, if you want to support XFS+DAX better don't make it a block devices, because I'll post patches soon to stop using the block device entirely for the DAX case.
> > Are you saying do it as existing i.e ACPI pmem like interface? > > The reason we have created this new driver is exiting pmem driver > > does not define proper semantics for guest flushing requests. > > At this point I'm caring about the Linux-internal interface, and > for that it should be integrated into the nvdimm subsystem and not > a block driver. How the host <-> guest interface looks is a different > idea. > > > > > Regarding block support of driver, we want to achieve DAX support > > to bypass guest page cache. Also, we want to utilize existing DAX > > capable file-system interfaces(e.g fsync) from userspace file API's > > to trigger the host side flush request. > > Well, if you want to support XFS+DAX better don't make it a block > devices, because I'll post patches soon to stop using the block device > entirely for the DAX case. o.k I will look at your patches once they are in mailing list. Thanks for the heads up. If I am guessing it right, we don't need block device additional features for pmem? We can bypass block device features like blk device cache flush etc. Also, still we would be supporting ext4 & XFS filesystem with pmem? If there is time to your patches can you please elaborate on this a bit. Thanks, Pankaj
On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote: > > > > Are you saying do it as existing i.e ACPI pmem like interface? > > > The reason we have created this new driver is exiting pmem driver > > > does not define proper semantics for guest flushing requests. > > > > At this point I'm caring about the Linux-internal interface, and > > for that it should be integrated into the nvdimm subsystem and not > > a block driver. How the host <-> guest interface looks is a different > > idea. > > > > > > > > Regarding block support of driver, we want to achieve DAX support > > > to bypass guest page cache. Also, we want to utilize existing DAX > > > capable file-system interfaces(e.g fsync) from userspace file API's > > > to trigger the host side flush request. > > > > Well, if you want to support XFS+DAX better don't make it a block > > devices, because I'll post patches soon to stop using the block device > > entirely for the DAX case. > > o.k I will look at your patches once they are in mailing list. > Thanks for the heads up. > > If I am guessing it right, we don't need block device additional features > for pmem? We can bypass block device features like blk device cache flush etc. > Also, still we would be supporting ext4 & XFS filesystem with pmem? > > If there is time to your patches can you please elaborate on this a bit. I think the idea is that the nvdimm subsystem already adds block device semantics on top of the struct nvdimms that it manages. See drivers/nvdimm/blk.c. So it would be cleaner to make virtio-pmem an nvdimm bus. This will eliminate the duplication between your driver and drivers/nvdimm/ code. Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm subsystem. The virtio-pmem driver could register itself similar to the existing ACPI NFIT and BIOS e820 pmem drivers. Stefan
On Wed, Oct 18, 2017 at 6:03 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote: >> >> > > Are you saying do it as existing i.e ACPI pmem like interface? >> > > The reason we have created this new driver is exiting pmem driver >> > > does not define proper semantics for guest flushing requests. >> > >> > At this point I'm caring about the Linux-internal interface, and >> > for that it should be integrated into the nvdimm subsystem and not >> > a block driver. How the host <-> guest interface looks is a different >> > idea. >> > >> > > >> > > Regarding block support of driver, we want to achieve DAX support >> > > to bypass guest page cache. Also, we want to utilize existing DAX >> > > capable file-system interfaces(e.g fsync) from userspace file API's >> > > to trigger the host side flush request. >> > >> > Well, if you want to support XFS+DAX better don't make it a block >> > devices, because I'll post patches soon to stop using the block device >> > entirely for the DAX case. >> >> o.k I will look at your patches once they are in mailing list. >> Thanks for the heads up. >> >> If I am guessing it right, we don't need block device additional features >> for pmem? We can bypass block device features like blk device cache flush etc. >> Also, still we would be supporting ext4 & XFS filesystem with pmem? >> >> If there is time to your patches can you please elaborate on this a bit. > > I think the idea is that the nvdimm subsystem already adds block device > semantics on top of the struct nvdimms that it manages. See > drivers/nvdimm/blk.c. > > So it would be cleaner to make virtio-pmem an nvdimm bus. This will > eliminate the duplication between your driver and drivers/nvdimm/ code. > Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm > subsystem. This use case is not "Persistent Memory". Persistent Memory is something you can map and make persistent with CPU instructions. 
Anything that requires a driver call is device driver managed "Shared Memory".
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: > On Wed, Oct 18, 2017 at 6:03 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote: > >> > >> > > Are you saying do it as existing i.e ACPI pmem like interface? > >> > > The reason we have created this new driver is exiting pmem driver > >> > > does not define proper semantics for guest flushing requests. > >> > > >> > At this point I'm caring about the Linux-internal interface, and > >> > for that it should be integrated into the nvdimm subsystem and not > >> > a block driver. How the host <-> guest interface looks is a different > >> > idea. > >> > > >> > > > >> > > Regarding block support of driver, we want to achieve DAX support > >> > > to bypass guest page cache. Also, we want to utilize existing DAX > >> > > capable file-system interfaces(e.g fsync) from userspace file API's > >> > > to trigger the host side flush request. > >> > > >> > Well, if you want to support XFS+DAX better don't make it a block > >> > devices, because I'll post patches soon to stop using the block device > >> > entirely for the DAX case. > >> > >> o.k I will look at your patches once they are in mailing list. > >> Thanks for the heads up. > >> > >> If I am guessing it right, we don't need block device additional features > >> for pmem? We can bypass block device features like blk device cache flush etc. > >> Also, still we would be supporting ext4 & XFS filesystem with pmem? > >> > >> If there is time to your patches can you please elaborate on this a bit. > > > > I think the idea is that the nvdimm subsystem already adds block device > > semantics on top of the struct nvdimms that it manages. See > > drivers/nvdimm/blk.c. > > > > So it would be cleaner to make virtio-pmem an nvdimm bus. This will > > eliminate the duplication between your driver and drivers/nvdimm/ code. > > Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm > > subsystem. 
> > This use case is not "Persistent Memory". Persistent Memory is > something you can map and make persistent with CPU instructions. > Anything that requires a driver call is device driver managed "Shared > Memory". Dan, in that case do you have ideas regarding Christoph Hellwig's comment that this driver should be integrated into the nvdimm subsystem instead of a new block driver? Stefan
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: > This use case is not "Persistent Memory". Persistent Memory is > something you can map and make persistent with CPU instructions. > Anything that requires a driver call is device driver managed "Shared > Memory". How is this any different than the existing nvdimm_flush()? If you really care about the not driver thing it could easily be a write to a doorbell page or a hypercall, but in the end that's just semantics.
On Thu, Oct 19, 2017 at 1:01 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: >> This use case is not "Persistent Memory". Persistent Memory is >> something you can map and make persistent with CPU instructions. >> Anything that requires a driver call is device driver managed "Shared >> Memory". > > How is this any different than the existing nvdimm_flush()? If you > really care about the not driver thing it could easily be a write > to a doorbell page or a hypercall, but in the end that's just semantics. The difference is that nvdimm_flush() is not mandatory, and that the platform will automatically perform the same flush at power-fail. Applications should be able to assume that if they are using MAP_SYNC that no other coordination with the kernel or the hypervisor is necessary. Advertising this as a generic Persistent Memory range to the guest means that the guest could theoretically use it with device-dax where there is no driver or filesystem sync interface. The hypervisor will be waiting for flush notifications and the guest will just issue cache flushes and sfence instructions. So, as far as I can see we need to differentiate this virtio-model from standard "Persistent Memory" to the guest and remove the possibility of guests/applications making the wrong assumption. Non-ODP RDMA in a guest comes to mind...
On Thu, Oct 19, 2017 at 11:21:26AM -0700, Dan Williams wrote: > The difference is that nvdimm_flush() is not mandatory, and that the > platform will automatically perform the same flush at power-fail. > Applications should be able to assume that if they are using MAP_SYNC > that no other coordination with the kernel or the hypervisor is > necessary. > > Advertising this as a generic Persistent Memory range to the guest > means that the guest could theoretically use it with device-dax where > there is no driver or filesystem sync interface. The hypervisor will > be waiting for flush notifications and the guest will just issue cache > flushes and sfence instructions. So, as far as I can see we need to > differentiate this virtio-model from standard "Persistent Memory" to > the guest and remove the possibility of guests/applications making the > wrong assumption. So add a flag that it is not. We already have the nd_volatile type, that is special. For now only in Linux, but I think adding this type to the spec eventually would be very useful for efficiently exposing directly mappable device to VM guests.
On Fri, Oct 20, 2017 at 1:00 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Thu, Oct 19, 2017 at 11:21:26AM -0700, Dan Williams wrote: >> The difference is that nvdimm_flush() is not mandatory, and that the >> platform will automatically perform the same flush at power-fail. >> Applications should be able to assume that if they are using MAP_SYNC >> that no other coordination with the kernel or the hypervisor is >> necessary. >> >> Advertising this as a generic Persistent Memory range to the guest >> means that the guest could theoretically use it with device-dax where >> there is no driver or filesystem sync interface. The hypervisor will >> be waiting for flush notifications and the guest will just issue cache >> flushes and sfence instructions. So, as far as I can see we need to >> differentiate this virtio-model from standard "Persistent Memory" to >> the guest and remove the possibility of guests/applications making the >> wrong assumption. > > So add a flag that it is not. We already have the nd_volatile type, > that is special. For now only in Linux, but I think adding this type > to the spec eventually would be very useful for efficiently exposing > directly mappable device to VM guests. Right, that's the same recommendation I gave. https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html ...so maybe I'm misunderstanding your concern? It sounds like we're on the same page.
On Fri, Oct 20, 2017 at 08:05:09AM -0700, Dan Williams wrote: > Right, that's the same recommendation I gave. > > https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html > > ...so maybe I'm misunderstanding your concern? It sounds like we're on > the same page. Yes, the above is exactly what I think we should do it. And in many ways virtio seems overkill if we could just have a hypercall or doorbell page as the queueing infrastructure in virtio shouldn't really be needed.
On Fri, Oct 20, 2017 at 9:06 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Fri, Oct 20, 2017 at 08:05:09AM -0700, Dan Williams wrote: >> Right, that's the same recommendation I gave. >> >> https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html >> >> ...so maybe I'm misunderstanding your concern? It sounds like we're on >> the same page. > > Yes, the above is exactly what I think we should do it. And in many > ways virtio seems overkill if we could just have a hypercall or doorbell > page as the queueing infrastructure in virtio shouldn't really be > needed. Ah ok, yes, get rid of the virtio-pmem driver and just make nvdimm_flush() do a hypercall based on region-type flag.
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index cff773f15b7e..0192c4bda54b 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY If unsure, say Y. +config VIRTIO_PMEM + tristate "Virtio pmem driver" + depends on VIRTIO + ---help--- + This driver adds persistent memory range within a KVM guest. + It also associates a block device corresponding to the pmem + range. + + If unsure, say M. + config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 41e30e3dc842..032ade725cc2 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c new file mode 100644 index 000000000000..74e47cae0e24 --- /dev/null +++ b/drivers/virtio/virtio_pmem.c @@ -0,0 +1,322 @@ +/* + * virtio-pmem driver + */ + +#include <linux/virtio.h> +#include <linux/swap.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/oom.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include <linux/virtio_pmem.h> + +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr) +{ + devm_memunmap(dev, addr); + devm_release_mem_region(dev, res->start, resource_size(res)); +} + +static void pmem_flush_done(struct virtqueue *vq) +{ + return; +}; + +static void virtio_pmem_release_queue(void *q) +{ + blk_cleanup_queue(q); +} + +static void virtio_pmem_freeze_queue(void *q) +{ + blk_freeze_queue_start(q); +} + +static void virtio_pmem_release_disk(void *__pmem) +{ + struct 
virtio_pmem *pmem = __pmem; + + del_gendisk(pmem->disk); + put_disk(pmem->disk); +} + +static int init_vq(struct virtio_pmem *vpmem) +{ + struct virtqueue *vq; + + /* single vq */ + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue"); + + if (IS_ERR(vq)) + return PTR_ERR(vq); + + return 0; +} + +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, + struct resource *res, struct vmem_altmap *altmap) +{ + u32 start_pad = 0, end_trunc = 0; + resource_size_t start, size; + unsigned long npfns; + phys_addr_t offset; + + size = resource_size(res); + start = PHYS_SECTION_ALIGN_DOWN(res->start); + + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) { + + start = res->start; + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; + } + start = res->start; + size = PHYS_SECTION_ALIGN_UP(start + size) - start; + + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) { + + size = resource_size(res); + end_trunc = start + size - + PHYS_SECTION_ALIGN_DOWN(start + size); + } + + start += start_pad; + size = resource_size(res); + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) + / PAGE_SIZE); + + /* + * vmemmap_populate_hugepages() allocates the memmap array in + * HPAGE_SIZE chunks. 
+ */ + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; + vpmem->data_offset = offset; + + struct vmem_altmap __altmap = { + .base_pfn = init_altmap_base(start+start_pad), + .reserve = init_altmap_reserve(start+start_pad), + }; + + res->start += start_pad; + res->end -= end_trunc; + memcpy(altmap, &__altmap, sizeof(*altmap)); + altmap->free = PHYS_PFN(offset - SZ_8K); + altmap->alloc = 0; + + return altmap; +} + +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page, + unsigned int len, unsigned int off, bool is_write, + sector_t sector) +{ + blk_status_t rc = BLK_STS_OK; + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (!is_write) { + rc = read_pmem(page, off, pmem_addr, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + write_pmem(pmem_addr, page, off, len); + } + + return rc; +} + +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, bool is_write) +{ + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; + blk_status_t rc; + + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, + 0, is_write, sector); + + if (rc == 0) + page_endio(page, is_write, 0); + + return blk_status_to_errno(rc); +} + +#ifndef REQ_FLUSH +#define REQ_FLUSH REQ_PREFLUSH +#endif + +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, + struct bio *bio) +{ + blk_status_t rc = 0; + struct bio_vec bvec; + struct bvec_iter iter; + struct virtio_pmem *pmem = q->queuedata; + + if (bio->bi_opf & REQ_FLUSH) + //todo host flush command + + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, op_is_write(bio_op(bio)), + iter.bi_sector); + if (rc) { + bio->bi_status = rc; + break; + } + } + + bio_endio(bio); + return BLK_QC_T_NONE; +} + +static const struct block_device_operations pmem_fops = { + .owner = THIS_MODULE, + .rw_page = vpmem_rw_page, + 
//.revalidate_disk = nvdimm_revalidate_disk, +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct virtio_pmem *vpmem; + int err = 0; + void *addr; + struct resource *res, res_pfn; + struct request_queue *q; + struct vmem_altmap __altmap, *altmap = NULL; + struct gendisk *disk; + struct device *gendev; + int nid = dev_to_node(&vdev->dev); + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config disabled\n", + __func__); + return -EINVAL; + } + + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), + GFP_KERNEL); + + if (!vpmem) { + err = -ENOMEM; + goto out; + } + + dev_set_drvdata(&vdev->dev, vpmem); + + vpmem->vdev = vdev; + err = init_vq(vpmem); + if (err) + goto out; + + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { + dev_err(&vdev->dev, "%s : pmem not supported\n", + __func__); + goto out; + } + + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res_pfn.start = vpmem->start; + res_pfn.end = vpmem->start + vpmem->size-1; + + /* used for allocating memmap in the pmem device */ + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); + + res = devm_request_mem_region(&vdev->dev, + res_pfn.start, resource_size(&res_pfn), "virtio-pmem"); + + if (!res) { + dev_warn(&vdev->dev, "could not reserve region "); + return -EBUSY; + } + + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); + + if (!q) + return -ENOMEM; + + if (devm_add_action_or_reset(&vdev->dev, + virtio_pmem_release_queue, q)) + return -ENOMEM; + + vpmem->pfn_flags = PFN_DEV; + + /* allocate memap in pmem device itself */ + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { + + addr = devm_memremap_pages(&vdev->dev, res, + &q->q_usage_counter, altmap); + vpmem->pfn_flags |= PFN_MAP; + } else + addr = devm_memremap(&vdev->dev, vpmem->start, + vpmem->size, ARCH_MEMREMAP_PMEM); + + /* + * At release time the queue must be frozen before + * 
devm_memremap_pages is unwound + */ + if (devm_add_action_or_reset(&vdev->dev, + virtio_pmem_freeze_queue, q)) + return -ENOMEM; + + if (IS_ERR(addr)) + return PTR_ERR(addr); + + vpmem->virt_addr = addr; + blk_queue_write_cache(q, 0, 0); + blk_queue_make_request(q, virtio_pmem_make_request); + blk_queue_physical_block_size(q, PAGE_SIZE); + blk_queue_logical_block_size(q, 512); + blk_queue_max_hw_sectors(q, UINT_MAX); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); + q->queuedata = vpmem; + + disk = alloc_disk_node(0, nid); + if (!disk) + return -ENOMEM; + vpmem->disk = disk; + + disk->fops = &pmem_fops; + disk->queue = q; + disk->flags = GENHD_FL_EXT_DEVT; + strcpy(disk->disk_name, "vpmem"); + set_capacity(disk, vpmem->size/512); + gendev = disk_to_dev(disk); + + virtio_device_ready(vdev); + device_add_disk(&vdev->dev, disk); + + if (devm_add_action_or_reset(&vdev->dev, + virtio_pmem_release_disk, vpmem)) + return -ENOMEM; + + revalidate_disk(disk); + return 0; +out: + vdev->config->del_vqs(vdev); + return err; +} + +static struct virtio_driver virtio_pmem_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + //.remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h new file mode 100644 index 000000000000..ec0c728c79ba --- /dev/null +++ b/include/uapi/linux/virtio_pmem.h @@ -0,0 +1,55 @@ +/* + * Virtio pmem Device + * + * + */ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include <linux/types.h> +#include <linux/virtio_types.h> +#include <linux/virtio_ids.h> +#include <linux/virtio_config.h> +#include <linux/pfn_t.h> +#include <linux/fs.h> 
+#include <linux/blk-mq.h> +#include <linux/pmem_common.h> + +bool pmem_should_map_pages(struct device *dev); + +#define VIRTIO_PMEM_PLUG 0 + +struct virtio_pmem_config { + +uint64_t start; +uint64_t size; +uint64_t align; +uint64_t data_offset; +}; + +struct virtio_pmem { + + struct virtio_device *vdev; + struct virtqueue *req_vq; + + uint64_t start; + uint64_t size; + uint64_t align; + uint64_t data_offset; + u64 pfn_flags; + void *virt_addr; + struct gendisk *disk; +} __packed; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { + VIRTIO_PMEM_PLUG, +}; + +#endif +
This patch adds a virtio-pmem driver for KVM guests. The guest reads the persistent memory range information over the virtio bus from Qemu and reserves the range as persistent memory. The guest also allocates a block device corresponding to the pmem range, which can later be accessed with DAX-compatible file systems. The idea is to use the virtio channel between guest and host to perform the block device flush for the guest pmem DAX device. There is still work to do, including DAX file system support and other advanced features. Signed-off-by: Pankaj Gupta <pagupta@redhat.com> --- drivers/virtio/Kconfig | 10 ++ drivers/virtio/Makefile | 1 + drivers/virtio/virtio_pmem.c | 322 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_pmem.h | 55 +++++++ 4 files changed, 388 insertions(+) create mode 100644 drivers/virtio/virtio_pmem.c create mode 100644 include/uapi/linux/virtio_pmem.h