[1/3] pmem: Initial version of persistent memory driver
diff mbox

Message ID 1427358764-6126-2-git-send-email-hch@lst.de
State New, archived
Headers show

Commit Message

Christoph Hellwig March 26, 2015, 8:32 a.m. UTC
From: Ross Zwisler <ross.zwisler@linux.intel.com>

PMEM is a new driver that presents a reserved range of memory as a
block device.  This is useful for developing with NV-DIMMs, and
can be used with volatile memory as a development platform.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[hch: convert to use a platform_device for discovery, fix partition
 support]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 MAINTAINERS            |   6 +
 drivers/block/Kconfig  |  13 ++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 373 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 393 insertions(+)
 create mode 100644 drivers/block/pmem.c

Comments

Dan Williams March 26, 2015, 2:12 p.m. UTC | #1
On Thu, Mar 26, 2015 at 1:32 AM, Christoph Hellwig <hch@lst.de> wrote:
> From: Ross Zwisler <ross.zwisler@linux.intel.com>
>
> PMEM is a new driver that presents a reserved range of memory as a
> block device.  This is useful for developing with NV-DIMMs, and
> can be used with volatile memory as a development platform.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> [hch: convert to use a platform_device for discovery, fix partition
>  support]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  MAINTAINERS            |   6 +
>  drivers/block/Kconfig  |  13 ++
>  drivers/block/Makefile |   1 +
>  drivers/block/pmem.c   | 373 +++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 393 insertions(+)
>  create mode 100644 drivers/block/pmem.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 358eb01..efacf2b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8063,6 +8063,12 @@ S:       Maintained
>  F:     Documentation/blockdev/ramdisk.txt
>  F:     drivers/block/brd.c
>
> +PERSISTENT MEMORY DRIVER
> +M:     Ross Zwisler <ross.zwisler@linux.intel.com>
> +L:     linux-nvdimm@lists.01.org
> +S:     Supported
> +F:     drivers/block/pmem.c
> +
>  RANDOM NUMBER DRIVER
>  M:     "Theodore Ts'o" <tytso@mit.edu>
>  S:     Maintained
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 1b8094d..9284aaf 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -404,6 +404,19 @@ config BLK_DEV_RAM_DAX
>           and will prevent RAM block device backing store memory from being
>           allocated from highmem (only a problem for highmem systems).
>
> +config BLK_DEV_PMEM
> +       tristate "Persistent memory block device support"
> +       help
> +         Saying Y here will allow you to use a contiguous range of reserved
> +         memory as one or more block devices.  Memory for PMEM should be
> +         reserved using the "memmap" kernel parameter.
> +
> +         To compile this driver as a module, choose M here: the module will be
> +         called pmem.
> +
> +         Most normal users won't need this functionality, and can thus say N
> +         here.
> +
>  config CDROM_PKTCDVD
>         tristate "Packet writing on CD/DVD media"
>         depends on !UML
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 02b688d..9cc6c18 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)                += ps3vram.o
>  obj-$(CONFIG_ATARI_FLOPPY)     += ataflop.o
>  obj-$(CONFIG_AMIGA_Z2RAM)      += z2ram.o
>  obj-$(CONFIG_BLK_DEV_RAM)      += brd.o
> +obj-$(CONFIG_BLK_DEV_PMEM)     += pmem.o
>  obj-$(CONFIG_BLK_DEV_LOOP)     += loop.o
>  obj-$(CONFIG_BLK_CPQ_DA)       += cpqarray.o
>  obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> new file mode 100644
> index 0000000..545b13b
> --- /dev/null
> +++ b/drivers/block/pmem.c
> @@ -0,0 +1,373 @@
> +/*
> + * Persistent Memory Driver
> + * Copyright (c) 2014, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * This driver is heavily based on drivers/block/brd.c.
> + * Copyright (C) 2007 Nick Piggin
> + * Copyright (C) 2007 Novell Inc.
> + */
> +
> +#include <asm/cacheflush.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/init.h>
> +#include <linux/platform_device.h>
> +#include <linux/module.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +
> +#define SECTOR_SHIFT           9
> +#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
> +#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
> +
> +#define PMEM_MINORS            16
> +
> +struct pmem_device {
> +       struct request_queue    *pmem_queue;
> +       struct gendisk          *pmem_disk;
> +
> +       /* One contiguous memory region per device */
> +       phys_addr_t             phys_addr;
> +       void                    *virt_addr;
> +       size_t                  size;
> +};
> +
> +static int pmem_major;
> +static atomic_t pmem_index;
> +
> +static int pmem_getgeo(struct block_device *bd, struct hd_geometry *geo)
> +{
> +       /* some standard values */
> +       geo->heads = 1 << 6;
> +       geo->sectors = 1 << 5;
> +       geo->cylinders = get_capacity(bd->bd_disk) >> 11;
> +       return 0;
> +}
> +
> +/*
> + * direct translation from (pmem,sector) => void*
> + * We do not require that sector be page aligned.
> + * The return value will point to the beginning of the page containing the
> + * given sector, not to the sector itself.
> + */
> +static void *pmem_lookup_pg_addr(struct pmem_device *pmem, sector_t sector)
> +{
> +       size_t page_offset = sector >> PAGE_SECTORS_SHIFT;
> +       size_t offset = page_offset << PAGE_SHIFT;
> +
> +       BUG_ON(offset >= pmem->size);
> +       return pmem->virt_addr + offset;
> +}
> +
> +/* sector must be page aligned */
> +static unsigned long pmem_lookup_pfn(struct pmem_device *pmem, sector_t sector)
> +{
> +       size_t page_offset = sector >> PAGE_SECTORS_SHIFT;
> +
> +       BUG_ON(sector & (PAGE_SECTORS - 1));
> +       return (pmem->phys_addr >> PAGE_SHIFT) + page_offset;
> +}
> +
> +/*
> + * sector is not required to be page aligned.
> + * n is at most a single page, but could be less.
> + */
> +static void copy_to_pmem(struct pmem_device *pmem, const void *src,
> +                       sector_t sector, size_t n)
> +{
> +       void *dst;
> +       unsigned int offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
> +       size_t copy;
> +
> +       BUG_ON(n > PAGE_SIZE);
> +
> +       copy = min_t(size_t, n, PAGE_SIZE - offset);
> +       dst = pmem_lookup_pg_addr(pmem, sector);
> +       memcpy(dst + offset, src, copy);
> +
> +       if (copy < n) {
> +               src += copy;
> +               sector += copy >> SECTOR_SHIFT;
> +               copy = n - copy;
> +               dst = pmem_lookup_pg_addr(pmem, sector);
> +               memcpy(dst, src, copy);
> +       }
> +}
> +
> +/*
> + * sector is not required to be page aligned.
> + * n is at most a single page, but could be less.
> + */
> +static void copy_from_pmem(void *dst, struct pmem_device *pmem,
> +                         sector_t sector, size_t n)
> +{
> +       void *src;
> +       unsigned int offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
> +       size_t copy;
> +
> +       BUG_ON(n > PAGE_SIZE);
> +
> +       copy = min_t(size_t, n, PAGE_SIZE - offset);
> +       src = pmem_lookup_pg_addr(pmem, sector);
> +
> +       memcpy(dst, src + offset, copy);
> +
> +       if (copy < n) {
> +               dst += copy;
> +               sector += copy >> SECTOR_SHIFT;
> +               copy = n - copy;
> +               src = pmem_lookup_pg_addr(pmem, sector);
> +               memcpy(dst, src, copy);
> +       }
> +}
> +
> +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
> +                       unsigned int len, unsigned int off, int rw,
> +                       sector_t sector)
> +{
> +       void *mem = kmap_atomic(page);
> +
> +       if (rw == READ) {
> +               copy_from_pmem(mem + off, pmem, sector, len);
> +               flush_dcache_page(page);
> +       } else {
> +               /*
> +                * FIXME: Need more involved flushing to ensure that writes to
> +                * NVDIMMs are actually durable before returning.
> +                */
> +               flush_dcache_page(page);
> +               copy_to_pmem(pmem, mem + off, sector, len);
> +       }
> +
> +       kunmap_atomic(mem);
> +}
> +
> +static void pmem_make_request(struct request_queue *q, struct bio *bio)
> +{
> +       struct block_device *bdev = bio->bi_bdev;
> +       struct pmem_device *pmem = bdev->bd_disk->private_data;
> +       int rw;
> +       struct bio_vec bvec;
> +       sector_t sector;
> +       struct bvec_iter iter;
> +       int err = 0;
> +
> +       sector = bio->bi_iter.bi_sector;
> +       if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
> +               err = -EIO;
> +               goto out;
> +       }
> +
> +       BUG_ON(bio->bi_rw & REQ_DISCARD);
> +
> +       rw = bio_rw(bio);
> +       if (rw == READA)
> +               rw = READ;
> +
> +       bio_for_each_segment(bvec, bio, iter) {
> +               unsigned int len = bvec.bv_len;
> +
> +               BUG_ON(len > PAGE_SIZE);
> +               pmem_do_bvec(pmem, bvec.bv_page, len,
> +                           bvec.bv_offset, rw, sector);
> +               sector += len >> SECTOR_SHIFT;
> +       }
> +
> +out:
> +       bio_endio(bio, err);
> +}
> +
> +static int pmem_rw_page(struct block_device *bdev, sector_t sector,
> +                      struct page *page, int rw)
> +{
> +       struct pmem_device *pmem = bdev->bd_disk->private_data;
> +
> +       pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
> +       page_endio(page, rw & WRITE, 0);
> +       return 0;
> +}
> +
> +static long pmem_direct_access(struct block_device *bdev, sector_t sector,
> +                             void **kaddr, unsigned long *pfn, long size)
> +{
> +       struct pmem_device *pmem = bdev->bd_disk->private_data;
> +
> +       if (!pmem)
> +               return -ENODEV;
> +
> +       *kaddr = pmem_lookup_pg_addr(pmem, sector);
> +       *pfn = pmem_lookup_pfn(pmem, sector);
> +
> +       return pmem->size - (sector * 512);
> +}
> +
> +static const struct block_device_operations pmem_fops = {
> +       .owner =                THIS_MODULE,
> +       .rw_page =              pmem_rw_page,
> +       .direct_access =        pmem_direct_access,
> +       .getgeo =               pmem_getgeo,
> +};
> +
> +/* pmem->phys_addr and pmem->size need to be set.
> + * Will then set virt_addr if successful.
> + */
> +static int pmem_mapmem(struct pmem_device *pmem)
> +{
> +       struct resource *res_mem;
> +       int err;
> +
> +       res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
> +                                              "pmem");

Isn't request_mem_region() enough?  i.e. it seems
request_mem_region_exclusive() assumes no DAX, at least in theory?

> +       if (!res_mem) {
> +               pr_warn("pmem: request_mem_region_exclusive phys=0x%llx size=0x%zx failed\n",
> +                          pmem->phys_addr, pmem->size);
> +               return -EINVAL;
> +       }
> +
> +       /*
> +        * Map the memory as non-cachable, as we can't write back the contents
> +        * of the CPU caches in case of a crash.
> +        */
> +       pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);

This is fine for now, but I think we're going to end up with a
continuum of solutions to this problem based on the platform and the
device.  Some ADR platforms have firmware that takes actions like
flushing caches on a "power going away" signal.  Other platforms have
cache management instructions that we can use on either a per-i/o or
per REQ_FUA/FLUSH request.  Hmm, with this being in the memory map by
default I think this poses a challenge for VIVT caches and aliased
accesses?  We can revisit this when arm support shows up.
Christoph Hellwig March 26, 2015, 2:35 p.m. UTC | #2
On Thu, Mar 26, 2015 at 07:12:23AM -0700, Dan Williams wrote:
> > +       struct resource *res_mem;
> > +       int err;
> > +
> > +       res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
> > +                                              "pmem");
> 
> Isn't request_mem_region() enough?  i.e. it seems
> request_mem_region_exclusive() assumes no DAX, at least in theory?

This is 1:1 from the patch Ross sent, but I've been wondering why
request_mem_region_exclusive is used here.  All it does is set the
IORESOURCE_EXCLUSIVE flag, which prevents /dev/mem and sysfs from accessing
the memory while the driver claims it. Besides pmem, only a watchdog driver
and e1000 make use of this flag, and there are various functions related to
it that are entirely unused.  It's a weird beast.

> This is fine for now, but I think we're going to end up with a
> continuum of solutions to this problem based on the platform and the
> device.  Some ADR platforms have firmware that takes actions like
> flushing caches on a "power going away" signal.  Other platforms have
> cache management instructions that we can use on either a per-i/o or
> per REQ_FUA/FLUSH request.  Hmm, with this being in the memory map by
> default I think this poses a challenge for VIVT caches and aliased
> accesses?  We can revisit this when arm support shows up.

Yes, I expect us to pass flags related to this through the platform_data
eventually, but I think that starting with the simplest and safest version
is probably the best idea.
Boaz Harrosh March 26, 2015, 2:52 p.m. UTC | #3
On 03/26/2015 04:12 PM, Dan Williams wrote:
> On Thu, Mar 26, 2015 at 1:32 AM, Christoph Hellwig <hch@lst.de> wrote:
>> From: Ross Zwisler <ross.zwisler@linux.intel.com>
>>

Dan, something is broken with your mailer program; it keeps dropping the
CC when sending replies.

For example, both Ross and I, who were on CC, got dropped, though Jens Axboe
got added back.

It's not only this email; it is all the emails in this series. Please
check what is going on.

Thanks
Boaz

>> PMEM is a new driver that presents a reserved range of memory as a
>> block device.  This is useful for developing with NV-DIMMs, and
>> can be used with volatile memory as a development platform.
>>
>> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>> [hch: convert to use a platform_device for discovery, fix partition
>>  support]
<>
Dan Williams March 26, 2015, 3:59 p.m. UTC | #4
On Thu, Mar 26, 2015 at 7:52 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 03/26/2015 04:12 PM, Dan Williams wrote:
>> On Thu, Mar 26, 2015 at 1:32 AM, Christoph Hellwig <hch@lst.de> wrote:
>>> From: Ross Zwisler <ross.zwisler@linux.intel.com>
>>>
>
> Dan something is Broken with you mailer program it keeps dropping the
> CC when sending replies.
>
> For example Both me and Ross who were on CC got dropped, Jens Axboe
> though got add back.
>
> Its not only this email, it is all the emails in this series, please
> check what is going on.

They show up in the archives:
https://lists.01.org/pipermail/linux-nvdimm/2015-March/thread.html

Sometimes vger.kernel.org drops intel.com mails, it's outside my control.
Ross Zwisler March 26, 2015, 9:37 p.m. UTC | #5
On Thu, 2015-03-26 at 15:35 +0100, Christoph Hellwig wrote:
> On Thu, Mar 26, 2015 at 07:12:23AM -0700, Dan Williams wrote:
> > > +       struct resource *res_mem;
> > > +       int err;
> > > +
> > > +       res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
> > > +                                              "pmem");
> > 
> > Isn't request_mem_region() enough?  i.e. it seems
> > request_mem_region_exclusive() assumes no DAX, at least in theory?
> 
> This is 1:1 from the patch Ross sent, but I've been wondering why
> request_mem_region_exclusive is used here.  All it does is setting the
> IORESOURCE_EXCLUSIVE flag, which prevents /dev/mem and sysfs from accessing
> the memory while the driver claims it. Besides pmem only a watchdog driver
> and e1000 make use of this flag, and there's various function related to
> it that are entirely unused.  It's a weird beast.

I don't have a compelling reason to use request_mem_region_exclusive()
over request_mem_region().  If the latter is cleaner I'm fine with the
change.

Patch
diff mbox

diff --git a/MAINTAINERS b/MAINTAINERS
index 358eb01..efacf2b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8063,6 +8063,12 @@  S:	Maintained
 F:	Documentation/blockdev/ramdisk.txt
 F:	drivers/block/brd.c
 
+PERSISTENT MEMORY DRIVER
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/block/pmem.c
+
 RANDOM NUMBER DRIVER
 M:	"Theodore Ts'o" <tytso@mit.edu>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..9284aaf 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,19 @@  config BLK_DEV_RAM_DAX
 	  and will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+	help
+	  Saying Y here will allow you to use a contiguous range of reserved
+	  memory as one or more block devices.  Memory for PMEM should be
+	  reserved using the "memmap" kernel parameter.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called pmem.
+
+	  Most normal users won't need this functionality, and can thus say N
+	  here.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@  obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_PMEM)	+= pmem.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 0000000..545b13b
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,373 @@ 
+/*
+ * Persistent Memory Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/brd.c.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define SECTOR_SHIFT		9	/* log2 of the 512-byte sector size */
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)	/* log2 of sectors per page */
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)	/* sectors per page */
+
+#define PMEM_MINORS		16	/* minors per disk: alloc_disk() count and first_minor stride */
+
+struct pmem_device {
+	struct request_queue	*pmem_queue;	/* queue from blk_alloc_queue() in pmem_probe() */
+	struct gendisk		*pmem_disk;	/* disk registered with add_disk() in pmem_probe() */
+
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;	/* physical start of the region (res->start) */
+	void			*virt_addr;	/* ioremap_nocache() mapping; NULL when unmapped */
+	size_t			size;		/* length of the region in bytes */
+};
+
+static int pmem_major;		/* block major returned by register_blkdev() in pmem_init() */
+static atomic_t pmem_index;	/* incremented once per disk to hand out the "pmemN" index */
+
+static int pmem_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+	/* Made-up geometry: 64 heads * 32 sectors = 2048 sectors per cylinder */
+	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+	geo->sectors = 32;
+	geo->heads = 64;
+	return 0;
+}
+
+/*
+ * Map a sector to the kernel virtual address of the page that contains it.
+ * The sector may sit anywhere inside a page; the result always points at the
+ * start of that page, so callers add the intra-page offset themselves.
+ * BUGs if that page starts at or beyond the end of the mapped region.
+ */
+static void *pmem_lookup_pg_addr(struct pmem_device *pmem, sector_t sector)
+{
+	size_t pg_index = sector >> PAGE_SECTORS_SHIFT;
+	size_t byte_off = pg_index << PAGE_SHIFT;
+
+	BUG_ON(pmem->size <= byte_off);
+	return pmem->virt_addr + byte_off;
+}
+
+/* Page frame number for a page-aligned sector; BUGs on an unaligned one. */
+static unsigned long pmem_lookup_pfn(struct pmem_device *pmem, sector_t sector)
+{
+	size_t pg_index = sector >> PAGE_SECTORS_SHIFT;
+
+	BUG_ON((sector & (PAGE_SECTORS - 1)) != 0);
+	return pg_index + (pmem->phys_addr >> PAGE_SHIFT);
+}
+
+/*
+ * Write n bytes from src into pmem starting at sector.  sector may fall
+ * mid-page and n may be anything up to PAGE_SIZE, so at most two pages are hit.
+ */
+static void copy_to_pmem(struct pmem_device *pmem, const void *src,
+			sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+	size_t head, tail;
+	void *page;
+
+	BUG_ON(n > PAGE_SIZE);
+
+	/* Part that fits in the page containing sector. */
+	head = min_t(size_t, n, PAGE_SIZE - in_page);
+	page = pmem_lookup_pg_addr(pmem, sector);
+	memcpy(page + in_page, src, head);
+	if (head >= n)
+		return;
+
+	/* Remainder spills into the following page, from its first byte on. */
+	tail = n - head;
+	page = pmem_lookup_pg_addr(pmem, sector + (head >> SECTOR_SHIFT));
+	memcpy(page, src + head, tail);
+}
+
+/*
+ * Read n bytes from pmem starting at sector into dst.  sector may fall
+ * mid-page and n may be anything up to PAGE_SIZE, so at most two pages are hit.
+ */
+static void copy_from_pmem(void *dst, struct pmem_device *pmem,
+			  sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+	size_t head, tail;
+	void *page;
+
+	BUG_ON(n > PAGE_SIZE);
+
+	/* Part that lives in the page containing sector. */
+	head = min_t(size_t, n, PAGE_SIZE - in_page);
+	page = pmem_lookup_pg_addr(pmem, sector);
+	memcpy(dst, page + in_page, head);
+
+	if (head >= n)
+		return;
+
+	/* Remainder comes from the start of the following page. */
+	tail = n - head;
+	page = pmem_lookup_pg_addr(pmem, sector + (head >> SECTOR_SHIFT));
+	memcpy(dst + head, page, tail);
+}
+
+/* Copy len bytes between page (starting at byte off) and pmem at sector. */
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	void *mem = kmap_atomic(page);
+
+	if (rw != READ) {
+		/*
+		 * FIXME: writes to NVDIMMs need more involved flushing than
+		 * this to be durable by the time we return.
+		 */
+		flush_dcache_page(page);
+		copy_to_pmem(pmem, mem + off, sector, len);
+	} else {
+		copy_from_pmem(mem + off, pmem, sector, len);
+		flush_dcache_page(page);
+	}
+	kunmap_atomic(mem);
+}
+
+/*
+ * bio entry point: bounds-check the bio, then copy it segment by segment.
+ */
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct pmem_device *pmem = disk->private_data;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	int rw;
+
+	/* Fail I/O that would run past the end of the disk. */
+	if (bio_end_sector(bio) > get_capacity(disk)) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
+	BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+	/* Read-ahead is handled exactly like an ordinary read. */
+	rw = bio_rw(bio);
+	if (rw == READA)
+		rw = READ;
+
+	bio_for_each_segment(bvec, bio, iter) {
+		BUG_ON(bvec.bv_len > PAGE_SIZE);
+		pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
+			    bvec.bv_offset, rw, sector);
+		sector += bvec.bv_len >> SECTOR_SHIFT;
+	}
+
+	bio_endio(bio, 0);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, int rw)
+{
+	/* Whole-page transfer from page offset 0; always reports success. */
+	pmem_do_bvec(bdev->bd_disk->private_data, page, PAGE_CACHE_SIZE, 0,
+		     rw, sector);
+	page_endio(page, rw & WRITE, 0);
+	return 0;
+}
+
+/* Direct-access hook: report the kernel address and pfn behind sector. */
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+			      void **kaddr, unsigned long *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+	if (!pmem)
+		return -ENODEV;
+	*kaddr = pmem_lookup_pg_addr(pmem, sector);
+	*pfn = pmem_lookup_pfn(pmem, sector);
+	/* Bytes available from sector up to the end of the region. */
+	return pmem->size - (sector << SECTOR_SHIFT);
+}
+
+static const struct block_device_operations pmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		pmem_rw_page,		/* single-page read/write, no bio involved */
+	.direct_access =	pmem_direct_access,	/* kernel address + pfn lookup for a sector */
+	.getgeo =		pmem_getgeo,		/* synthetic heads/sectors/cylinders */
+};
+
+/*
+ * Reserve and map the range given by pmem->phys_addr and pmem->size (both
+ * set by the caller).  On success, sets pmem->virt_addr and returns 0.
+ * Returns -EINVAL if the range cannot be reserved and -ENXIO if it cannot
+ * be mapped; nothing stays reserved on failure.
+ */
+static int pmem_mapmem(struct pmem_device *pmem)
+{
+	struct resource *res_mem;
+
+	res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
+					       "pmem");
+	if (!res_mem) {
+		/* %pa takes a phys_addr_t by reference, whatever its width. */
+		pr_warn("pmem: request_mem_region_exclusive phys=%pa size=0x%zx failed\n",
+			   &pmem->phys_addr, pmem->size);
+		return -EINVAL;
+	}
+
+	/*
+	 * Map the memory as non-cachable, as we can't write back the contents
+	 * of the CPU caches in case of a crash.
+	 */
+	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+	if (unlikely(!pmem->virt_addr)) {
+		release_mem_region(pmem->phys_addr, pmem->size);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void pmem_unmapmem(struct pmem_device *pmem)
+{
+	/* Nothing to undo unless a mapping is currently recorded. */
+	if (likely(pmem->virt_addr)) {
+		iounmap(pmem->virt_addr);
+		release_mem_region(pmem->phys_addr, pmem->size);
+		pmem->virt_addr = NULL;	/* a repeated call becomes a no-op */
+	}
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+	struct request_queue *q;
+	struct pmem_device *pmem;
+	struct gendisk *disk;
+	struct resource *res;
+	int idx, err;
+
+	/* At most one resource is accepted, and it must be a memory range. */
+	if (WARN_ON(pdev->num_resources > 1))
+		return -ENXIO;
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+	if (unlikely(!pmem))
+		return -ENOMEM;
+	pmem->phys_addr = res->start;
+	pmem->size = resource_size(res);
+
+	err = pmem_mapmem(pmem);
+	if (unlikely(err))
+		goto out_free_dev;
+
+	/* Preset so both allocation failures below report -ENOMEM. */
+	err = -ENOMEM;
+	q = blk_alloc_queue(GFP_KERNEL);
+	pmem->pmem_queue = q;
+	if (unlikely(!q))
+		goto out_unmap;
+	blk_queue_make_request(q, pmem_make_request);
+	blk_queue_max_hw_sectors(q, 1024);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+
+	disk = alloc_disk(PMEM_MINORS);
+	if (unlikely(!disk))
+		goto out_free_queue;
+
+	/* Each disk takes the next index and its own block of PMEM_MINORS. */
+	idx = atomic_inc_return(&pmem_index) - 1;
+	sprintf(disk->disk_name, "pmem%d", idx);
+	disk->first_minor	= PMEM_MINORS * idx;
+	disk->major		= pmem_major;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	disk->fops		= &pmem_fops;
+	disk->queue		= q;
+	disk->private_data	= pmem;
+	disk->driverfs_dev	= &pdev->dev;
+	set_capacity(disk, pmem->size >> SECTOR_SHIFT);
+	pmem->pmem_disk = disk;
+
+	add_disk(disk);
+	platform_set_drvdata(pdev, pmem);
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(q);
+out_unmap:
+	pmem_unmapmem(pmem);
+out_free_dev:
+	kfree(pmem);
+	return err;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+	struct pmem_device *pmem = platform_get_drvdata(pdev);	/* stored by pmem_probe() */
+
+	del_gendisk(pmem->pmem_disk);		/* counterpart of add_disk() in pmem_probe() */
+	put_disk(pmem->pmem_disk);		/* disk came from alloc_disk() in pmem_probe() */
+	blk_cleanup_queue(pmem->pmem_queue);
+	pmem_unmapmem(pmem);			/* iounmap + release_mem_region */
+	kfree(pmem);
+
+	return 0;
+}
+
+static struct platform_driver pmem_driver = {
+	.probe		= pmem_probe,	/* maps the region and registers one disk */
+	.remove		= pmem_remove,	/* undoes everything pmem_probe() set up */
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "pmem",	/* driver name on the platform bus */
+	},
+};
+
+static int __init pmem_init(void)
+{
+	int rc;
+
+	/* Passing 0 requests a major; the returned value is kept for the disks. */
+	pmem_major = register_blkdev(0, "pmem");
+	if (pmem_major < 0)
+		return pmem_major;
+	rc = platform_driver_register(&pmem_driver);
+	if (rc)	/* give the major back if the driver cannot register */
+		unregister_blkdev(pmem_major, "pmem");
+	return rc;
+}
+
+static void pmem_exit(void)
+{
+	platform_driver_unregister(&pmem_driver);	/* reverse order of pmem_init() */
+	unregister_blkdev(pmem_major, "pmem");		/* release the major taken in pmem_init() */
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+
+module_init(pmem_init);
+module_exit(pmem_exit);