diff mbox series

[v12,02/12] bcache: initialize the nvm pages allocator

Message ID 20210811170224.42837-3-colyli@suse.de (mailing list archive)
State New, archived
Headers show
Series bcache: support NVDIMM for journaling | expand

Commit Message

Coly Li Aug. 11, 2021, 5:02 p.m. UTC
From: Jianpeng Ma <jianpeng.ma@intel.com>

This patch defines the prototype in-memory data structures and
initializes the nvm pages allocator.

The nvm address space which is managed by this allocator can consist of
many nvm namespaces, and some namespaces can compose into one nvm set,
like cache set. For this initial implementation, only one set can be
supported.

Users of this nvm pages allocator need to call bch_register_namespace()
to register an nvdimm device (like /dev/pmemX) with this allocator as
an instance of struct bch_nvmpg_ns.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/Kconfig     |  10 +
 drivers/md/bcache/Makefile    |   1 +
 drivers/md/bcache/nvm-pages.c | 339 ++++++++++++++++++++++++++++++++++
 drivers/md/bcache/nvm-pages.h |  96 ++++++++++
 drivers/md/bcache/super.c     |   3 +
 5 files changed, 449 insertions(+)
 create mode 100644 drivers/md/bcache/nvm-pages.c
 create mode 100644 drivers/md/bcache/nvm-pages.h

Comments

Dan Williams Aug. 12, 2021, 5:43 a.m. UTC | #1
On Wed, Aug 11, 2021 at 10:04 AM Coly Li <colyli@suse.de> wrote:
>
> From: Jianpeng Ma <jianpeng.ma@intel.com>
>
> This patch define the prototype data structures in memory and
> initializes the nvm pages allocator.
>
> The nvm address space which is managed by this allocator can consist of
> many nvm namespaces, and some namespaces can compose into one nvm set,
> like cache set. For this initial implementation, only one set can be
> supported.
>
> The users of this nvm pages allocator need to call register_namespace()
> to register the nvdimm device (like /dev/pmemX) into this allocator as
> the instance of struct nvm_namespace.
>
> Reported-by: Randy Dunlap <rdunlap@infradead.org>
> Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
> Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
> Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Hannes Reinecke <hare@suse.de>
> Cc: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/md/bcache/Kconfig     |  10 +
>  drivers/md/bcache/Makefile    |   1 +
>  drivers/md/bcache/nvm-pages.c | 339 ++++++++++++++++++++++++++++++++++
>  drivers/md/bcache/nvm-pages.h |  96 ++++++++++
>  drivers/md/bcache/super.c     |   3 +
>  5 files changed, 449 insertions(+)
>  create mode 100644 drivers/md/bcache/nvm-pages.c
>  create mode 100644 drivers/md/bcache/nvm-pages.h
>
> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
> index d1ca4d059c20..a69f6c0e0507 100644
> --- a/drivers/md/bcache/Kconfig
> +++ b/drivers/md/bcache/Kconfig
> @@ -35,3 +35,13 @@ config BCACHE_ASYNC_REGISTRATION
>         device path into this file will returns immediately and the real
>         registration work is handled in kernel work queue in asynchronous
>         way.
> +
> +config BCACHE_NVM_PAGES
> +       bool "NVDIMM support for bcache (EXPERIMENTAL)"
> +       depends on BCACHE
> +       depends on 64BIT
> +       depends on LIBNVDIMM
> +       depends on DAX
> +       help
> +         Allocate/release NV-memory pages for bcache and provide allocated pages
> +         for each requestor after system reboot.
> diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
> index 5b87e59676b8..2397bb7c7ffd 100644
> --- a/drivers/md/bcache/Makefile
> +++ b/drivers/md/bcache/Makefile
> @@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE)    += bcache.o
>  bcache-y               := alloc.o bset.o btree.o closure.o debug.o extents.o\
>         io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
>         util.o writeback.o features.o
> +bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvm-pages.o
> diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
> new file mode 100644
> index 000000000000..6184c628d9cc
> --- /dev/null
> +++ b/drivers/md/bcache/nvm-pages.c
> @@ -0,0 +1,339 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Nvdimm page-buddy allocator
> + *
> + * Copyright (c) 2021, Intel Corporation.
> + * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
> + * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
> + */
> +
> +#include "bcache.h"
> +#include "nvm-pages.h"
> +
> +#include <linux/slab.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +#include <linux/dax.h>
> +#include <linux/pfn_t.h>
> +#include <linux/libnvdimm.h>
> +#include <linux/mm_types.h>
> +#include <linux/err.h>
> +#include <linux/pagemap.h>
> +#include <linux/bitmap.h>
> +#include <linux/blkdev.h>
> +
> +struct bch_nvmpg_set *global_nvmpg_set;
> +
> +void *bch_nvmpg_offset_to_ptr(unsigned long offset)
> +{
> +       int ns_id = BCH_NVMPG_GET_NS_ID(offset);
> +       struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[ns_id];
> +
> +       if (offset == 0)
> +               return NULL;
> +
> +       ns_id = BCH_NVMPG_GET_NS_ID(offset);
> +       ns = global_nvmpg_set->ns_tbl[ns_id];
> +
> +       if (ns)
> +               return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset));
> +
> +       pr_err("Invalid ns_id %u\n", ns_id);
> +       return NULL;
> +}
> +
> +unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
> +{
> +       int ns_id = ns->ns_id;
> +       unsigned long offset = (unsigned long)(ptr - ns->base_addr);
> +
> +       return BCH_NVMPG_OFFSET(ns_id, offset);
> +}
> +
> +static void release_ns_tbl(struct bch_nvmpg_set *set)
> +{
> +       int i;
> +       struct bch_nvmpg_ns *ns;
> +
> +       for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
> +               ns = set->ns_tbl[i];
> +               if (ns) {
> +                       blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
> +                       set->ns_tbl[i] = NULL;
> +                       set->attached_ns--;
> +                       kfree(ns);
> +               }
> +       }
> +
> +       if (set->attached_ns)
> +               pr_err("unexpected attached_ns: %u\n", set->attached_ns);
> +}
> +
> +static void release_nvmpg_set(struct bch_nvmpg_set *set)
> +{
> +       release_ns_tbl(set);
> +       kfree(set);
> +}
> +
> +/* Namespace 0 contains all meta data of the nvmpg allocation set */
> +static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
> +{
> +       struct bch_nvmpg_set_header *set_header;
> +
> +       if (ns->ns_id != 0) {
> +               pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
> +                      ns->ns_id);
> +               return -EINVAL;
> +       }
> +
> +       set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset);
> +
> +       mutex_lock(&global_nvmpg_set->lock);
> +       global_nvmpg_set->set_header = set_header;
> +       global_nvmpg_set->heads_size = set_header->size;
> +       global_nvmpg_set->heads_used = set_header->used;
> +       mutex_unlock(&global_nvmpg_set->lock);
> +
> +       return 0;
> +}
> +
> +static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
> +{
> +       struct bch_nvmpg_sb *sb = ns->sb;
> +       int rc = 0;
> +
> +       mutex_lock(&global_nvmpg_set->lock);
> +
> +       if (global_nvmpg_set->ns_tbl[sb->this_ns]) {
> +               pr_err("ns_id %u already attached.\n", ns->ns_id);
> +               rc = -EEXIST;
> +               goto unlock;
> +       }
> +
> +       if (ns->ns_id != 0) {
> +               pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id);
> +               rc = -EINVAL;
> +               goto unlock;
> +       }
> +
> +       if (global_nvmpg_set->attached_ns > 0) {
> +               pr_err("multiple namespace attaching not supported yet\n");
> +               rc = -EOPNOTSUPP;
> +               goto unlock;
> +       }
> +
> +       if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) {
> +               pr_err("namespace counters error: attached %u > total %u\n",
> +                      global_nvmpg_set->attached_ns,
> +                      global_nvmpg_set->total_ns);
> +               rc = -EINVAL;
> +               goto unlock;
> +       }
> +
> +       memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16);
> +       global_nvmpg_set->ns_tbl[sb->this_ns] = ns;
> +       global_nvmpg_set->attached_ns++;
> +       global_nvmpg_set->total_ns = sb->total_ns;
> +
> +unlock:
> +       mutex_unlock(&global_nvmpg_set->lock);
> +       return rc;
> +}
> +
> +static int read_nvdimm_meta_super(struct block_device *bdev,
> +                                 struct bch_nvmpg_ns *ns)
> +{
> +       struct page *page;
> +       struct bch_nvmpg_sb *sb;
> +       uint64_t expected_csum = 0;
> +       int r;
> +
> +       page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
> +                               BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
> +
> +       if (IS_ERR(page))
> +               return -EIO;
> +
> +       sb = (struct bch_nvmpg_sb *)
> +            (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET));
> +
> +       r = -EINVAL;
> +       expected_csum = csum_set(sb);
> +       if (expected_csum != sb->csum) {
> +               pr_info("csum is not match with expected one\n");
> +               goto put_page;
> +       }
> +
> +       if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
> +               pr_info("invalid bch_nvmpg_magic\n");
> +               goto put_page;
> +       }
> +
> +       if (sb->sb_offset !=
> +           BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) {
> +               pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset);
> +               goto put_page;
> +       }
> +
> +       r = -EOPNOTSUPP;
> +       if (sb->total_ns != 1) {
> +               pr_info("multiple name space not supported yet.\n");
> +               goto put_page;
> +       }
> +
> +
> +       r = 0;
> +       /* Necessary for DAX mapping */
> +       ns->page_size = sb->page_size;
> +       ns->pages_total = sb->pages_total;
> +
> +put_page:
> +       put_page(page);
> +       return r;
> +}
> +
> +struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
> +{
> +       struct bch_nvmpg_ns *ns = NULL;
> +       struct bch_nvmpg_sb *sb = NULL;
> +       char buf[BDEVNAME_SIZE];
> +       struct block_device *bdev;
> +       pgoff_t pgoff;
> +       int id, err;
> +       char *path;
> +       long dax_ret = 0;
> +
> +       path = kstrndup(dev_path, 512, GFP_KERNEL);
> +       if (!path) {
> +               pr_err("kstrndup failed\n");
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
> +       bdev = blkdev_get_by_path(strim(path),
> +                                 FMODE_READ|FMODE_WRITE|FMODE_EXEC,
> +                                 global_nvmpg_set);
> +       if (IS_ERR(bdev)) {
> +               pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
> +               kfree(path);
> +               return ERR_PTR(PTR_ERR(bdev));
> +       }
> +
> +       err = -ENOMEM;
> +       ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL);
> +       if (!ns)
> +               goto bdput;
> +
> +       err = -EIO;
> +       if (read_nvdimm_meta_super(bdev, ns)) {
> +               pr_err("%s read nvdimm meta super block failed.\n",
> +                      bdevname(bdev, buf));
> +               goto free_ns;
> +       }
> +
> +       err = -EOPNOTSUPP;
> +       if (!bdev_dax_supported(bdev, ns->page_size)) {
> +               pr_err("%s don't support DAX\n", bdevname(bdev, buf));
> +               goto free_ns;
> +       }
> +
> +       err = -EINVAL;
> +       if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
> +               pr_err("invalid offset of %s\n", bdevname(bdev, buf));
> +               goto free_ns;
> +       }
> +
> +       err = -ENOMEM;
> +       ns->dax_dev = fs_dax_get_by_bdev(bdev);
> +       if (!ns->dax_dev) {
> +               pr_err("can't by dax device by %s\n", bdevname(bdev, buf));
> +               goto free_ns;
> +       }
> +
> +       err = -EINVAL;
> +       id = dax_read_lock();
> +       dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
> +                                   &ns->base_addr, &ns->start_pfn);
> +       if (dax_ret <= 0) {
> +               pr_err("dax_direct_access error\n");
> +               dax_read_unlock(id);
> +               goto free_ns;
> +       }
> +
> +       if (dax_ret < ns->pages_total) {
> +               pr_warn("mapped range %ld is less than ns->pages_total %lu\n",
> +                       dax_ret, ns->pages_total);

This failure will become a common occurrence with CXL namespaces that
will have discontiguous range support. It's already the case for
dax-devices for soft-reserved memory [1]. In the CXL case the
discontinuity will be 256MB aligned, for the soft-reserved dax-devices
the discontinuity granularity can be as small as 4K.

[1]: https://elixir.bootlin.com/linux/v5.14-rc5/source/drivers/dax/device.c#L414
Coly Li Aug. 12, 2021, 8:26 a.m. UTC | #2
On 8/12/21 1:43 PM, Dan Williams wrote:
> On Wed, Aug 11, 2021 at 10:04 AM Coly Li <colyli@suse.de> wrote:
>> From: Jianpeng Ma <jianpeng.ma@intel.com>
>>
>> This patch define the prototype data structures in memory and
>> initializes the nvm pages allocator.
>>
>> The nvm address space which is managed by this allocator can consist of
>> many nvm namespaces, and some namespaces can compose into one nvm set,
>> like cache set. For this initial implementation, only one set can be
>> supported.
>>
>> The users of this nvm pages allocator need to call register_namespace()
>> to register the nvdimm device (like /dev/pmemX) into this allocator as
>> the instance of struct nvm_namespace.
>>
>> Reported-by: Randy Dunlap <rdunlap@infradead.org>
>> Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
>> Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
>> Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Dan Williams <dan.j.williams@intel.com>
>> Cc: Hannes Reinecke <hare@suse.de>
>> Cc: Jens Axboe <axboe@kernel.dk>
>> ---
>>  drivers/md/bcache/Kconfig     |  10 +
>>  drivers/md/bcache/Makefile    |   1 +
>>  drivers/md/bcache/nvm-pages.c | 339 ++++++++++++++++++++++++++++++++++
>>  drivers/md/bcache/nvm-pages.h |  96 ++++++++++
>>  drivers/md/bcache/super.c     |   3 +
>>  5 files changed, 449 insertions(+)
>>  create mode 100644 drivers/md/bcache/nvm-pages.c
>>  create mode 100644 drivers/md/bcache/nvm-pages.h
>>
[snipped]
>> +
>> +       err = -EOPNOTSUPP;
>> +       if (!bdev_dax_supported(bdev, ns->page_size)) {
>> +               pr_err("%s don't support DAX\n", bdevname(bdev, buf));
>> +               goto free_ns;
>> +       }
>> +
>> +       err = -EINVAL;
>> +       if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
>> +               pr_err("invalid offset of %s\n", bdevname(bdev, buf));
>> +               goto free_ns;
>> +       }
>> +
>> +       err = -ENOMEM;
>> +       ns->dax_dev = fs_dax_get_by_bdev(bdev);
>> +       if (!ns->dax_dev) {
>> +               pr_err("can't by dax device by %s\n", bdevname(bdev, buf));
>> +               goto free_ns;
>> +       }
>> +
>> +       err = -EINVAL;
>> +       id = dax_read_lock();
>> +       dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
>> +                                   &ns->base_addr, &ns->start_pfn);
>> +       if (dax_ret <= 0) {
>> +               pr_err("dax_direct_access error\n");
>> +               dax_read_unlock(id);
>> +               goto free_ns;
>> +       }
>> +
>> +       if (dax_ret < ns->pages_total) {
>> +               pr_warn("mapped range %ld is less than ns->pages_total %lu\n",
>> +                       dax_ret, ns->pages_total);

Hi Dan,

Many thanks for your information.

> This failure will become a common occurrence with CXL namespaces that
> will have discontiguous range support. It's already the case for
> dax-devices for soft-reserved memory [1]. In the CXL case the
> discontinuity will be 256MB aligned, for the soft-reserved dax-devices
> the discontinuity granularity can be as small as 4K.
>
> [1]: https://elixir.bootlin.com/linux/v5.14-rc5/source/drivers/dax/device.c#L414

Fortunately the on-media allocation list format already works with multiple
ranges in a namespace. The in-memory struct bch_nvmpg_ns, however, currently
assumes the namespace is one flat contiguous range. Yes, we need to
consider and support multiple ranges in struct bch_nvmpg_ns so that buddy
allocator initialization can skip the discontiguous gaps.

It will be on the to-do list for the next round of work. Thanks for your
comments and hints.

Coly Li
diff mbox series

Patch

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index d1ca4d059c20..a69f6c0e0507 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -35,3 +35,13 @@  config BCACHE_ASYNC_REGISTRATION
 	device path into this file will returns immediately and the real
 	registration work is handled in kernel work queue in asynchronous
 	way.
+
+config BCACHE_NVM_PAGES
+	bool "NVDIMM support for bcache (EXPERIMENTAL)"
+	depends on BCACHE
+	depends on 64BIT
+	depends on LIBNVDIMM
+	depends on DAX
+	help
+	  Allocate/release NV-memory pages for bcache and provide allocated pages
+	  for each requestor after system reboot.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..2397bb7c7ffd 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -5,3 +5,4 @@  obj-$(CONFIG_BCACHE)	+= bcache.o
 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
 	util.o writeback.o features.o
+bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvm-pages.o
diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
new file mode 100644
index 000000000000..6184c628d9cc
--- /dev/null
+++ b/drivers/md/bcache/nvm-pages.c
@@ -0,0 +1,339 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Nvdimm page-buddy allocator
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
+ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
+ */
+
+#include "bcache.h"
+#include "nvm-pages.h"
+
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/dax.h>
+#include <linux/pfn_t.h>
+#include <linux/libnvdimm.h>
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#include <linux/pagemap.h>
+#include <linux/bitmap.h>
+#include <linux/blkdev.h>
+
+struct bch_nvmpg_set *global_nvmpg_set;
+
+/*
+ * Translate an nvmpg offset into a kernel pointer inside the DAX mapping of
+ * the namespace it belongs to. The namespace id is taken from the bits above
+ * BCH_NVMPG_OFFSET_BITS, the remaining bits are added to ns->base_addr.
+ *
+ * Returns NULL when @offset is 0 or when no namespace is attached under the
+ * encoded namespace id.
+ */
+void *bch_nvmpg_offset_to_ptr(unsigned long offset)
+{
+	unsigned int ns_id;
+	struct bch_nvmpg_ns *ns;
+
+	/* Offset 0 is the "no pointer" value; do not touch ns_tbl[] for it. */
+	if (offset == 0)
+		return NULL;
+
+	ns_id = BCH_NVMPG_GET_NS_ID(offset);
+	ns = global_nvmpg_set->ns_tbl[ns_id];
+	if (!ns) {
+		pr_err("Invalid ns_id %u\n", ns_id);
+		return NULL;
+	}
+
+	return ns->base_addr + BCH_NVMPG_GET_OFFSET(offset);
+}
+
+/*
+ * Encode @ptr as an nvmpg offset: its distance from ns->base_addr combined
+ * with ns->ns_id through BCH_NVMPG_OFFSET().
+ */
+unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr)
+{
+	unsigned long delta = (unsigned long)(ptr - ns->base_addr);
+
+	return BCH_NVMPG_OFFSET(ns->ns_id, delta);
+}
+
+/*
+ * Detach every namespace from @set: drop the DAX device and block device
+ * references taken in bch_register_namespace(), clear the table slot and
+ * free the in-memory namespace descriptor.
+ */
+static void release_ns_tbl(struct bch_nvmpg_set *set)
+{
+	int i;
+
+	for (i = 0; i < BCH_NVMPG_NS_MAX; i++) {
+		struct bch_nvmpg_ns *ns = set->ns_tbl[i];
+
+		if (!ns)
+			continue;
+
+		/* Pairs with fs_dax_get_by_bdev() at registration time. */
+		fs_put_dax(ns->dax_dev);
+		blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+		set->ns_tbl[i] = NULL;
+		set->attached_ns--;
+		kfree(ns);
+	}
+
+	/* attached_ns is a signed int, so print it as one. */
+	if (set->attached_ns)
+		pr_err("unexpected attached_ns: %d\n", set->attached_ns);
+}
+
+/*
+ * Release all namespaces of @set and free the set itself.
+ * A NULL @set is accepted and ignored: bch_nvmpg_init() can fail and leave
+ * global_nvmpg_set unallocated while bch_nvmpg_exit() is still called.
+ */
+static void release_nvmpg_set(struct bch_nvmpg_set *set)
+{
+	if (!set)
+		return;
+
+	release_ns_tbl(set);
+	kfree(set);
+}
+
+/*
+ * Namespace 0 contains all meta data of the nvmpg allocation set.
+ *
+ * Resolve the set header referenced by the superblock of @ns and cache its
+ * pointer, size and used counters in global_nvmpg_set.
+ *
+ * Returns 0 on success, -EINVAL when @ns is not namespace 0 or when the set
+ * header offset cannot be translated into a pointer.
+ */
+static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns)
+{
+	struct bch_nvmpg_set_header *set_header;
+
+	if (ns->ns_id != 0) {
+		pr_err("unexpected ns_id %u for first nvmpg namespace.\n",
+		       ns->ns_id);
+		return -EINVAL;
+	}
+
+	/* bch_nvmpg_offset_to_ptr() returns NULL for offset 0 or unknown ns */
+	set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset);
+	if (!set_header) {
+		pr_err("invalid set header offset 0x%llx\n",
+		       (unsigned long long)ns->sb->set_header_offset);
+		return -EINVAL;
+	}
+
+	mutex_lock(&global_nvmpg_set->lock);
+	global_nvmpg_set->set_header = set_header;
+	global_nvmpg_set->heads_size = set_header->size;
+	global_nvmpg_set->heads_used = set_header->used;
+	mutex_unlock(&global_nvmpg_set->lock);
+
+	return 0;
+}
+
+/*
+ * Publish @ns in global_nvmpg_set->ns_tbl[] under the slot named by its
+ * superblock (sb->this_ns) and record the set uuid and namespace count.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range or unexpected namespace
+ * id or inconsistent counters, -EEXIST when the slot is already taken, and
+ * -EOPNOTSUPP when another namespace is already attached.
+ */
+static int attach_nvmpg_set(struct bch_nvmpg_ns *ns)
+{
+	struct bch_nvmpg_sb *sb = ns->sb;
+	int rc = 0;
+
+	mutex_lock(&global_nvmpg_set->lock);
+
+	/* this_ns is read from media; never index ns_tbl[] with it unchecked */
+	if (sb->this_ns >= BCH_NVMPG_NS_MAX) {
+		pr_err("invalid namespace id %u in superblock\n", sb->this_ns);
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	if (global_nvmpg_set->ns_tbl[sb->this_ns]) {
+		pr_err("ns_id %u already attached.\n", ns->ns_id);
+		rc = -EEXIST;
+		goto unlock;
+	}
+
+	if (ns->ns_id != 0) {
+		pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id);
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	if (global_nvmpg_set->attached_ns > 0) {
+		pr_err("multiple namespace attaching not supported yet\n");
+		rc = -EOPNOTSUPP;
+		goto unlock;
+	}
+
+	/* Report the value that is actually compared: sb->total_ns */
+	if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) {
+		pr_err("namespace counters error: attached %u > total %u\n",
+		       global_nvmpg_set->attached_ns,
+		       sb->total_ns);
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16);
+	global_nvmpg_set->ns_tbl[sb->this_ns] = ns;
+	global_nvmpg_set->attached_ns++;
+	global_nvmpg_set->total_ns = sb->total_ns;
+
+unlock:
+	mutex_unlock(&global_nvmpg_set->lock);
+	return rc;
+}
+
+/*
+ * Read the nvmpg superblock of @bdev through the page cache and validate its
+ * checksum, magic, recorded offset and namespace count. On success the page
+ * size and total page count needed for the DAX mapping are copied into @ns.
+ *
+ * Returns 0 on success, -EIO when the page cannot be read, -EINVAL when a
+ * consistency check fails, or -EOPNOTSUPP when total_ns is not 1.
+ */
+static int read_nvdimm_meta_super(struct block_device *bdev,
+				  struct bch_nvmpg_ns *ns)
+{
+	struct bch_nvmpg_sb *sb;
+	struct page *page;
+	int ret = -EINVAL;
+
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT,
+				   GFP_KERNEL);
+	if (IS_ERR(page))
+		return -EIO;
+
+	sb = page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET);
+
+	if (csum_set(sb) != sb->csum) {
+		pr_info("csum is not match with expected one\n");
+		goto release;
+	}
+
+	if (memcmp(sb->magic, bch_nvmpg_magic, 16) != 0) {
+		pr_info("invalid bch_nvmpg_magic\n");
+		goto release;
+	}
+
+	if (sb->sb_offset !=
+	    BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) {
+		pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset);
+		goto release;
+	}
+
+	if (sb->total_ns != 1) {
+		pr_info("multiple name space not supported yet.\n");
+		ret = -EOPNOTSUPP;
+		goto release;
+	}
+
+	/* Necessary for DAX mapping */
+	ns->page_size = sb->page_size;
+	ns->pages_total = sb->pages_total;
+	ret = 0;
+
+release:
+	put_page(page);
+	return ret;
+}
+
+/*
+ * Register the nvdimm block device at @dev_path as an nvmpg namespace.
+ *
+ * The device is opened, its superblock is validated through the page cache,
+ * the device is mapped with DAX, the superblock is re-checked through that
+ * mapping, and the namespace is attached to global_nvmpg_set.
+ *
+ * Returns the new namespace on success or an ERR_PTR() on failure. On
+ * failure every reference taken here (DAX device, block device) is dropped
+ * and the namespace is removed from the set again before it is freed.
+ */
+struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+{
+	struct bch_nvmpg_ns *ns = NULL;
+	struct bch_nvmpg_sb *sb = NULL;
+	char buf[BDEVNAME_SIZE];
+	struct block_device *bdev;
+	pgoff_t pgoff;
+	int id, err;
+	char *path;
+	long dax_ret = 0;
+
+	path = kstrndup(dev_path, 512, GFP_KERNEL);
+	if (!path) {
+		pr_err("kstrndup failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	bdev = blkdev_get_by_path(strim(path),
+				  FMODE_READ|FMODE_WRITE|FMODE_EXEC,
+				  global_nvmpg_set);
+	if (IS_ERR(bdev)) {
+		pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
+		kfree(path);
+		return ERR_CAST(bdev);
+	}
+
+	err = -ENOMEM;
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		goto bdput;
+
+	/* Keep the specific reason (-EIO, -EINVAL, -EOPNOTSUPP) for the caller */
+	err = read_nvdimm_meta_super(bdev, ns);
+	if (err) {
+		pr_err("%s read nvdimm meta super block failed.\n",
+		       bdevname(bdev, buf));
+		goto free_ns;
+	}
+
+	err = -EOPNOTSUPP;
+	if (!bdev_dax_supported(bdev, ns->page_size)) {
+		pr_err("%s don't support DAX\n", bdevname(bdev, buf));
+		goto free_ns;
+	}
+
+	err = -EINVAL;
+	if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
+		pr_err("invalid offset of %s\n", bdevname(bdev, buf));
+		goto free_ns;
+	}
+
+	err = -ENOMEM;
+	ns->dax_dev = fs_dax_get_by_bdev(bdev);
+	if (!ns->dax_dev) {
+		pr_err("can't get dax device by %s\n", bdevname(bdev, buf));
+		goto free_ns;
+	}
+
+	/* From here on every failure must drop the dax_dev reference. */
+	err = -EINVAL;
+	id = dax_read_lock();
+	dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
+				    &ns->base_addr, &ns->start_pfn);
+	dax_read_unlock(id);
+	if (dax_ret <= 0) {
+		pr_err("dax_direct_access error\n");
+		goto put_dax_dev;
+	}
+
+	/*
+	 * NOTE(review): a short mapping is only warned about, the namespace
+	 * is still treated as one contiguous range of pages_total pages.
+	 */
+	if (dax_ret < ns->pages_total) {
+		pr_warn("mapped range %ld is less than ns->pages_total %lu\n",
+			dax_ret, ns->pages_total);
+	}
+
+	sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET);
+
+	/* Check magic again to make sure DAX mapping is correct */
+	if (memcmp(sb->magic, bch_nvmpg_magic, 16)) {
+		pr_err("invalid bch_nvmpg_magic after DAX mapping\n");
+		goto put_dax_dev;
+	}
+
+	if ((global_nvmpg_set->attached_ns > 0) &&
+	     memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) {
+		pr_err("set uuid does not match with ns_id %u\n", ns->ns_id);
+		goto put_dax_dev;
+	}
+
+	if (sb->set_header_offset !=
+	    BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) {
+		pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n",
+		       sb->this_ns,
+		       BCH_NVMPG_GET_NS_ID(sb->set_header_offset),
+		       BCH_NVMPG_GET_OFFSET(sb->set_header_offset));
+		goto put_dax_dev;
+	}
+
+	/*
+	 * NOTE(review): ns->ns_id is never copied from sb->this_ns and stays 0
+	 * from kzalloc(); confirm that is intended while only one namespace
+	 * is supported.
+	 */
+	ns->page_size = sb->page_size;
+	ns->pages_offset = sb->pages_offset;
+	ns->pages_total = sb->pages_total;
+	ns->sb = sb;
+	ns->free = 0;
+	ns->bdev = bdev;
+	ns->set = global_nvmpg_set;
+	/* The lock must be usable before ns becomes visible in ns_tbl[]. */
+	mutex_init(&ns->lock);
+
+	err = attach_nvmpg_set(ns);
+	if (err < 0)
+		goto put_dax_dev;
+
+	err = init_nvmpg_set_header(ns);
+	if (err < 0)
+		goto detach_ns;
+
+	kfree(path);
+	return ns;
+
+detach_ns:
+	/* Unpublish ns so that ns_tbl[] never points at freed memory. */
+	mutex_lock(&global_nvmpg_set->lock);
+	global_nvmpg_set->ns_tbl[sb->this_ns] = NULL;
+	global_nvmpg_set->attached_ns--;
+	mutex_unlock(&global_nvmpg_set->lock);
+put_dax_dev:
+	fs_put_dax(ns->dax_dev);
+free_ns:
+	kfree(ns);
+bdput:
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+	kfree(path);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bch_register_namespace);
+
+/*
+ * Allocate the single global nvmpg set and initialize its lock.
+ * Returns 0 on success or -ENOMEM when the set cannot be allocated.
+ */
+int __init bch_nvmpg_init(void)
+{
+	struct bch_nvmpg_set *set;
+
+	set = kzalloc(sizeof(struct bch_nvmpg_set), GFP_KERNEL);
+	if (set == NULL)
+		return -ENOMEM;
+
+	set->total_ns = 0;
+	mutex_init(&set->lock);
+	global_nvmpg_set = set;
+
+	pr_info("bcache nvm init\n");
+	return 0;
+}
+
+/*
+ * Tear down the global nvmpg set. Safe to call when bch_nvmpg_init() failed
+ * and the set was never allocated; the global pointer is cleared so nothing
+ * can reach the freed set afterwards.
+ */
+void bch_nvmpg_exit(void)
+{
+	if (global_nvmpg_set) {
+		release_nvmpg_set(global_nvmpg_set);
+		global_nvmpg_set = NULL;
+	}
+	pr_info("bcache nvm exit\n");
+}
diff --git a/drivers/md/bcache/nvm-pages.h b/drivers/md/bcache/nvm-pages.h
new file mode 100644
index 000000000000..827cff695608
--- /dev/null
+++ b/drivers/md/bcache/nvm-pages.h
@@ -0,0 +1,96 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHE_NVM_PAGES_H
+#define _BCACHE_NVM_PAGES_H
+
+#include <linux/bcache-nvm.h>
+#include <linux/libnvdimm.h>
+
+/*
+ * Bcache NVDIMM in memory data structures
+ */
+
+/*
+ * The following in-memory structures record which page(s) are allocated
+ * to which owner. After a reboot or power failure, they are initialized
+ * from the nvm pages superblock on the NVDIMM device.
+ *
+ * struct bch_nvmpg_ns describes one registered namespace; it is filled in
+ * by bch_register_namespace().
+ */
+struct bch_nvmpg_ns {
+	struct bch_nvmpg_sb *sb;	/* superblock inside the DAX mapping */
+	void *base_addr;		/* kernel address from dax_direct_access() */
+
+	unsigned char uuid[16];
+	int ns_id;			/* namespace id encoded into nvmpg offsets */
+	unsigned int page_size;		/* copied from sb->page_size */
+	unsigned long free;		/* set to 0 at registration */
+	unsigned long pages_offset;	/* copied from sb->pages_offset */
+	unsigned long pages_total;	/* copied from sb->pages_total */
+	pfn_t start_pfn;		/* pfn returned by dax_direct_access() */
+
+	struct dax_device *dax_dev;	/* from fs_dax_get_by_bdev() */
+	struct block_device *bdev;	/* opened with blkdev_get_by_path() */
+	struct bch_nvmpg_set *set;	/* owning set (global_nvmpg_set) */
+
+	struct mutex lock;
+};
+
+/*
+ * A set of namespaces. Currently only one set can be supported.
+ * The fields below are updated under @lock by attach_nvmpg_set() and
+ * init_nvmpg_set_header().
+ */
+struct bch_nvmpg_set {
+	unsigned char set_uuid[16];	/* copied from sb->set_uuid on attach */
+
+	int heads_size;			/* copied from set_header->size */
+	int heads_used;			/* copied from set_header->used */
+	struct bch_nvmpg_set_header *set_header;	/* resolved from sb->set_header_offset */
+
+	struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX];	/* indexed by sb->this_ns */
+	int total_ns;			/* copied from sb->total_ns on attach */
+	int attached_ns;		/* number of non-NULL ns_tbl[] entries */
+
+	struct mutex lock;
+};
+
+#define BCH_NVMPG_NS_ID_BITS	3	/* width of the namespace id field */
+#define BCH_NVMPG_OFFSET_BITS	61	/* width of the in-namespace offset field */
+#define BCH_NVMPG_NS_ID_MASK	((1UL<<BCH_NVMPG_NS_ID_BITS) - 1)
+#define BCH_NVMPG_OFFSET_MASK	((1UL<<BCH_NVMPG_OFFSET_BITS) - 1)
+
+#define BCH_NVMPG_GET_NS_ID(offset)					\
+	(((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK)	/* top 3 bits */
+
+#define BCH_NVMPG_GET_OFFSET(offset)	((offset) & BCH_NVMPG_OFFSET_MASK)	/* low 61 bits */
+
+#define BCH_NVMPG_OFFSET(ns_id, offset)					\
+	((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) |	\
+	 ((offset) & BCH_NVMPG_OFFSET_MASK))	/* pack ns id and offset into one value */
+
+/* Indicate which field in bch_nvmpg_sb to be updated */
+#define BCH_NVMPG_TOTAL_NS	0	/* total_ns */
+
+void *bch_nvmpg_offset_to_ptr(unsigned long offset);	/* NULL for offset 0 or unattached ns */
+unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr);
+
+#if defined(CONFIG_BCACHE_NVM_PAGES)
+
+struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path);	/* ERR_PTR() on failure */
+int bch_nvmpg_init(void);
+void bch_nvmpg_exit(void);
+
+#else /* no-op stubs used when CONFIG_BCACHE_NVM_PAGES is not set */
+
+static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path)
+{
+	return NULL;	/* NOTE(review): real function returns ERR_PTR(), not NULL - confirm callers */
+}
+
+static inline int bch_nvmpg_init(void)
+{
+	return 0;
+}
+
+static inline void bch_nvmpg_exit(void) { }
+
+#endif /* CONFIG_BCACHE_NVM_PAGES */
+
+#endif /* _BCACHE_NVM_PAGES_H */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 185246a0d855..4326ffa0d21f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -14,6 +14,7 @@ 
 #include "request.h"
 #include "writeback.h"
 #include "features.h"
+#include "nvm-pages.h"
 
 #include <linux/blkdev.h>
 #include <linux/pagemap.h>
@@ -2809,6 +2810,7 @@  static void bcache_exit(void)
 {
 	bch_debug_exit();
 	bch_request_exit();
+	bch_nvmpg_exit();
 	if (bcache_kobj)
 		kobject_put(bcache_kobj);
 	if (bcache_wq)
@@ -2907,6 +2909,7 @@  static int __init bcache_init(void)
 
 	bch_debug_init();
 	closure_debug_init();
+	bch_nvmpg_init();
 
 	bcache_is_reboot = false;