diff mbox series

[05/14] bcache: initialization of the buddy

Message ID 20210615054921.101421-6-colyli@suse.de (mailing list archive)
State New, archived
Headers show
Series bcache patches for Linux v5.14 | expand

Commit Message

Coly Li June 15, 2021, 5:49 a.m. UTC
From: Jianpeng Ma <jianpeng.ma@intel.com>

The nvm pages allocator implements a simple buddy allocator to manage the
nvm address space. This patch initializes the buddy allocator for a new
namespace.

The unit of alloc/free of the buddy allocator is a page. DAX devices have
their own struct page (in DRAM or PMEM).

        struct {        /* ZONE_DEVICE pages */
                /** @pgmap: Points to the hosting device page map. */
                struct dev_pagemap *pgmap;
                void *zone_device_data;
                /*
                 * ZONE_DEVICE private pages are counted as being
                 * mapped so the next 3 words hold the mapping, index,
                 * and private fields from the source anonymous or
                 * page cache page while the page is migrated to device
                 * private memory.
                 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
                 * use the mapping, index, and private fields when
                 * pmem backed DAX files are mapped.
                 */
        };

ZONE_DEVICE pages only use pgmap; the other 4 words (16/32 bytes) are unused.
So the second and third words are used as a 'struct list_head' that links the
page into the buddy lists. The fourth word (normally struct page::index)
stores pgoff, which is the page offset in the DAX device. The fifth word
(normally struct page::private) stores the buddy order. page_type is used
to store the buddy flags.

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Signed-off-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/nvm-pages.c   | 156 +++++++++++++++++++++++++++++++-
 drivers/md/bcache/nvm-pages.h   |   6 ++
 include/uapi/linux/bcache-nvm.h |  10 +-
 3 files changed, 165 insertions(+), 7 deletions(-)

Comments

Hannes Reinecke June 22, 2021, 10:45 a.m. UTC | #1
On 6/15/21 7:49 AM, Coly Li wrote:
> From: Jianpeng Ma <jianpeng.ma@intel.com>
> 
> This nvm pages allocator will implement the simple buddy to manage the
> nvm address space. This patch initializes this buddy for new namespace.
> 
Please use 'buddy allocator' instead of just 'buddy'.

> the unit of alloc/free of the buddy is page. DAX device has their
> struct page(in dram or PMEM).
> 
>         struct {        /* ZONE_DEVICE pages */
>                 /** @pgmap: Points to the hosting device page map. */
>                 struct dev_pagemap *pgmap;
>                 void *zone_device_data;
>                 /*
>                  * ZONE_DEVICE private pages are counted as being
>                  * mapped so the next 3 words hold the mapping, index,
>                  * and private fields from the source anonymous or
>                  * page cache page while the page is migrated to device
>                  * private memory.
>                  * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
>                  * use the mapping, index, and private fields when
>                  * pmem backed DAX files are mapped.
>                  */
>         };
> 
> ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use.
> So the second/third word will be used as 'struct list_head ' which list
> in buddy. The fourth word(that is normal struct page::index) store pgoff
> which the page-offset in the dax device. And the fifth word (that is
> normal struct page::private) store order of buddy. page_type will be used
> to store buddy flags.
> 
> Reported-by: kernel test robot <lkp@intel.com>
> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
> Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
> Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
> Signed-off-by: Coly Li <colyli@suse.de>
> ---
>  drivers/md/bcache/nvm-pages.c   | 156 +++++++++++++++++++++++++++++++-
>  drivers/md/bcache/nvm-pages.h   |   6 ++
>  include/uapi/linux/bcache-nvm.h |  10 +-
>  3 files changed, 165 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
> index 18fdadbc502f..804ee66e97be 100644
> --- a/drivers/md/bcache/nvm-pages.c
> +++ b/drivers/md/bcache/nvm-pages.c
> @@ -34,6 +34,10 @@ static void release_nvm_namespaces(struct bch_nvm_set *nvm_set)
>  	for (i = 0; i < nvm_set->total_namespaces_nr; i++) {
>  		ns = nvm_set->nss[i];
>  		if (ns) {
> +			kvfree(ns->pages_bitmap);
> +			if (ns->pgalloc_recs_bitmap)
> +				bitmap_free(ns->pgalloc_recs_bitmap);
> +
>  			blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
>  			kfree(ns);
>  		}
> @@ -48,17 +52,130 @@ static void release_nvm_set(struct bch_nvm_set *nvm_set)
>  	kfree(nvm_set);
>  }
>  
> +static struct page *nvm_vaddr_to_page(struct bch_nvm_namespace *ns, void *addr)
> +{
> +	return virt_to_page(addr);
> +}
> +
> +static void *nvm_pgoff_to_vaddr(struct bch_nvm_namespace *ns, pgoff_t pgoff)
> +{
> +	return ns->kaddr + (pgoff << PAGE_SHIFT);
> +}
> +
> +static inline void remove_owner_space(struct bch_nvm_namespace *ns,
> +					pgoff_t pgoff, u64 nr)
> +{
> +	while (nr > 0) {
> +		unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
> +
> +		bitmap_set(ns->pages_bitmap, pgoff, num);
> +		nr -= num;
> +		pgoff += num;
> +	}
> +}
> +
> +#define BCH_PGOFF_TO_KVADDR(pgoff) ((void *)((unsigned long)pgoff << PAGE_SHIFT))
> +
>  static int init_owner_info(struct bch_nvm_namespace *ns)
>  {
>  	struct bch_owner_list_head *owner_list_head = ns->sb->owner_list_head;
> +	struct bch_nvm_pgalloc_recs *sys_recs;
> +	int i, j, k, rc = 0;
>  
>  	mutex_lock(&only_set->lock);
>  	only_set->owner_list_head = owner_list_head;
>  	only_set->owner_list_size = owner_list_head->size;
>  	only_set->owner_list_used = owner_list_head->used;
> +
> +	/* remove used space */
> +	remove_owner_space(ns, 0, div_u64(ns->pages_offset, ns->page_size));
> +
> +	sys_recs = ns->kaddr + BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET;
> +	/* suppose no hole in array */
> +	for (i = 0; i < owner_list_head->used; i++) {
> +		struct bch_nvm_pages_owner_head *head = &owner_list_head->heads[i];
> +
> +		for (j = 0; j < BCH_NVM_PAGES_NAMESPACES_MAX; j++) {
> +			struct bch_nvm_pgalloc_recs *pgalloc_recs = head->recs[j];
> +			unsigned long offset = (unsigned long)ns->kaddr >> PAGE_SHIFT;
> +			struct page *page;
> +
> +			while (pgalloc_recs) {
> +				u32 pgalloc_recs_pos = (unsigned int)(pgalloc_recs - sys_recs);
> +
> +				if (memcmp(pgalloc_recs->magic, bch_nvm_pages_pgalloc_magic, 16)) {
> +					pr_info("invalid bch_nvm_pages_pgalloc_magic\n");
> +					rc = -EINVAL;
> +					goto unlock;
> +				}
> +				if (memcmp(pgalloc_recs->owner_uuid, head->uuid, 16)) {
> +					pr_info("invalid owner_uuid in bch_nvm_pgalloc_recs\n");
> +					rc = -EINVAL;
> +					goto unlock;
> +				}
> +				if (pgalloc_recs->owner != head) {
> +					pr_info("invalid owner in bch_nvm_pgalloc_recs\n");
> +					rc = -EINVAL;
> +					goto unlock;
> +				}
> +
> +				/* recs array can has hole */

can have holes ?

> +				for (k = 0; k < pgalloc_recs->size; k++) {
> +					struct bch_pgalloc_rec *rec = &pgalloc_recs->recs[k];
> +
> +					if (rec->pgoff) {
> +						BUG_ON(rec->pgoff <= offset);
> +
> +						/* init struct page: index/private */
> +						page = nvm_vaddr_to_page(ns,
> +							BCH_PGOFF_TO_KVADDR(rec->pgoff));
> +
> +						set_page_private(page, rec->order);
> +						page->index = rec->pgoff - offset;
> +
> +						remove_owner_space(ns,
> +							rec->pgoff - offset,
> +							1L << rec->order);
> +					}
> +				}
> +				bitmap_set(ns->pgalloc_recs_bitmap, pgalloc_recs_pos, 1);
> +				pgalloc_recs = pgalloc_recs->next;
> +			}
> +		}
> +	}
> +unlock:
>  	mutex_unlock(&only_set->lock);
>  
> -	return 0;
> +	return rc;
> +}
> +
> +static void init_nvm_free_space(struct bch_nvm_namespace *ns)
> +{
> +	unsigned int start, end, pages;
> +	int i;
> +	struct page *page;
> +	pgoff_t pgoff_start;
> +
> +	bitmap_for_each_clear_region(ns->pages_bitmap, start, end, 0, ns->pages_total) {
> +		pgoff_start = start;
> +		pages = end - start;
> +
> +		while (pages) {
> +			for (i = BCH_MAX_ORDER - 1; i >= 0 ; i--) {
> +				if ((pgoff_start % (1L << i) == 0) && (pages >= (1L << i)))
> +					break;
> +			}
> +
> +			page = nvm_vaddr_to_page(ns, nvm_pgoff_to_vaddr(ns, pgoff_start));
> +			page->index = pgoff_start;
> +			set_page_private(page, i);
> +			__SetPageBuddy(page);
> +			list_add((struct list_head *)&page->zone_device_data, &ns->free_area[i]);
> +
> +			pgoff_start += 1L << i;
> +			pages -= 1L << i;
> +		}
> +	}
>  }
>  
>  static int attach_nvm_set(struct bch_nvm_namespace *ns)
> @@ -165,7 +282,7 @@ static int read_nvdimm_meta_super(struct block_device *bdev,
>  struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
>  {
>  	struct bch_nvm_namespace *ns;
> -	int err;
> +	int i, err;
>  	pgoff_t pgoff;
>  	char buf[BDEVNAME_SIZE];
>  	struct block_device *bdev;
> @@ -249,18 +366,49 @@ struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
>  	ns->nvm_set = only_set;
>  	mutex_init(&ns->lock);
>  
> +	/*
> +	 * parameters of bitmap_set/clear are unsigned int.
> +	 * Given currently size of nvm is far from exceeding this limit,
> +	 * so only add a WARN_ON message.
> +	 */
> +	WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX);
> +	ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total),
> +					sizeof(unsigned long), GFP_KERNEL);
> +	if (!ns->pages_bitmap) {
> +		err = -ENOMEM;
> +		goto clear_ns_nr;
> +	}
> +
> +	if (ns->sb->this_namespace_nr == 0) {
> +		ns->pgalloc_recs_bitmap = bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL);
> +		if (ns->pgalloc_recs_bitmap == NULL) {
> +			err = -ENOMEM;
> +			goto free_pages_bitmap;
> +		}
> +	}
> +
> +	for (i = 0; i < BCH_MAX_ORDER; i++)
> +		INIT_LIST_HEAD(&ns->free_area[i]);
> +
>  	if (ns->sb->this_namespace_nr == 0) {
>  		pr_info("only first namespace contain owner info\n");
>  		err = init_owner_info(ns);
>  		if (err < 0) {
>  			pr_info("init_owner_info met error %d\n", err);
> -			only_set->nss[ns->sb->this_namespace_nr] = NULL;
> -			goto free_ns;
> +			goto free_recs_bitmap;
>  		}
> +		/* init buddy allocator */
> +		init_nvm_free_space(ns);
>  	}
>  
>  	kfree(path);
>  	return ns;
> +free_recs_bitmap:
> +	bitmap_free(ns->pgalloc_recs_bitmap);
> +free_pages_bitmap:
> +	kvfree(ns->pages_bitmap);
> +clear_ns_nr:
> +	only_set->nss[ns->sb->this_namespace_nr] = NULL;
>  free_ns:
>  	kfree(ns);
>  bdput:
> diff --git a/drivers/md/bcache/nvm-pages.h b/drivers/md/bcache/nvm-pages.h
> index 3e24c4dee7fd..71beb244b9be 100644
> --- a/drivers/md/bcache/nvm-pages.h
> +++ b/drivers/md/bcache/nvm-pages.h
> @@ -16,6 +16,7 @@
>   * to which owner. After reboot from power failure, they will be initialized
>   * based on nvm pages superblock in NVDIMM device.
>   */
> +#define BCH_MAX_ORDER 20
>  struct bch_nvm_namespace {
>  	struct bch_nvm_pages_sb *sb;
>  	void *kaddr;
> @@ -27,6 +28,11 @@ struct bch_nvm_namespace {
>  	u64 pages_total;
>  	pfn_t start_pfn;
>  
> +	unsigned long *pages_bitmap;
> +	struct list_head free_area[BCH_MAX_ORDER];
> +
> +	unsigned long *pgalloc_recs_bitmap;
> +
>  	struct dax_device *dax_dev;
>  	struct block_device *bdev;
>  	struct bch_nvm_set *nvm_set;
> diff --git a/include/uapi/linux/bcache-nvm.h b/include/uapi/linux/bcache-nvm.h
> index 5094a6797679..1fdb3eaabf7e 100644
> --- a/include/uapi/linux/bcache-nvm.h
> +++ b/include/uapi/linux/bcache-nvm.h
> @@ -130,11 +130,15 @@ union {
>  };
>  };
>  
> -#define BCH_MAX_RECS					\
> -	((sizeof(struct bch_nvm_pgalloc_recs) -		\
> -	 offsetof(struct bch_nvm_pgalloc_recs, recs)) /	\
> +#define BCH_MAX_RECS							\
> +	((sizeof(struct bch_nvm_pgalloc_recs) -				\
> +	 offsetof(struct bch_nvm_pgalloc_recs, recs)) /			\
>  	 sizeof(struct bch_pgalloc_rec))
>  
> +#define BCH_MAX_PGALLOC_RECS						\
> +	((BCH_NVM_PAGES_OFFSET - BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET) /	\
> +	 sizeof(struct bch_nvm_pgalloc_recs))
> +
>  struct bch_nvm_pages_owner_head {
>  	unsigned char			uuid[16];
>  	unsigned char			label[BCH_NVM_PAGES_LABEL_SIZE];
> 
Cheers,

Hannes
Coly Li June 23, 2021, 5:35 a.m. UTC | #2
On 6/22/21 6:45 PM, Hannes Reinecke wrote:
> On 6/15/21 7:49 AM, Coly Li wrote:
>> From: Jianpeng Ma <jianpeng.ma@intel.com>
>>
>> This nvm pages allocator will implement the simple buddy to manage the
>> nvm address space. This patch initializes this buddy for new namespace.
>>
> Please use 'buddy allocator' instead of just 'buddy'.

Will update in next post.


>
>> the unit of alloc/free of the buddy is page. DAX device has their
>> struct page(in dram or PMEM).
>>
>>         struct {        /* ZONE_DEVICE pages */
>>                 /** @pgmap: Points to the hosting device page map. */
>>                 struct dev_pagemap *pgmap;
>>                 void *zone_device_data;
>>                 /*
>>                  * ZONE_DEVICE private pages are counted as being
>>                  * mapped so the next 3 words hold the mapping, index,
>>                  * and private fields from the source anonymous or
>>                  * page cache page while the page is migrated to device
>>                  * private memory.
>>                  * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
>>                  * use the mapping, index, and private fields when
>>                  * pmem backed DAX files are mapped.
>>                  */
>>         };
>>
>> ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use.
>> So the second/third word will be used as 'struct list_head ' which list
>> in buddy. The fourth word(that is normal struct page::index) store pgoff
>> which the page-offset in the dax device. And the fifth word (that is
>> normal struct page::private) store order of buddy. page_type will be used
>> to store buddy flags.
>>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>> Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
>> Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com>
>> Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
>> Signed-off-by: Coly Li <colyli@suse.de>
>> ---
>>  drivers/md/bcache/nvm-pages.c   | 156 +++++++++++++++++++++++++++++++-
>>  drivers/md/bcache/nvm-pages.h   |   6 ++
>>  include/uapi/linux/bcache-nvm.h |  10 +-
>>  3 files changed, 165 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
>> index 18fdadbc502f..804ee66e97be 100644
>> --- a/drivers/md/bcache/nvm-pages.c
>> +++ b/drivers/md/bcache/nvm-pages.c
>> @@ -34,6 +34,10 @@ static void release_nvm_namespaces(struct bch_nvm_set *nvm_set)
>>  	for (i = 0; i < nvm_set->total_namespaces_nr; i++) {
>>  		ns = nvm_set->nss[i];
>>  		if (ns) {
>> +			kvfree(ns->pages_bitmap);
>> +			if (ns->pgalloc_recs_bitmap)
>> +				bitmap_free(ns->pgalloc_recs_bitmap);
>> +
>>  			blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
>>  			kfree(ns);
>>  		}
>> @@ -48,17 +52,130 @@ static void release_nvm_set(struct bch_nvm_set *nvm_set)
>>  	kfree(nvm_set);
>>  }
>>  
>> +static struct page *nvm_vaddr_to_page(struct bch_nvm_namespace *ns, void *addr)
>> +{
>> +	return virt_to_page(addr);
>> +}
>> +
>> +static void *nvm_pgoff_to_vaddr(struct bch_nvm_namespace *ns, pgoff_t pgoff)
>> +{
>> +	return ns->kaddr + (pgoff << PAGE_SHIFT);
>> +}
>> +
>> +static inline void remove_owner_space(struct bch_nvm_namespace *ns,
>> +					pgoff_t pgoff, u64 nr)
>> +{
>> +	while (nr > 0) {
>> +		unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
>> +
>> +		bitmap_set(ns->pages_bitmap, pgoff, num);
>> +		nr -= num;
>> +		pgoff += num;
>> +	}
>> +}
>> +
>> +#define BCH_PGOFF_TO_KVADDR(pgoff) ((void *)((unsigned long)pgoff << PAGE_SHIFT))
>> +
>>  static int init_owner_info(struct bch_nvm_namespace *ns)
>>  {
>>  	struct bch_owner_list_head *owner_list_head = ns->sb->owner_list_head;
>> +	struct bch_nvm_pgalloc_recs *sys_recs;
>> +	int i, j, k, rc = 0;
>>  
>>  	mutex_lock(&only_set->lock);
>>  	only_set->owner_list_head = owner_list_head;
>>  	only_set->owner_list_size = owner_list_head->size;
>>  	only_set->owner_list_used = owner_list_head->used;
>> +
>> +	/* remove used space */
>> +	remove_owner_space(ns, 0, div_u64(ns->pages_offset, ns->page_size));
>> +
>> +	sys_recs = ns->kaddr + BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET;
>> +	/* suppose no hole in array */
>> +	for (i = 0; i < owner_list_head->used; i++) {
>> +		struct bch_nvm_pages_owner_head *head = &owner_list_head->heads[i];
>> +
>> +		for (j = 0; j < BCH_NVM_PAGES_NAMESPACES_MAX; j++) {
>> +			struct bch_nvm_pgalloc_recs *pgalloc_recs = head->recs[j];
>> +			unsigned long offset = (unsigned long)ns->kaddr >> PAGE_SHIFT;
>> +			struct page *page;
>> +
>> +			while (pgalloc_recs) {
>> +				u32 pgalloc_recs_pos = (unsigned int)(pgalloc_recs - sys_recs);
>> +
>> +				if (memcmp(pgalloc_recs->magic, bch_nvm_pages_pgalloc_magic, 16)) {
>> +					pr_info("invalid bch_nvm_pages_pgalloc_magic\n");
>> +					rc = -EINVAL;
>> +					goto unlock;
>> +				}
>> +				if (memcmp(pgalloc_recs->owner_uuid, head->uuid, 16)) {
>> +					pr_info("invalid owner_uuid in bch_nvm_pgalloc_recs\n");
>> +					rc = -EINVAL;
>> +					goto unlock;
>> +				}
>> +				if (pgalloc_recs->owner != head) {
>> +					pr_info("invalid owner in bch_nvm_pgalloc_recs\n");
>> +					rc = -EINVAL;
>> +					goto unlock;
>> +				}
>> +
>> +				/* recs array can has hole */
> can have holes ?

It means the valid records are not always stored contiguously in recs[]
of struct bch_nvm_pgalloc_recs, because currently only an 8-byte write
to an 8-byte aligned address on NVDIMM is atomic with respect to power failure.

When a record is removed from the recs[] array because a block of NVDIMM
pages is freed, moving the following valid records forward to keep all
records contiguous would not be atomic with respect to power failure.
We would then need to design a more complicated method to keep the
metadata consistent across a power failure.

Allowing holes (records can be stored non-contiguously in the recs[] array)
makes things much simpler here.

Thanks for your review.

Coly Li
Pavel Goran June 23, 2021, 5:46 a.m. UTC | #3
Hello Coly,

Wednesday, June 23, 2021, 12:35:21 PM, you wrote:

> ... (skipped a lot)
>>> +                            /* recs array can has hole */
>> can have holes ?

> It means the valid record is not always continuously stored in recs[]
> from struct bch_nvm_pgalloc_recs. Because currently only 8 bytes write
> to a 8 bytes aligned address on NVDIMM is stomic for power failure.

> ...

The issue is with the wording of this comment, not with the code or the
meaning of the comment.

The comment should read "recs array can have hole".

> Coly Li

Pavel Goran
Coly Li June 23, 2021, 6:03 a.m. UTC | #4
On 6/23/21 1:46 PM, Pavel Goran wrote:
> Hello Coly,
>
> Wednesday, June 23, 2021, 12:35:21 PM, you wrote:
>
>> ... (skipped a lot)
>>>> +                            /* recs array can has hole */
>>> can have holes ?
>> It means the valid record is not always continuously stored in recs[]
>> from struct bch_nvm_pgalloc_recs. Because currently only 8 bytes write
>> to a 8 bytes aligned address on NVDIMM is stomic for power failure.
>> ...
> The issue is with the wording of this comment, not with the code or the
> meaning of the comment.
>
> The comment should read "recs array can have hole".

Oh, I see. Thank Pavel for the hint :-) We will update it in next post.

Coly Li
diff mbox series

Patch

diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
index 18fdadbc502f..804ee66e97be 100644
--- a/drivers/md/bcache/nvm-pages.c
+++ b/drivers/md/bcache/nvm-pages.c
@@ -34,6 +34,10 @@  static void release_nvm_namespaces(struct bch_nvm_set *nvm_set)
 	for (i = 0; i < nvm_set->total_namespaces_nr; i++) {
 		ns = nvm_set->nss[i];
 		if (ns) {
+			kvfree(ns->pages_bitmap);
+			if (ns->pgalloc_recs_bitmap)
+				bitmap_free(ns->pgalloc_recs_bitmap);
+
 			blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
 			kfree(ns);
 		}
@@ -48,17 +52,130 @@  static void release_nvm_set(struct bch_nvm_set *nvm_set)
 	kfree(nvm_set);
 }
 
+static struct page *nvm_vaddr_to_page(struct bch_nvm_namespace *ns, void *addr)
+{
+	return virt_to_page(addr);
+}
+
+static void *nvm_pgoff_to_vaddr(struct bch_nvm_namespace *ns, pgoff_t pgoff)
+{
+	return ns->kaddr + (pgoff << PAGE_SHIFT);
+}
+
+static inline void remove_owner_space(struct bch_nvm_namespace *ns,
+					pgoff_t pgoff, u64 nr)
+{
+	while (nr > 0) {
+		unsigned int num = nr > UINT_MAX ? UINT_MAX : nr;
+
+		bitmap_set(ns->pages_bitmap, pgoff, num);
+		nr -= num;
+		pgoff += num;
+	}
+}
+
+#define BCH_PGOFF_TO_KVADDR(pgoff) ((void *)((unsigned long)pgoff << PAGE_SHIFT))
+
 static int init_owner_info(struct bch_nvm_namespace *ns)
 {
 	struct bch_owner_list_head *owner_list_head = ns->sb->owner_list_head;
+	struct bch_nvm_pgalloc_recs *sys_recs;
+	int i, j, k, rc = 0;
 
 	mutex_lock(&only_set->lock);
 	only_set->owner_list_head = owner_list_head;
 	only_set->owner_list_size = owner_list_head->size;
 	only_set->owner_list_used = owner_list_head->used;
+
+	/* remove used space */
+	remove_owner_space(ns, 0, div_u64(ns->pages_offset, ns->page_size));
+
+	sys_recs = ns->kaddr + BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET;
+	/* suppose no hole in array */
+	for (i = 0; i < owner_list_head->used; i++) {
+		struct bch_nvm_pages_owner_head *head = &owner_list_head->heads[i];
+
+		for (j = 0; j < BCH_NVM_PAGES_NAMESPACES_MAX; j++) {
+			struct bch_nvm_pgalloc_recs *pgalloc_recs = head->recs[j];
+			unsigned long offset = (unsigned long)ns->kaddr >> PAGE_SHIFT;
+			struct page *page;
+
+			while (pgalloc_recs) {
+				u32 pgalloc_recs_pos = (unsigned int)(pgalloc_recs - sys_recs);
+
+				if (memcmp(pgalloc_recs->magic, bch_nvm_pages_pgalloc_magic, 16)) {
+					pr_info("invalid bch_nvm_pages_pgalloc_magic\n");
+					rc = -EINVAL;
+					goto unlock;
+				}
+				if (memcmp(pgalloc_recs->owner_uuid, head->uuid, 16)) {
+					pr_info("invalid owner_uuid in bch_nvm_pgalloc_recs\n");
+					rc = -EINVAL;
+					goto unlock;
+				}
+				if (pgalloc_recs->owner != head) {
+					pr_info("invalid owner in bch_nvm_pgalloc_recs\n");
+					rc = -EINVAL;
+					goto unlock;
+				}
+
+				/* recs array can have holes */
+				for (k = 0; k < pgalloc_recs->size; k++) {
+					struct bch_pgalloc_rec *rec = &pgalloc_recs->recs[k];
+
+					if (rec->pgoff) {
+						BUG_ON(rec->pgoff <= offset);
+
+						/* init struct page: index/private */
+						page = nvm_vaddr_to_page(ns,
+							BCH_PGOFF_TO_KVADDR(rec->pgoff));
+
+						set_page_private(page, rec->order);
+						page->index = rec->pgoff - offset;
+
+						remove_owner_space(ns,
+							rec->pgoff - offset,
+							1L << rec->order);
+					}
+				}
+				bitmap_set(ns->pgalloc_recs_bitmap, pgalloc_recs_pos, 1);
+				pgalloc_recs = pgalloc_recs->next;
+			}
+		}
+	}
+unlock:
 	mutex_unlock(&only_set->lock);
 
-	return 0;
+	return rc;
+}
+
+static void init_nvm_free_space(struct bch_nvm_namespace *ns)
+{
+	unsigned int start, end, pages;
+	int i;
+	struct page *page;
+	pgoff_t pgoff_start;
+
+	bitmap_for_each_clear_region(ns->pages_bitmap, start, end, 0, ns->pages_total) {
+		pgoff_start = start;
+		pages = end - start;
+
+		while (pages) {
+			for (i = BCH_MAX_ORDER - 1; i >= 0 ; i--) {
+				if ((pgoff_start % (1L << i) == 0) && (pages >= (1L << i)))
+					break;
+			}
+
+			page = nvm_vaddr_to_page(ns, nvm_pgoff_to_vaddr(ns, pgoff_start));
+			page->index = pgoff_start;
+			set_page_private(page, i);
+			__SetPageBuddy(page);
+			list_add((struct list_head *)&page->zone_device_data, &ns->free_area[i]);
+
+			pgoff_start += 1L << i;
+			pages -= 1L << i;
+		}
+	}
 }
 
 static int attach_nvm_set(struct bch_nvm_namespace *ns)
@@ -165,7 +282,7 @@  static int read_nvdimm_meta_super(struct block_device *bdev,
 struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
 {
 	struct bch_nvm_namespace *ns;
-	int err;
+	int i, err;
 	pgoff_t pgoff;
 	char buf[BDEVNAME_SIZE];
 	struct block_device *bdev;
@@ -249,18 +366,49 @@  struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
 	ns->nvm_set = only_set;
 	mutex_init(&ns->lock);
 
+	/*
+	 * parameters of bitmap_set/clear are unsigned int.
+	 * Given currently size of nvm is far from exceeding this limit,
+	 * so only add a WARN_ON message.
+	 */
+	WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX);
+	ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total),
+					sizeof(unsigned long), GFP_KERNEL);
+	if (!ns->pages_bitmap) {
+		err = -ENOMEM;
+		goto clear_ns_nr;
+	}
+
+	if (ns->sb->this_namespace_nr == 0) {
+		ns->pgalloc_recs_bitmap = bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL);
+		if (ns->pgalloc_recs_bitmap == NULL) {
+			err = -ENOMEM;
+			goto free_pages_bitmap;
+		}
+	}
+
+	for (i = 0; i < BCH_MAX_ORDER; i++)
+		INIT_LIST_HEAD(&ns->free_area[i]);
+
 	if (ns->sb->this_namespace_nr == 0) {
 		pr_info("only first namespace contain owner info\n");
 		err = init_owner_info(ns);
 		if (err < 0) {
 			pr_info("init_owner_info met error %d\n", err);
-			only_set->nss[ns->sb->this_namespace_nr] = NULL;
-			goto free_ns;
+			goto free_recs_bitmap;
 		}
+		/* init buddy allocator */
+		init_nvm_free_space(ns);
 	}
 
 	kfree(path);
 	return ns;
+free_recs_bitmap:
+	bitmap_free(ns->pgalloc_recs_bitmap);
+free_pages_bitmap:
+	kvfree(ns->pages_bitmap);
+clear_ns_nr:
+	only_set->nss[ns->sb->this_namespace_nr] = NULL;
 free_ns:
 	kfree(ns);
 bdput:
diff --git a/drivers/md/bcache/nvm-pages.h b/drivers/md/bcache/nvm-pages.h
index 3e24c4dee7fd..71beb244b9be 100644
--- a/drivers/md/bcache/nvm-pages.h
+++ b/drivers/md/bcache/nvm-pages.h
@@ -16,6 +16,7 @@ 
  * to which owner. After reboot from power failure, they will be initialized
  * based on nvm pages superblock in NVDIMM device.
  */
+#define BCH_MAX_ORDER 20
 struct bch_nvm_namespace {
 	struct bch_nvm_pages_sb *sb;
 	void *kaddr;
@@ -27,6 +28,11 @@  struct bch_nvm_namespace {
 	u64 pages_total;
 	pfn_t start_pfn;
 
+	unsigned long *pages_bitmap;
+	struct list_head free_area[BCH_MAX_ORDER];
+
+	unsigned long *pgalloc_recs_bitmap;
+
 	struct dax_device *dax_dev;
 	struct block_device *bdev;
 	struct bch_nvm_set *nvm_set;
diff --git a/include/uapi/linux/bcache-nvm.h b/include/uapi/linux/bcache-nvm.h
index 5094a6797679..1fdb3eaabf7e 100644
--- a/include/uapi/linux/bcache-nvm.h
+++ b/include/uapi/linux/bcache-nvm.h
@@ -130,11 +130,15 @@  union {
 };
 };
 
-#define BCH_MAX_RECS					\
-	((sizeof(struct bch_nvm_pgalloc_recs) -		\
-	 offsetof(struct bch_nvm_pgalloc_recs, recs)) /	\
+#define BCH_MAX_RECS							\
+	((sizeof(struct bch_nvm_pgalloc_recs) -				\
+	 offsetof(struct bch_nvm_pgalloc_recs, recs)) /			\
 	 sizeof(struct bch_pgalloc_rec))
 
+#define BCH_MAX_PGALLOC_RECS						\
+	((BCH_NVM_PAGES_OFFSET - BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET) /	\
+	 sizeof(struct bch_nvm_pgalloc_recs))
+
 struct bch_nvm_pages_owner_head {
 	unsigned char			uuid[16];
 	unsigned char			label[BCH_NVM_PAGES_LABEL_SIZE];