
[RFC,4/4] device-dax: Add a block device persistent type, BLK, for DAX KMEM

Message ID Yulo96W5ofaJranB@memverge.com (mailing list archive)
State New
Series Allow persistent data on DAX device being used as KMEM

Commit Message

Srinivas Aji Aug. 2, 2022, 6:12 p.m. UTC
When a DAX KMEM device is formatted as type BLK, adding the DAX memory
exposes a block device /dev/kmem<numa_node>. A filesystem can be created
on this block device. Blocks which contain data are unavailable for use as
system memory, but blocks which are freed up using DISCARD become free for
system use.

The implementation uses an array which maps the logical block number
to the real block offset in the DAX device. This allows us to keep
block device semantics even though allocations can return any page.

Signed-off-by: Srinivas Aji <srinivas.aji@memverge.com>
---
 drivers/dax/Makefile       |   1 +
 drivers/dax/kmem.c         |   4 +-
 drivers/dax/kmem_blk.c     | 573 +++++++++++++++++++++++++++++++++++++
 drivers/dax/kmem_persist.h |   4 +
 4 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dax/kmem_blk.c

Comments

Fabio M. De Francesco Aug. 3, 2022, 9:19 p.m. UTC | #1
On Tuesday, 2 August 2022 20:12:07 CEST Srinivas Aji wrote:
> When a DAX KMEM device is formatted as type BLK, adding the DAX memory
> exposes a block device /dev/kmem<numa_node>. A filesystem can be created
> on this block device. Blocks which contain data are unavailable for use as
> system memory, but blocks which are freed up using DISCARD become free for
> system use.
> 
> The implementation uses an array which maps the logical block number
> to the real block offset in the DAX device. This allows us to keep
> block device semantics even though allocations can return any page.
> 
> Signed-off-by: Srinivas Aji <srinivas.aji@memverge.com>
> ---
>  drivers/dax/Makefile       |   1 +
>  drivers/dax/kmem.c         |   4 +-
>  drivers/dax/kmem_blk.c     | 573 +++++++++++++++++++++++++++++++++++++
>  drivers/dax/kmem_persist.h |   4 +
>  4 files changed, 581 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/dax/kmem_blk.c

From a quick look at this code I see a mix of kmap(), kmap_atomic() and 
kmap_local_page(). Actually it's not obvious to me why you are still using 
those kmap() and kmap_atomic() functions since they are being deprecated 
and shouldn't be used in new code.

Furthermore, there's an ongoing effort towards replacing kmap() and
kmap_atomic() with kmap_local_page(). This implies that this code, sooner or
later, will be refactored to get rid of those two functions.

Please take a look at highmem.rst, which was updated in mainline a couple of
months ago; a second round of changes is still in Andrew Morton's
"mm-unstable" branch.

There are two main problems with kmap(): (1) It comes with an overhead as
mapping space is restricted and protected by a global lock for
synchronization and (2) it also requires global TLB invalidation when the
kmap’s pool wraps and it might block when the mapping space is fully
utilized until a slot becomes available.

With kmap_local_page() the mappings are per thread, CPU local, can take
page faults, and can be called from any context (including interrupts).
It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore,
the tasks can be preempted and, when they are scheduled to run again, the
kernel virtual addresses are restored and are still valid.

As said, since kmap_local_page() can also be called from atomic context,
and since the code shouldn't ever rely on an implicit preempt_disable(),
this function can also safely replace kmap_atomic().

I haven't looked closely at your code because I don't have the necessary
domain knowledge and experience to comment on design and implementation
details.

However, as far as the mappings are concerned, it looks like this code can be
converted to use only kmap_local_page() and its related helpers, even if it
is immediately clear that some parts need refactoring so as not to break the
rules of local mapping/unmapping, especially when nesting several
kmap_local_page() calls in loops.
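
For example, a direct substitution in the copy helpers (only a sketch, not
compile-tested against this series) would look like:

	void *dst;

	dst = kmap_local_page(page);
	memcpy(dst + offset, src, copy);
	kunmap_local(dst);

kunmap_local() takes the address returned by kmap_local_page(), so as long
as the mappings stay properly nested the conversion is mostly mechanical.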

> diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
> index 90a56ca3b345..d0a97f4af4ea 100644
> --- a/drivers/dax/Makefile
> +++ b/drivers/dax/Makefile
> @@ -3,6 +3,7 @@ obj-$(CONFIG_DAX) += dax.o
>  obj-$(CONFIG_DEV_DAX) += device_dax.o
>  obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
>  obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
> +obj-$(CONFIG_DEV_DAX_KMEM_PERSIST) += kmem_blk.o
>  
>  dax-y := super.o
>  dax-y += bus.o
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 0ca6e14f7e73..0fa45d1ba9cc 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -534,8 +534,10 @@ static int __init dax_kmem_init(void)
>  	if (rc)
>  		kfree_const(kmem_name);
>  #ifdef CONFIG_DEV_DAX_KMEM_PERSIST
> -	if (rc == 0)
> +	if (rc == 0) {
>  		kmem_persist_type_register(&kmem_persist_none_ops);
> +		kmem_persist_type_register(&kmem_persist_blk_ops);
> +	}
>  #endif
>  	return rc;
>  }
> diff --git a/drivers/dax/kmem_blk.c b/drivers/dax/kmem_blk.c
> new file mode 100644
> index 000000000000..856b35713999
> --- /dev/null
> +++ b/drivers/dax/kmem_blk.c
> @@ -0,0 +1,573 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2022 MemVerge. All rights reserved. */
> +#include <linux/module.h>
> +#include <linux/major.h>
> +#include <linux/blkdev.h>
> +#include <linux/bio.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/bitops.h>
> +#include <linux/slab.h>
> +#include "dax-private.h"
> +#include "kmem_persist.h"
> +
> +static const unsigned int index_entries_per_page = (PAGE_SIZE / sizeof(u64));
> +
> +struct kmem_blk_super {
> +	struct kmem_persist_superblock header;
> +	u64 num_index_pages;
> +	u64 num_index_entries;
> +} __packed;
> +
> +struct kmem_blk_data {
> +	struct dev_dax *dev_dax;
> +	struct gendisk *disk;
> +	spinlock_t index_lock;
> +	struct kmem_blk_super *super;
> +	unsigned long num_index_pages;
> +	u64 *index_page[];
> +};
> +
> +// TODO: Make sure locking is sound for concurrent multiple IOs,
> +// i.e. writes and discards.
> +
> +static struct page *kmem_blk_get_page(struct kmem_blk_data *data,
> +				sector_t sector)
> +{
> +	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
> +	u64 page_num;
> +
> +	spin_lock(&data->index_lock);
> +	page_num = data->index_page
> +		[i / index_entries_per_page]
> +		[i % index_entries_per_page];
> +	spin_unlock(&data->index_lock);
> +
> +	if (page_num)
> +		return dax_kmem_index_to_page(page_num, data->dev_dax);
> +	else
> +		return NULL;
> +}
> +
> +/*
> + * Allocate a backing page for the block containing @sector and record
> + * it in the index, if one is not already present. May sleep.
> + */
> +static int kmem_blk_insert_page(struct kmem_blk_data *data, sector_t sector)
> +{
> +	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
> +	struct page *page;
> +	unsigned long page_index = 0;
> +	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
> +	u64 *index_ptr =
> +		&data->index_page
> +		[i / index_entries_per_page][i % index_entries_per_page];
> +
> +	/* Check if block exists */
> +	spin_lock(&data->index_lock);
> +	page_num = *index_ptr;
> +	spin_unlock(&data->index_lock);
> +	if (page_num)
> +		return 0;
> +
> +	page = dax_kmem_alloc_page(data->dev_dax, &page_index);
> +	if (!page) {
> +		dev_err(&data->dev_dax->dev, "Cannot allocate page\n");
> +		return -1;
> +	}
> +
> +	spin_lock(&data->index_lock);
> +	if (*index_ptr != 0)
> +		__free_page(page);
> +	else
> +		*index_ptr = page_index;
> +	spin_unlock(&data->index_lock);
> +
> +	return 0;
> +}
> +
> +static int kmem_blk_discard(struct kmem_blk_data *data,
> +			sector_t sector, size_t n)
> +{
> +	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
> +	struct page *page;
> +	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
> +	u64 *index_ptr;
> +
> +	BUG_ON(sector & ((1 << PAGE_SECTORS_SHIFT) - 1));
> +	BUG_ON(n & (PAGE_SIZE - 1));
> +
> +	while (n > 0) {
> +		BUG_ON(i > data->super->num_index_entries);
> +		index_ptr =
> +			&data->index_page
> +			[i / index_entries_per_page]
> +			[i % index_entries_per_page];
> +		spin_lock(&data->index_lock);
> +		page_num = *index_ptr;
> +		if (page_num)
> +			*index_ptr = 0;
> +		spin_unlock(&data->index_lock);
> +		if (page_num) {
> +			page = dax_kmem_index_to_page(page_num, data->dev_dax);
> +			__free_page(page);
> +		}
> +		i++;
> +		n -= PAGE_SIZE;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * copy_to_kmem_blk_setup must be called before copy_to_kmem_blk. It may sleep.
> + */
> +static int copy_to_kmem_blk_setup(struct kmem_blk_data *data, sector_t sector, size_t n)
> +{
> +	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
> +	size_t copy;
> +
> +	copy = min_t(size_t, n, PAGE_SIZE - offset);
> +	if (kmem_blk_insert_page(data, sector))
> +		return -ENOSPC;
> +	if (copy < n) {
> +		sector += copy >> SECTOR_SHIFT;
> +		if (kmem_blk_insert_page(data, sector))
> +			return -ENOSPC;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Copy n bytes from src to the block device starting at sector. Does not sleep.
> + */
> +static void copy_to_kmem_blk(struct kmem_blk_data *data, const void *src,
> +			sector_t sector, size_t n)
> +{
> +	struct page *page;
> +	void *dst;
> +	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
> +	size_t copy;
> +
> +	copy = min_t(size_t, n, PAGE_SIZE - offset);
> +	page = kmem_blk_get_page(data, sector);
> +	BUG_ON(!page);
> +
> +	dst = kmap_atomic(page);
> +	memcpy(dst + offset, src, copy);
> +	kunmap_atomic(dst);

Can you please replace the above three lines with memcpy_to_page()?
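
Something like the following (an untested sketch, reusing the variables
already in scope here):

	memcpy_to_page(page, offset, src, copy);

memcpy_to_page() performs the kmap_local_page()/memcpy()/kunmap_local()
sequence internally and also flushes the dcache for the destination page.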

> +
> +	if (copy < n) {
> +		src += copy;
> +		sector += copy >> SECTOR_SHIFT;
> +		copy = n - copy;
> +		page = kmem_blk_get_page(data, sector);
> +		BUG_ON(!page);
> +
> +		dst = kmap_atomic(page);
> +		memcpy(dst, src, copy);
> +		kunmap_atomic(dst);

Same here, please.

> +	}
> +}
> +
> +/*
> + * Copy n bytes to dst from the block device starting at sector. Does not sleep.
> + */
> +static void copy_from_kmem_blk(void *dst, struct kmem_blk_data *data,
> +			sector_t sector, size_t n)
> +{
> +	struct page *page;
> +	void *src;
> +	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
> +	size_t copy;
> +
> +	copy = min_t(size_t, n, PAGE_SIZE - offset);
> +	page = kmem_blk_get_page(data, sector);
> +	if (page) {
> +		src = kmap_atomic(page);
> +		memcpy(dst, src + offset, copy);
> +		kunmap_atomic(src);

Again.
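
For the read path the counterpart is memcpy_from_page(), e.g. (again just a
sketch):

	memcpy_from_page(dst, page, offset, copy);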

> +	} else
> +		memset(dst, 0, copy);
> +
> +	if (copy < n) {
> +		dst += copy;
> +		sector += copy >> SECTOR_SHIFT;
> +		copy = n - copy;
> +		page = kmem_blk_get_page(data, sector);
> +		if (page) {
> +			src = kmap_atomic(page);
> +			memcpy(dst, src, copy);
> +			kunmap_atomic(src);

And again :-)

> +		} else
> +			memset(dst, 0, copy);
> +	}
> +}
> +
> +/*
> + * Process a single bvec of a bio.
> + */
> +static int kmem_blk_do_bvec(struct kmem_blk_data *data, struct page *page,
> +			unsigned int len, unsigned int off, unsigned int op,
> +			sector_t sector)
> +{
> +	void *mem = NULL;
> +	int err = 0;
> +
> +	if (op == REQ_OP_WRITE) {
> +		err = copy_to_kmem_blk_setup(data, sector, len);
> +		if (err)
> +			goto out;
> +	}
> +
> +	if (page)
> +		mem = kmap_atomic(page);

Are you implicitly relying on preempt_disable()? I'm not following closely;
however, if so, you shouldn't rely on it. Please use kmap_local_page() and,
if this switch and the copies really need to run in atomic context, use the
suited APIs explicitly.
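
A minimal sketch of what I mean for kmem_blk_do_bvec() (untested):

	if (page)
		mem = kmap_local_page(page);

	/* ... the switch on op stays as it is ... */

	if (mem)
		kunmap_local(mem);

If the copies really must run with preemption or page faults disabled, add
explicit preempt_disable()/preempt_enable() or
pagefault_disable()/pagefault_enable() calls around them instead of relying
on kmap_atomic()'s side effects.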

> +	switch (op) {
> +	case REQ_OP_READ:
> +		copy_from_kmem_blk(mem + off, data, sector, len);
> +		flush_dcache_page(page);
> +		break;
> +	case REQ_OP_WRITE:
> +		flush_dcache_page(page);
> +		copy_to_kmem_blk(data, mem + off, sector, len);
> +		break;
> +	case REQ_OP_DISCARD:
> +		BUG_ON(page);
> +		kmem_blk_discard(data, sector, len);
> +		break;
> +	default:
> +		BUG();
> +		break;
> +	}
> +	if (mem)
> +		kunmap_atomic(mem);
> +
> +out:
> +	return err;
> +}
> +
> +static void kmem_blk_submit_bio(struct bio *bio)
> +{
> +	struct kmem_blk_data *data = bio->bi_bdev->bd_disk->private_data;
> +	sector_t sector = bio->bi_iter.bi_sector;
> +	struct bio_vec bvec;
> +	struct bvec_iter iter;
> +
> +	/*
> +	 * DISCARD and WRITE_ZEROES come separately and don't work with
> +	 * bio_for_each_segment
> +	 */
> +	switch (bio_op(bio)) {
> +	case REQ_OP_DISCARD:
> +	case REQ_OP_WRITE_ZEROES:
> +		kmem_blk_discard(data, sector, bio->bi_iter.bi_size);
> +		bio_endio(bio);
> +		return;
> +	default:
> +		break;
> +	}
> +
> +	bio_for_each_segment(bvec, bio, iter) {
> +		unsigned int len = bvec.bv_len;
> +		int err;
> +
> +		/* Don't support un-aligned buffer */
> +		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
> +				(len & (SECTOR_SIZE - 1)));
> +		err = kmem_blk_do_bvec(data, bvec.bv_page, len, bvec.bv_offset,
> +				bio_op(bio), sector);
> +		if (err) {
> +			bio_io_error(bio);
> +			return;
> +		}
> +		sector += len >> SECTOR_SHIFT;
> +	}
> +
> +	bio_endio(bio);
> +}
> +
> +static int kmem_blk_rw_page(struct block_device *bdev, sector_t sector,
> +			struct page *page, unsigned int op)
> +{
> +	struct kmem_blk_data *data = bdev->bd_disk->private_data;
> +	int err;
> +
> +	if (PageTransHuge(page))
> +		return -EOPNOTSUPP;
> +	err = kmem_blk_do_bvec(data, page, PAGE_SIZE, 0, op, sector);
> +	page_endio(page, op_is_write(op), err);
> +	return err;
> +}
> +
> +static const struct block_device_operations kmem_blk_fops = {
> +	.owner =		THIS_MODULE,
> +	.submit_bio =		kmem_blk_submit_bio,
> +	.rw_page =		kmem_blk_rw_page,
> +};
> +
> +
> +
> +
> +
> +static int kmem_blk_disk_init(struct kmem_blk_data *data)
> +{
> +	struct gendisk *disk;
> +	int err;
> +
> +	disk = blk_alloc_disk(data->dev_dax->target_node);
> +	data->disk = disk;
> +
> +	disk->flags = GENHD_FL_NO_PART;
> +	disk->fops = &kmem_blk_fops;
> +	disk->private_data = data;
> +	snprintf(disk->disk_name, DISK_NAME_LEN, "kmem%d",
> +		data->dev_dax->target_node);
> +
> +	set_capacity(disk,
> +		data->super->num_index_entries << PAGE_SECTORS_SHIFT);
> +
> +	// TODO: Handle cases where PAGE_SIZE is too big.
> +	/* Set physical and logical block size to PAGE_SIZE */
> +	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
> +	blk_queue_logical_block_size(disk->queue, PAGE_SIZE);
> +
> +	/* Tell the block layer that this is not a rotational device */
> +	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
> +	/* Don't use this for randomness */
> +	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
> +
> +	/* Support discard */
> +	blk_queue_flag_set(QUEUE_FLAG_DISCARD, disk->queue);
> +	disk->queue->limits.discard_granularity = PAGE_SIZE;
> +	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
> +	/* We can handle WRITE_ZEROES as DISCARD, at units of page size */
> +	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX);
> +
> +	err = add_disk(disk);
> +	if (err)
> +		goto out_cleanup_disk;
> +
> +	return 0;
> +out_cleanup_disk:
> +	blk_cleanup_disk(data->disk);
> +	data->disk = NULL;
> +	return err;
> +}
> +
> +
> +static void kmem_blk_disk_cleanup(struct kmem_blk_data *data)
> +{
> +	if (data->disk == NULL)
> +		return;
> +	del_gendisk(data->disk);
> +	blk_cleanup_disk(data->disk);
> +	data->disk = NULL;
> +}
> +
> +/* Format device with full allocation */
> +static int kmem_blk_format(struct dev_dax *dev_dax)
> +{
> +	struct kmem_blk_super *super =
> +		kmap_local_page(dax_kmem_index_to_page(0, dev_dax));
> +
> +	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
> +	u64 i;
> +	/*
> +	 * c = a / b => c is largest c s.t. c * b <= a.
> +	 * c = (a + b - 1) / b is smallest c s.t. c * b >= a
> +	 * num_index_pages is the largest number such that
> +	 * 1 + num_index_pages + num_index_pages * index_entries_per_page >= num_pages
> +	 * num_index_pages *(1 + index_entries_per_page) >= num_pages - 1
> +	 * num_index_pages =
> +	 *   ((num_pages - 1) + (1 + index_entries_per_page) - 1 ) /
> +	 *   (1 + index_entries_per_page)
> +	 */
> +	u64 num_index_pages =
> +		(num_pages + index_entries_per_page - 1) /
> +		(1 + index_entries_per_page);
> +	super->header.magic = kmem_persist_magic;
> +	super->header.type = KMEM_PERSIST_BLK;
> +	super->num_index_pages = num_index_pages;
> +	super->num_index_entries = num_pages - 1 - num_index_pages;
> +
> +	for (i = 0; i < num_index_pages; i++) {
> +		u64 *index_array =
> +			kmap_local_page(dax_kmem_index_to_page(1 + i, dev_dax));
> +#if !defined(KMEM_PERSIST_BLK_FORMAT_FULL)
> +		memset(index_array, 0, PAGE_SIZE);
> +#else /* KMEM_PERSIST_BLK_FORMAT_FULL */
> +		u64 j;
> +
> +		for (j = 0; j < index_entries_per_page; j++) {
> +			u64 idx =
> +				1 + num_index_pages +
> +				i * index_entries_per_page + j;
> +
> +			if (idx >= num_pages)
> +				idx = 0;
> +			index_array[j] = idx;
> +		}
> +#endif
> +		kunmap_local(index_array);
> +	}
> +	kunmap_local(super);
> +	return 0;
> +}
> +
> +/* Free unused blocks in the dax memory to system */
> +static int kmem_blk_free_unused(struct kmem_blk_data *data)
> +{
> +	struct kmem_blk_super *super = data->super;
> +	unsigned long num_pages = dax_kmem_num_pages(data->dev_dax);
> +	u64 *alloc_bitmap;
> +	unsigned long i;
> +
> +	/* Bitmap for tracking allocated pages. Temporary */
> +	alloc_bitmap =
> +		kvzalloc(sizeof(u64) * BITS_TO_U64(num_pages), GFP_KERNEL);
> +	if (alloc_bitmap == NULL) {
> +		dev_err(&data->dev_dax->dev,
> +			"Unable to allocate bit array. Not freeing unused space.\n");
> +		return -ENOMEM;
> +	}
> +
> +	/* Free up pages unused by block storage to memory */
> +	for (i = 0; i < super->num_index_entries; i++) {
> +		u64 page_num = data->index_page
> +			[i / index_entries_per_page]
> +			[i % index_entries_per_page];
> +
> +		if (page_num != 0) {
> +			BUG_ON(page_num < 1 + super->num_index_pages ||
> +				page_num >= num_pages);
> +			/* Set bit */
> +			alloc_bitmap[page_num / 64] |= 1ULL << (page_num % 64);
> +		}
> +	}
> +
> +	for (i = 1 + super->num_index_pages; i < num_pages; i++) {
> +		struct page *page;
> +
> +		if (!(alloc_bitmap[i / 64] & (1ULL << (i % 64)))) {
> +			/* Bit clear. Page not used */
> +			page = dax_kmem_index_to_page(i, data->dev_dax);
> +			__free_page(page);
> +		}
> +	}
> +
> +	kvfree(alloc_bitmap);
> +	return 0;
> +}
> +
> +static int kmem_blk_probe(struct dev_dax *dev_dax, void **persist_data)
> +{
> +	struct device *dev = &dev_dax->dev;
> +	struct kmem_blk_super *super;
> +	unsigned long i;
> +	struct kmem_blk_data *data;
> +	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
> +
> +	if (num_pages == 0) {
> +		dev_err(dev, "Dax device for KMEM has no pages\n");
> +		*persist_data = NULL;
> +		return -1;
> +	}
> +
> +	super = kmap(dax_kmem_index_to_page(0, dev_dax));

This looks better suited for kmap_local_page().

> +
> +	/* Validate superblock magic and type */
> +	if (super->header.magic != kmem_persist_magic ||
> +		super->header.type != KMEM_PERSIST_BLK) {
> +		dev_err(dev, "KMEM not formatted for blk, magic %lx type %d\n",
> +			super->header.magic, super->header.type);
> +		kunmap(dax_kmem_index_to_page(0, dev_dax));
> +		*persist_data = NULL;
> +		return -EINVAL;
> +	}
> +
> +	/* Validate superblock index page counts */
> +	if (super->num_index_entries <=
> +		super->num_index_pages * index_entries_per_page &&
> +		1 + super->num_index_pages + super->num_index_entries
> +		== num_pages) {
> +		dev_info(dev,
> +			"Found kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
> +			super->num_index_entries,
> +			super->num_index_pages, num_pages);
> +	} else {
> +		dev_warn(dev,
> +			"Invalid kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
> +			super->num_index_entries,
> +			super->num_index_pages, num_pages);
> +		kunmap(dax_kmem_index_to_page(0, dev_dax));
> +		*persist_data = NULL;
> +		return -EINVAL;
> +	}
> +
> +	data = kzalloc(struct_size(data, index_page, super->num_index_pages),
> +		GFP_KERNEL);
> +	if (!data) {
> +		kunmap(dax_kmem_index_to_page(0, dev_dax));
> +		*persist_data = NULL;
> +		return -ENOMEM;
> +	}
> +
> +	*persist_data = data;
> +	data->dev_dax = dev_dax;
> +	data->super = super;
> +	spin_lock_init(&data->index_lock);
> +
> +	for (i = 0; i < super->num_index_pages; i++)
> +		data->index_page[i] =
> +			kmap(dax_kmem_index_to_page(i + 1, dev_dax));

Nesting of kmap_local_page() is performed in a stack-based fashion (LIFO),
therefore the kunmap_local() calls must be invoked in reverse order.

Are there any special problems with this?
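
In other words, if the superblock page and an index page were both mapped
locally at the same time, the unmaps would have to be reversed, e.g. (sketch
only, with hypothetical super_page/index_page variables):

	super = kmap_local_page(super_page);
	index = kmap_local_page(index_page);
	/* ... */
	kunmap_local(index);
	kunmap_local(super);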

> +
> +	kmem_blk_free_unused(data);
> +
> +	kmem_blk_disk_init(data);
> +
> +	return 0;
> +}
> +
> +static int kmem_blk_cleanup(struct dev_dax *dev_dax, void *persist_data)
> +{
> +	struct kmem_blk_data *data = persist_data;
> +	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
> +	unsigned long i;
> +
> +	if (data == NULL)
> +		return -1;
> +
> +	kmem_blk_disk_cleanup(data);
> +
> +	if (data->super == NULL) {
> +		for (i = 0; i < num_pages; i++)
> +			__free_page(dax_kmem_index_to_page(i, dev_dax));
> +	} else {
> +		for (i = 0; i < data->super->num_index_entries; i++) {
> +			u64 page_num = data->index_page
> +				[i / index_entries_per_page]
> +				[i % index_entries_per_page];
> +			if (page_num != 0) {
> +				__free_page(dax_kmem_index_to_page(page_num,
> +								   dev_dax));
> +			}
> +		}
> +		for (i = 0; i < data->super->num_index_pages; i++) {
> +			struct page *page =
> +				dax_kmem_index_to_page(1 + i, dev_dax);
> +			data->index_page[i] = NULL;
> +			kunmap(page);
> +			__free_page(page);
> +		}
> +		data->super = NULL;
> +		kunmap(dax_kmem_index_to_page(0, dev_dax));
> +		__free_page(dax_kmem_index_to_page(0, dev_dax));
> +	}
> +	kfree(data);
> +	return 0;
> +}
> +
> +struct kmem_persist_ops kmem_persist_blk_ops = {
> +	.type = KMEM_PERSIST_BLK,
> +	.format = kmem_blk_format,
> +	.probe = kmem_blk_probe,
> +	.cleanup = kmem_blk_cleanup
> +};
> diff --git a/drivers/dax/kmem_persist.h b/drivers/dax/kmem_persist.h
> index dd651025f28c..0e0279feaa12 100644
> --- a/drivers/dax/kmem_persist.h
> +++ b/drivers/dax/kmem_persist.h
> @@ -10,6 +10,7 @@ struct dev_dax;
>  
>  enum kmem_persist_type {
>  	KMEM_PERSIST_NONE = 0,
> +	KMEM_PERSIST_BLK,
>  };
>  
>  
> @@ -40,4 +41,7 @@ unsigned long dax_kmem_num_pages(struct dev_dax *dev_dax);
>  struct page *dax_kmem_alloc_page(struct dev_dax *dev_dax,
>  				unsigned long *page_index);
>  
> +/* Defined in kmem_blk.c */
> +extern struct kmem_persist_ops kmem_persist_blk_ops;
> +
>  #endif
> -- 
> 2.30.2
> 

Thanks,

Fabio

P.S.: I'm Cc'ing Ira Weiny.

Patch

diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 90a56ca3b345..d0a97f4af4ea 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -3,6 +3,7 @@  obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
+obj-$(CONFIG_DEV_DAX_KMEM_PERSIST) += kmem_blk.o
 
 dax-y := super.o
 dax-y += bus.o
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 0ca6e14f7e73..0fa45d1ba9cc 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -534,8 +534,10 @@  static int __init dax_kmem_init(void)
 	if (rc)
 		kfree_const(kmem_name);
 #ifdef CONFIG_DEV_DAX_KMEM_PERSIST
-	if (rc == 0)
+	if (rc == 0) {
 		kmem_persist_type_register(&kmem_persist_none_ops);
+		kmem_persist_type_register(&kmem_persist_blk_ops);
+	}
 #endif
 	return rc;
 }
diff --git a/drivers/dax/kmem_blk.c b/drivers/dax/kmem_blk.c
new file mode 100644
index 000000000000..856b35713999
--- /dev/null
+++ b/drivers/dax/kmem_blk.c
@@ -0,0 +1,573 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 MemVerge. All rights reserved. */
+#include <linux/module.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "dax-private.h"
+#include "kmem_persist.h"
+
+static const unsigned int index_entries_per_page = (PAGE_SIZE / sizeof(u64));
+
+struct kmem_blk_super {
+	struct kmem_persist_superblock header;
+	u64 num_index_pages;
+	u64 num_index_entries;
+} __packed;
+
+struct kmem_blk_data {
+	struct dev_dax *dev_dax;
+	struct gendisk *disk;
+	spinlock_t index_lock;
+	struct kmem_blk_super *super;
+	unsigned long num_index_pages;
+	u64 *index_page[];
+};
+
+// TODO: Make sure locking is sound for concurrent multiple IOs,
+// i.e. writes and discards.
+
+static struct page *kmem_blk_get_page(struct kmem_blk_data *data,
+				sector_t sector)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	u64 page_num;
+
+	spin_lock(&data->index_lock);
+	page_num = data->index_page
+		[i / index_entries_per_page]
+		[i % index_entries_per_page];
+	spin_unlock(&data->index_lock);
+
+	if (page_num)
+		return dax_kmem_index_to_page(page_num, data->dev_dax);
+	else
+		return NULL;
+}
+
+/*
+ * Allocate a backing page for the block containing @sector and record
+ * it in the index, if one is not already present. May sleep.
+ */
+static int kmem_blk_insert_page(struct kmem_blk_data *data, sector_t sector)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	struct page *page;
+	unsigned long page_index = 0;
+	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
+	u64 *index_ptr =
+		&data->index_page
+		[i / index_entries_per_page][i % index_entries_per_page];
+
+	/* Check if block exists */
+	spin_lock(&data->index_lock);
+	page_num = *index_ptr;
+	spin_unlock(&data->index_lock);
+	if (page_num)
+		return 0;
+
+	page = dax_kmem_alloc_page(data->dev_dax, &page_index);
+	if (!page) {
+		dev_err(&data->dev_dax->dev, "Cannot allocate page\n");
+		return -1;
+	}
+
+	spin_lock(&data->index_lock);
+	if (*index_ptr != 0)
+		__free_page(page);
+	else
+		*index_ptr = page_index;
+	spin_unlock(&data->index_lock);
+
+	return 0;
+}
+
+static int kmem_blk_discard(struct kmem_blk_data *data,
+			sector_t sector, size_t n)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	struct page *page;
+	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
+	u64 *index_ptr;
+
+	BUG_ON(sector & ((1 << PAGE_SECTORS_SHIFT) - 1));
+	BUG_ON(n & (PAGE_SIZE - 1));
+
+	while (n > 0) {
+		BUG_ON(i > data->super->num_index_entries);
+		index_ptr =
+			&data->index_page
+			[i / index_entries_per_page]
+			[i % index_entries_per_page];
+		spin_lock(&data->index_lock);
+		page_num = *index_ptr;
+		if (page_num)
+			*index_ptr = 0;
+		spin_unlock(&data->index_lock);
+		if (page_num) {
+			page = dax_kmem_index_to_page(page_num, data->dev_dax);
+			__free_page(page);
+		}
+		i++;
+		n -= PAGE_SIZE;
+	}
+	return 0;
+}
+
+/*
+ * copy_to_kmem_blk_setup must be called before copy_to_kmem_blk. It may sleep.
+ */
+static int copy_to_kmem_blk_setup(struct kmem_blk_data *data, sector_t sector, size_t n)
+{
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	if (kmem_blk_insert_page(data, sector))
+		return -ENOSPC;
+	if (copy < n) {
+		sector += copy >> SECTOR_SHIFT;
+		if (kmem_blk_insert_page(data, sector))
+			return -ENOSPC;
+	}
+	return 0;
+}
+
+/*
+ * Copy n bytes from src to the block device starting at sector. Does not sleep.
+ */
+static void copy_to_kmem_blk(struct kmem_blk_data *data, const void *src,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *dst;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = kmem_blk_get_page(data, sector);
+	BUG_ON(!page);
+
+	dst = kmap_atomic(page);
+	memcpy(dst + offset, src, copy);
+	kunmap_atomic(dst);
+
+	if (copy < n) {
+		src += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = kmem_blk_get_page(data, sector);
+		BUG_ON(!page);
+
+		dst = kmap_atomic(page);
+		memcpy(dst, src, copy);
+		kunmap_atomic(dst);
+	}
+}
+
+/*
+ * Copy n bytes to dst from the block device starting at sector. Does not sleep.
+ */
+static void copy_from_kmem_blk(void *dst, struct kmem_blk_data *data,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *src;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = kmem_blk_get_page(data, sector);
+	if (page) {
+		src = kmap_atomic(page);
+		memcpy(dst, src + offset, copy);
+		kunmap_atomic(src);
+	} else
+		memset(dst, 0, copy);
+
+	if (copy < n) {
+		dst += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = kmem_blk_get_page(data, sector);
+		if (page) {
+			src = kmap_atomic(page);
+			memcpy(dst, src, copy);
+			kunmap_atomic(src);
+		} else
+			memset(dst, 0, copy);
+	}
+}
+
+/*
+ * Process a single bvec of a bio.
+ */
+static int kmem_blk_do_bvec(struct kmem_blk_data *data, struct page *page,
+			unsigned int len, unsigned int off, unsigned int op,
+			sector_t sector)
+{
+	void *mem = NULL;
+	int err = 0;
+
+	if (op == REQ_OP_WRITE) {
+		err = copy_to_kmem_blk_setup(data, sector, len);
+		if (err)
+			goto out;
+	}
+
+	if (page)
+		mem = kmap_atomic(page);
+	switch (op) {
+	case REQ_OP_READ:
+		copy_from_kmem_blk(mem + off, data, sector, len);
+		flush_dcache_page(page);
+		break;
+	case REQ_OP_WRITE:
+		flush_dcache_page(page);
+		copy_to_kmem_blk(data, mem + off, sector, len);
+		break;
+	case REQ_OP_DISCARD:
+		BUG_ON(page);
+		kmem_blk_discard(data, sector, len);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	if (mem)
+		kunmap_atomic(mem);
+
+out:
+	return err;
+}
+
+static void kmem_blk_submit_bio(struct bio *bio)
+{
+	struct kmem_blk_data *data = bio->bi_bdev->bd_disk->private_data;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	/*
+	 * DISCARD and WRITE_ZEROES come separately and don't work with
+	 * bio_for_each_segment
+	 */
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		kmem_blk_discard(data, sector, bio->bi_iter.bi_size);
+		bio_endio(bio);
+		return;
+	default:
+		break;
+	}
+
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+		int err;
+
+		/* Don't support un-aligned buffer */
+		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
+				(len & (SECTOR_SIZE - 1)));
+		err = kmem_blk_do_bvec(data, bvec.bv_page, len, bvec.bv_offset,
+				bio_op(bio), sector);
+		if (err) {
+			bio_io_error(bio);
+			return;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+
+	bio_endio(bio);
+}
+
+static int kmem_blk_rw_page(struct block_device *bdev, sector_t sector,
+			struct page *page, unsigned int op)
+{
+	struct kmem_blk_data *data = bdev->bd_disk->private_data;
+	int err;
+
+	if (PageTransHuge(page))
+		return -EOPNOTSUPP;
+	err = kmem_blk_do_bvec(data, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op_is_write(op), err);
+	return err;
+}
+
+static const struct block_device_operations kmem_blk_fops = {
+	.owner =		THIS_MODULE,
+	.submit_bio =		kmem_blk_submit_bio,
+	.rw_page =		kmem_blk_rw_page,
+};
+
+
+
+
+
+static int kmem_blk_disk_init(struct kmem_blk_data *data)
+{
+	struct gendisk *disk;
+	int err;
+
+	disk = blk_alloc_disk(data->dev_dax->target_node);
+	data->disk = disk;
+
+	disk->flags = GENHD_FL_NO_PART;
+	disk->fops = &kmem_blk_fops;
+	disk->private_data = data;
+	snprintf(disk->disk_name, DISK_NAME_LEN, "kmem%d",
+		data->dev_dax->target_node);
+
+	set_capacity(disk,
+		data->super->num_index_entries << PAGE_SECTORS_SHIFT);
+
+	// TODO: Handle cases where PAGE_SIZE is too big.
+	/* Set physical and logical block size to PAGE_SIZE */
+	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(disk->queue, PAGE_SIZE);
+
+	/* Tell the block layer that this is not a rotational device */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	/* Don't use this for randomness */
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+
+	/* Support discard */
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, disk->queue);
+	disk->queue->limits.discard_granularity = PAGE_SIZE;
+	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+	/* We can handle WRITE_ZEROES as DISCARD, at units of page size */
+	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX);
+
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
+
+	return 0;
+out_cleanup_disk:
+	blk_cleanup_disk(data->disk);
+	data->disk = NULL;
+	return err;
+}
+
+
+static void kmem_blk_disk_cleanup(struct kmem_blk_data *data)
+{
+	if (data->disk == NULL)
+		return;
+	del_gendisk(data->disk);
+	blk_cleanup_disk(data->disk);
+	data->disk = NULL;
+}
+
+/* Format device with full allocation */
+static int kmem_blk_format(struct dev_dax *dev_dax)
+{
+	struct kmem_blk_super *super =
+		kmap_local_page(dax_kmem_index_to_page(0, dev_dax));
+
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+	u64 i;
+	/*
+	 * c = a / b => c is largest c s.t. c * b <= a.
+	 * c = (a + b - 1) / b is smallest c s.t. c * b >= a
+	 * num_index_pages is the largest number such that
+	 * 1 + num_index_pages + num_index_pages * index_entries_per_page >= num_pages
+	 * num_index_pages *(1 + index_entries_per_page) >= num_pages - 1
+	 * num_index_pages =
+	 *   ((num_pages - 1) + (1 + index_entries_per_page) - 1 ) /
+	 *   (1 + index_entries_per_page)
+	 */
+	u64 num_index_pages =
+		(num_pages + index_entries_per_page - 1) /
+		(1 + index_entries_per_page);
+	super->header.magic = kmem_persist_magic;
+	super->header.type = KMEM_PERSIST_BLK;
+	super->num_index_pages = num_index_pages;
+	super->num_index_entries = num_pages - 1 - num_index_pages;
+
+	for (i = 0; i < num_index_pages; i++) {
+		u64 *index_array =
+			kmap_local_page(dax_kmem_index_to_page(1 + i, dev_dax));
+#if !defined(KMEM_PERSIST_BLK_FORMAT_FULL)
+		memset(index_array, 0, PAGE_SIZE);
+#else /* KMEM_PERSIST_BLK_FORMAT_FULL */
+		u64 j;
+
+		for (j = 0; j < index_entries_per_page; j++) {
+			u64 idx =
+				1 + num_index_pages +
+				i * index_entries_per_page + j;
+
+			if (idx >= num_pages)
+				idx = 0;
+			index_array[j] = idx;
+		}
+#endif
+		kunmap_local(index_array);
+	}
+	kunmap_local(super);
+	return 0;
+}
+
+/* Free unused blocks in the dax memory to system */
+static int kmem_blk_free_unused(struct kmem_blk_data *data)
+{
+	struct kmem_blk_super *super = data->super;
+	unsigned long num_pages = dax_kmem_num_pages(data->dev_dax);
+	u64 *alloc_bitmap;
+	unsigned long i;
+
+	/* Bitmap for tracking allocated pages. Temporary */
+	alloc_bitmap =
+		kvzalloc(sizeof(u64) * BITS_TO_U64(num_pages), GFP_KERNEL);
+	if (alloc_bitmap == NULL) {
+		dev_err(&data->dev_dax->dev,
+			"Unable to allocate bit array. Not freeing unused space.\n");
+		return -ENOMEM;
+	}
+
+	/* Free up pages unused by block storage to memory */
+	for (i = 0; i < super->num_index_entries; i++) {
+		u64 page_num = data->index_page
+			[i / index_entries_per_page]
+			[i % index_entries_per_page];
+
+		if (page_num != 0) {
+			BUG_ON(page_num < 1 + super->num_index_pages ||
+				page_num >= num_pages);
+			/* Set bit */
+			alloc_bitmap[page_num / 64] |= 1ULL << (page_num % 64);
+		}
+	}
+
+	for (i = 1 + super->num_index_pages; i < num_pages; i++) {
+		struct page *page;
+
+		if (!(alloc_bitmap[i / 64] & (1ULL << (i % 64)))) {
+			/* Bit clear. Page not used */
+			page = dax_kmem_index_to_page(i, data->dev_dax);
+			__free_page(page);
+		}
+	}
+
+	kvfree(alloc_bitmap);
+	return 0;
+}
+
+static int kmem_blk_probe(struct dev_dax *dev_dax, void **persist_data)
+{
+	struct device *dev = &dev_dax->dev;
+	struct kmem_blk_super *super;
+	unsigned long i;
+	struct kmem_blk_data *data;
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+
+	if (num_pages == 0) {
+		dev_err(dev, "Dax device for KMEM has no pages\n");
+		*persist_data = NULL;
+		return -1;
+	}
+
+	super = kmap(dax_kmem_index_to_page(0, dev_dax));
+
+	/* Validate superblock magic and type */
+	if (super->header.magic != kmem_persist_magic ||
+		super->header.type != KMEM_PERSIST_BLK) {
+		dev_err(dev, "KMEM not formatted for blk, magic %lx type %d\n",
+			super->header.magic, super->header.type);
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -EINVAL;
+	}
+
+	/* Validate superblock index page counts */
+	if (super->num_index_entries <=
+		super->num_index_pages * index_entries_per_page &&
+		1 + super->num_index_pages + super->num_index_entries
+		== num_pages) {
+		dev_info(dev,
+			"Found kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
+			super->num_index_entries,
+			super->num_index_pages, num_pages);
+	} else {
+		dev_warn(dev,
+			"Invalid kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
+			super->num_index_entries,
+			super->num_index_pages, num_pages);
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -EINVAL;
+	}
+
+	data = kzalloc(struct_size(data, index_page, super->num_index_pages),
+		GFP_KERNEL);
+	if (!data) {
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -ENOMEM;
+	}
+
+	*persist_data = data;
+	data->dev_dax = dev_dax;
+	data->super = super;
+	spin_lock_init(&data->index_lock);
+
+	for (i = 0; i < super->num_index_pages; i++)
+		data->index_page[i] =
+			kmap(dax_kmem_index_to_page(i + 1, dev_dax));
+
+	kmem_blk_free_unused(data);
+
+	kmem_blk_disk_init(data);
+
+	return 0;
+}
+
+static int kmem_blk_cleanup(struct dev_dax *dev_dax, void *persist_data)
+{
+	struct kmem_blk_data *data = persist_data;
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+	unsigned long i;
+
+	if (data == NULL)
+		return -1;
+
+	kmem_blk_disk_cleanup(data);
+
+	if (data->super == NULL) {
+		for (i = 0; i < num_pages; i++)
+			__free_page(dax_kmem_index_to_page(i, dev_dax));
+	} else {
+		for (i = 0; i < data->super->num_index_entries; i++) {
+			u64 page_num = data->index_page
+				[i / index_entries_per_page]
+				[i % index_entries_per_page];
+			if (page_num != 0) {
+				__free_page(dax_kmem_index_to_page(page_num,
+								   dev_dax));
+			}
+		}
+		for (i = 0; i < data->super->num_index_pages; i++) {
+			struct page *page =
+				dax_kmem_index_to_page(1 + i, dev_dax);
+			data->index_page[i] = NULL;
+			kunmap(page);
+			__free_page(page);
+		}
+		data->super = NULL;
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		__free_page(dax_kmem_index_to_page(0, dev_dax));
+	}
+	kfree(data);
+	return 0;
+}
+
+struct kmem_persist_ops kmem_persist_blk_ops = {
+	.type = KMEM_PERSIST_BLK,
+	.format = kmem_blk_format,
+	.probe = kmem_blk_probe,
+	.cleanup = kmem_blk_cleanup
+};
diff --git a/drivers/dax/kmem_persist.h b/drivers/dax/kmem_persist.h
index dd651025f28c..0e0279feaa12 100644
--- a/drivers/dax/kmem_persist.h
+++ b/drivers/dax/kmem_persist.h
@@ -10,6 +10,7 @@  struct dev_dax;
 
 enum kmem_persist_type {
 	KMEM_PERSIST_NONE = 0,
+	KMEM_PERSIST_BLK,
 };
 
 
@@ -40,4 +41,7 @@  unsigned long dax_kmem_num_pages(struct dev_dax *dev_dax);
 struct page *dax_kmem_alloc_page(struct dev_dax *dev_dax,
 				unsigned long *page_index);
 
+/* Defined in kmem_blk.c */
+extern struct kmem_persist_ops kmem_persist_blk_ops;
+
 #endif