
[RFC,05/17] zuf: Multi Devices

Message ID 20190219115136.29952-6-boaz@plexistor.com (mailing list archive)
State New, archived
Series zuf: ZUFS Zero-copy User-mode FileSystem

Commit Message

Boaz Harrosh Feb. 19, 2019, 11:51 a.m. UTC
From: Boaz Harrosh <boazh@netapp.com>

ZUFS supports multiple block devices per super_block.
This patch adds the device-handling code. The end result is
a single multi_devices (md.h) object associated with the
mounting super_block.

There are three modes of operation:
* Mount without a device (mount -t FOO none /somepath)

* A single device - The FS declared register_fs_info->dt_offset==-1.
  No checks are made by the Kernel; the single bdev is registered with
  the Kernel's mount_bdev. It is up to the zusFS to check validity.

* Multiple devices - The FS declared register_fs_info->dt_offset==X

  This mode is the main subject of this patch.
  A single device is given on the mount command line. At
  register_fs_info->dt_offset of this device we look for a
  zufs_dev_table structure. After all the checks we read the
  device list found there and open all devices. Any one of the devices
  may be given on the command line, but they will always be opened in
  DT (Device Table) order. The Device Table has the notion of two types
  of bdevs:
  T1 devices - pmem devices capable of direct_access
  T2 devices - non-direct_access devices

  All T1 devices are presented as one linear array, in DT order
  (see the lookup sketch below). In t1.c we mmap this space so the
  server can directly access pmem (in the proper persistent way).

  [We do not support arbitrary direct_access devices; we only support
   pmem(s) where the whole device can be addressed by a single
   physical/virtual address. This is checked before mount.]

   The T2 devices are also grabbed and owned by the super_block.
   A later API will enable the Server to write or transfer buffers
   from T1 to T2 in a very efficient manner. They too are presented
   as a single linear array in DT order.

   Both kinds of devices are NUMA aware, and the NUMA info is presented
   to the zusFS for optimal allocation and access.
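
   Below is a minimal sketch (not part of the patch) of how a linear
   block number is resolved to its backing device: _map_setup() in md.c
   builds, per tier, a lookup table with one entry per gcd-of-device-sizes
   blocks, so md_bn_t1_dev()/md_bn_t2_dev() become a single array lookup:

	/* Sketch only; mirrors struct md_dev_larray (md.h) and _map_setup() */
	struct md_dev_larray {
		ulong bn_gcd;			/* gcd of all device sizes, in blocks */
		struct md_dev_info **map;	/* one entry per bn_gcd blocks */
	};

	static inline struct md_dev_info *bn_to_dev(struct md_dev_larray *la,
						    ulong bn)
	{
		/* Devices are laid back-to-back in DT order and every device
		 * size is a multiple of bn_gcd, so dividing by the gcd
		 * indexes the owning device directly.
		 */
		return la->map[bn / la->bn_gcd];
	}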

Signed-off-by: Boaz Harrosh <boazh@netapp.com>
---
 fs/zuf/Makefile   |   3 +
 fs/zuf/_extern.h  |   3 +
 fs/zuf/_pr.h      |  11 +
 fs/zuf/md.c       | 764 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/zuf/md.h       | 318 +++++++++++++++++++
 fs/zuf/md_def.h   | 145 +++++++++
 fs/zuf/t1.c       | 138 +++++++++
 fs/zuf/t2.c       | 375 +++++++++++++++++++++++
 fs/zuf/t2.h       |  68 +++++
 fs/zuf/zuf-core.c |  87 ++++++
 fs/zuf/zuf.h      |  39 +++
 fs/zuf/zus_api.h  |  18 ++
 12 files changed, 1969 insertions(+)
 create mode 100644 fs/zuf/md.c
 create mode 100644 fs/zuf/md.h
 create mode 100644 fs/zuf/md_def.h
 create mode 100644 fs/zuf/t1.c
 create mode 100644 fs/zuf/t2.c
 create mode 100644 fs/zuf/t2.h
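
For context (not part of the patch): after mount, the ZUS server grabs
the pmem with the new ZU_IOC_GRAB_PMEM ioctl (zuf-core.c below) and then
mmaps the whole T1 array via zuf_pmem_mmap() (t1.c below). A rough
user-space sketch of that flow follows; the zuf-root fd and the helper
name are illustrative assumptions, only ZU_IOC_GRAB_PMEM and
struct zufs_ioc_pmem come from this patch (zus_api.h):

	/* Hypothetical usage sketch, not the actual zus code */
	#include <stddef.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>

	#include "zus_api.h"

	static void *grab_and_map_pmem(int zuf_fd, __u32 pmem_kern_id,
				       size_t t1_size)
	{
		struct zufs_ioc_pmem ioc = { .pmem_kern_id = pmem_kern_id };

		if (ioctl(zuf_fd, ZU_IOC_GRAB_PMEM, &ioc) || ioc.hdr.err)
			return MAP_FAILED;

		/* _zu_grab_pmem() sets the file size to the T1 size, so
		 * t1_size could also be taken from fstat(zuf_fd).
		 */
		return mmap(NULL, t1_size, PROT_READ | PROT_WRITE, MAP_SHARED,
			    zuf_fd, 0);
	}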

Patch

diff --git a/fs/zuf/Makefile b/fs/zuf/Makefile
index 8e62b4c52150..091bf053a6ed 100644
--- a/fs/zuf/Makefile
+++ b/fs/zuf/Makefile
@@ -10,6 +10,9 @@ 
 
 obj-$(CONFIG_ZUF) += zuf.o
 
+# Infrastructure
+zuf-y += md.o t1.o t2.o
+
 # ZUF core
 zuf-y += zuf-core.o zuf-root.o
 
diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h
index 52bb6b9deafe..15d632ea5ed2 100644
--- a/fs/zuf/_extern.h
+++ b/fs/zuf/_extern.h
@@ -57,4 +57,7 @@  int zufc_dispatch(struct zuf_root_info *zri, struct zufs_ioc_hdr *hdr,
 /* zuf-root.c */
 int zufr_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs);
 
+/* t1.c */
+int zuf_pmem_mmap(struct file *file, struct vm_area_struct *vma);
+
 #endif	/*ndef __ZUF_EXTERN_H__*/
diff --git a/fs/zuf/_pr.h b/fs/zuf/_pr.h
index dc9f85453890..85641b6f1478 100644
--- a/fs/zuf/_pr.h
+++ b/fs/zuf/_pr.h
@@ -40,8 +40,19 @@ 
 /* ~~~ channel prints ~~~ */
 #define zuf_dbg_err(s, args ...)	zuf_chan_debug("error", s, ##args)
 #define zuf_dbg_vfs(s, args ...)	zuf_chan_debug("vfs  ", s, ##args)
+#define zuf_dbg_t1(s, args ...)		zuf_chan_debug("t1   ", s, ##args)
+#define zuf_dbg_t2(s, args ...)		zuf_chan_debug("t2dbg", s, ##args)
+#define zuf_dbg_t2_rw(s, args ...)	zuf_chan_debug("t2grw", s, ##args)
 #define zuf_dbg_core(s, args ...)	zuf_chan_debug("core ", s, ##args)
 #define zuf_dbg_zus(s, args ...)	zuf_chan_debug("zusdg", s, ##args)
 #define zuf_dbg_verbose(s, args ...)	zuf_chan_debug("d-oto", s, ##args)
 
+#define md_err		zuf_err
+#define md_warn		zuf_warn
+#define md_err_cnd	zuf_err_cnd
+#define md_warn_cnd	zuf_warn_cnd
+#define md_dbg_err	zuf_dbg_err
+#define md_dbg_verbose	zuf_dbg_verbose
+
+
 #endif /* define __ZUF_PR_H__ */
diff --git a/fs/zuf/md.c b/fs/zuf/md.c
new file mode 100644
index 000000000000..059c96062177
--- /dev/null
+++ b/fs/zuf/md.c
@@ -0,0 +1,764 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Multi-Device operations.
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * ZUFS-License: GPL-2.0. See module.c for LICENSE details.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ *	Sagi Manole <sagim@netapp.com>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/pfn_t.h>
+#include <linux/crc16.h>
+#include <linux/uuid.h>
+
+#include <linux/gcd.h>
+
+#include "_pr.h"
+#include "md.h"
+#include "t2.h"
+
+/* length of uuid dev path /dev/disk/by-uuid/<uuid> */
+#define PATH_UUID	64
+
+const fmode_t _g_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+
+/* allocate space for and copy an existing uuid */
+static char *_uuid_path(uuid_le *uuid)
+{
+	char path[PATH_UUID];
+
+	sprintf(path, "/dev/disk/by-uuid/%pUb", uuid);
+	return kstrdup(path, GFP_KERNEL);
+}
+
+static int _bdev_get_by_path(const char *path, struct block_device **bdev,
+			     void *holder)
+{
+	/* The owner of the device is the pointer that will hold it. This
+	 * protects from same device mounting on two super-blocks as well
+	 * as same device being repeated twice.
+	 */
+	*bdev = blkdev_get_by_path(path, _g_mode, holder);
+	if (IS_ERR(*bdev)) {
+		int err = PTR_ERR(*bdev);
+		*bdev = NULL;
+		return err;
+	}
+	return 0;
+}
+
+static void _bdev_put(struct block_device **bdev, struct block_device *s_bdev)
+{
+	if (*bdev) {
+		if (!s_bdev || *bdev != s_bdev)
+			blkdev_put(*bdev, _g_mode);
+		*bdev = NULL;
+	}
+}
+
+static int ___bdev_get_by_uuid(struct block_device **bdev, uuid_le *uuid,
+			       void *holder, bool silent, const char *msg,
+			       const char *f, int l)
+{
+	char *path = NULL;
+	int err;
+
+	path = _uuid_path(uuid);
+	err = _bdev_get_by_path(path, bdev, holder);
+	if (unlikely(err))
+		md_err_cnd(silent, "[%s:%d] %s path=%s =>%d\n",
+			     f, l, msg, path, err);
+
+	kfree(path);
+	return err;
+}
+
+#define _bdev_get_by_uuid(bdev, uuid, holder, msg) \
+	___bdev_get_by_uuid(bdev, uuid, holder, silent, msg, __func__, __LINE__)
+
+static bool _main_bdev(struct block_device *bdev)
+{
+	if (bdev->bd_super && bdev->bd_super->s_bdev == bdev)
+		return true;
+	return false;
+}
+
+short md_calc_csum(struct md_dev_table *mdt)
+{
+	uint n = MDT_STATIC_SIZE(mdt) - sizeof(mdt->s_sum);
+
+	return crc16(~0, (__u8 *)&mdt->s_version, n);
+}
+
+/* ~~~~~~~ mdt related functions ~~~~~~~ */
+
+int md_t2_mdt_read(struct multi_devices *md, int index,
+		   struct md_dev_table *mdt)
+{
+	int err = t2_readpage(md, index, virt_to_page(mdt));
+
+	if (err)
+		md_dbg_verbose("!!! t2_readpage err=%d\n", err);
+
+	return err;
+}
+
+int _t2_mdt_read(struct block_device *bdev, struct md_dev_table *mdt)
+{
+	int err;
+	/* t2 interface works for all block devices */
+	struct multi_devices *md;
+	struct md_dev_info *mdi;
+
+	md = kzalloc(sizeof(*md), GFP_KERNEL);
+	if (unlikely(!md))
+		return -ENOMEM;
+
+	md->t2_count = 1;
+	md->devs[0].bdev = bdev;
+	mdi = &md->devs[0];
+	md->t2a.map = &mdi;
+	md->t2a.bn_gcd = 1; /* Does not matter, only must not be zero */
+
+	err = md_t2_mdt_read(md, 0, mdt);
+
+	kfree(md);
+	return err;
+}
+
+int md_t2_mdt_write(struct multi_devices *md, struct md_dev_table *mdt)
+{
+	int i, err = 0;
+
+	for (i = 0; i < md->t2_count; ++i) {
+		ulong bn = md_o2p(md_t2_dev(md, i)->offset);
+
+		mdt->s_dev_list.id_index = mdt->s_dev_list.t1_count + i;
+		mdt->s_sum = cpu_to_le16(md_calc_csum(mdt));
+
+		err = t2_writepage(md, bn, virt_to_page(mdt));
+		if (err)
+			md_dbg_verbose("!!! t2_writepage err=%d\n", err);
+	}
+
+	return err;
+}
+
+static bool _csum_mismatch(struct md_dev_table *mdt, int silent)
+{
+	ushort crc = md_calc_csum(mdt);
+
+	if (mdt->s_sum == cpu_to_le16(crc))
+		return false;
+
+	md_warn_cnd(silent, "expected(0x%x) != s_sum(0x%x)\n",
+		      cpu_to_le16(crc), mdt->s_sum);
+	return true;
+}
+
+static bool _uuid_le_equal(uuid_le *uuid1, uuid_le *uuid2)
+{
+	return (memcmp(uuid1, uuid2, sizeof(uuid_le)) == 0);
+}
+
+static bool _mdt_compare_uuids(struct md_dev_table *mdt,
+			       struct md_dev_table *main_mdt, int silent)
+{
+	int i, dev_count;
+
+	if (!_uuid_le_equal(&mdt->s_uuid, &main_mdt->s_uuid)) {
+		md_warn_cnd(silent, "mdt uuid (%pUb != %pUb) mismatch\n",
+			      &mdt->s_uuid, &main_mdt->s_uuid);
+		return false;
+	}
+
+	dev_count = mdt->s_dev_list.t1_count + mdt->s_dev_list.t2_count +
+		    mdt->s_dev_list.rmem_count;
+	for (i = 0; i < dev_count; ++i) {
+		struct md_dev_id *dev_id1 = &mdt->s_dev_list.dev_ids[i];
+		struct md_dev_id *dev_id2 = &main_mdt->s_dev_list.dev_ids[i];
+
+		if (!_uuid_le_equal(&dev_id1->uuid, &dev_id2->uuid)) {
+			md_warn_cnd(silent,
+				    "mdt dev %d uuid (%pUb != %pUb) mismatch\n",
+				    i, &dev_id1->uuid, &dev_id2->uuid);
+			return false;
+		}
+
+		if (dev_id1->blocks != dev_id2->blocks) {
+			md_warn_cnd(silent,
+				    "mdt dev %d blocks (0x%llx != 0x%llx) mismatch\n",
+				    i, le64_to_cpu(dev_id1->blocks),
+				    le64_to_cpu(dev_id2->blocks));
+			return false;
+		}
+	}
+
+	return true;
+}
+
+bool md_mdt_check(struct md_dev_table *mdt,
+		  struct md_dev_table *main_mdt, struct block_device *bdev,
+		  struct mdt_check *mc)
+{
+	struct md_dev_table *mdt2 = (void *)mdt + MDT_SIZE;
+	struct md_dev_id *dev_id;
+	ulong bdev_size, super_size;
+
+	BUILD_BUG_ON(MDT_STATIC_SIZE(mdt) & (SMP_CACHE_BYTES - 1));
+
+	/* Do sanity checks on the superblock */
+	if (le32_to_cpu(mdt->s_magic) != mc->magic) {
+		if (le32_to_cpu(mdt2->s_magic) != mc->magic) {
+			md_warn_cnd(mc->silent,
+				     "Can't find a valid partition\n");
+			return false;
+		}
+
+		md_warn_cnd(mc->silent,
+			     "Magic error in super block: using copy\n");
+		/* Try to auto-recover the super block */
+		memcpy_flushcache(mdt, mdt2, sizeof(*mdt));
+	}
+
+	if ((mc->major_ver != mdt_major_version(mdt)) ||
+	    (mc->minor_ver < mdt_minor_version(mdt))) {
+		md_warn_cnd(mc->silent,
+			     "mkfs-mount versions mismatch! %d.%d != %d.%d\n",
+			     mdt_major_version(mdt), mdt_minor_version(mdt),
+			     mc->major_ver, mc->minor_ver);
+		return false;
+	}
+
+	if (_csum_mismatch(mdt, mc->silent)) {
+		if (_csum_mismatch(mdt2, mc->silent)) {
+			md_warn_cnd(mc->silent,
+				    "checksum error in super block\n");
+			return false;
+		}
+
+		md_warn_cnd(mc->silent,
+			    "crc16 error in super block: using copy\n");
+		/* Try to auto-recover the super block */
+		memcpy_flushcache(mdt, mdt2, MDT_SIZE);
+		/* TODO(sagi): copy fixed mdt to shadow */
+	}
+
+	if (main_mdt) {
+		if (mdt->s_dev_list.t1_count != main_mdt->s_dev_list.t1_count) {
+			md_warn_cnd(mc->silent, "mdt t1 count mismatch\n");
+			return false;
+		}
+
+		if (mdt->s_dev_list.t2_count != main_mdt->s_dev_list.t2_count) {
+			md_warn_cnd(mc->silent, "mdt t2 count mismatch\n");
+			return false;
+		}
+
+		if (mdt->s_dev_list.rmem_count !=
+		    main_mdt->s_dev_list.rmem_count) {
+			md_warn_cnd(mc->silent,
+				    "mdt rmem dev count mismatch\n");
+			return false;
+		}
+
+		if (!_mdt_compare_uuids(mdt, main_mdt, mc->silent))
+			return false;
+	}
+
+	/* check alignment */
+	dev_id = &mdt->s_dev_list.dev_ids[mdt->s_dev_list.id_index];
+	super_size = md_p2o(__dev_id_blocks(dev_id));
+	if (unlikely(!super_size || super_size & mc->alloc_mask)) {
+		md_warn_cnd(mc->silent, "super_size(0x%lx) ! 2_M aligned\n",
+			      super_size);
+		return false;
+	}
+
+	if (!bdev)
+		return true;
+
+	/* check t1 device size */
+	bdev_size = i_size_read(bdev->bd_inode);
+	if (unlikely(super_size > bdev_size)) {
+		md_warn_cnd(mc->silent,
+			    "bdev_size(0x%lx) too small expected 0x%lx\n",
+			    bdev_size, super_size);
+		return false;
+	} else if (unlikely(super_size < bdev_size)) {
+		md_dbg_err("Note mdt->size=(0x%lx) < bdev_size(0x%lx)\n",
+			      super_size, bdev_size);
+	}
+
+	return true;
+}
+
+
+int md_set_sb(struct multi_devices *md, struct block_device *s_bdev,
+	      void *sb, int silent)
+{
+	int i;
+
+	for (i = 0; i < md->t1_count; ++i) {
+		struct md_dev_info *mdi;
+
+		if (i == md->dev_index)
+			continue;
+
+		mdi = md_t1_dev(md, i);
+		if (mdi->bdev->bd_super && (mdi->bdev->bd_super != sb)) {
+			md_warn_cnd(silent,
+				"!!! %s already mounted on a different FS => -EBUSY\n",
+				_bdev_name(mdi->bdev));
+			return -EBUSY;
+		}
+
+		mdi->bdev->bd_super = sb;
+	}
+
+	md_dev_info(md, md->dev_index)->bdev = s_bdev;
+	return 0;
+}
+
+void md_fini(struct multi_devices *md, struct block_device *s_bdev)
+{
+	int i;
+
+	kfree(md->t2a.map);
+	kfree(md->t1a.map);
+
+	for (i = 0; i < md->t1_count + md->t2_count; ++i) {
+		struct md_dev_info *mdi = md_dev_info(md, i);
+
+		md_t1_info_fini(mdi);
+		if (mdi->bdev && !_main_bdev(mdi->bdev))
+			mdi->bdev->bd_super = NULL;
+		_bdev_put(&mdi->bdev, s_bdev);
+	}
+
+	kfree(md);
+}
+
+
+/* ~~~~~~~ Pre-mount operations ~~~~~~~ */
+
+static int _get_device(struct block_device **bdev, const char *dev_name,
+		       uuid_le *uuid, void *holder, int silent,
+		       bool *bind_mount)
+{
+	int err;
+
+	if (dev_name)
+		err = _bdev_get_by_path(dev_name, bdev, holder);
+	else
+		err = _bdev_get_by_uuid(bdev, uuid, holder,
+					"failed to get device");
+
+	if (unlikely(err)) {
+		md_err_cnd(silent,
+			"failed to get device dev_name=%s uuid=%pUb err=%d\n",
+			dev_name, uuid, err);
+		return err;
+	}
+
+	if (bind_mount && _main_bdev(*bdev))
+		*bind_mount = true;
+
+	return 0;
+}
+
+static int _init_dev_info(struct md_dev_info *mdi, struct md_dev_id *id,
+			  int index, u64 offset,
+			  struct md_dev_table *main_mdt,
+			  struct mdt_check *mc, bool t1_dev,
+			  int silent)
+{
+	struct md_dev_table *mdt = NULL;
+	bool mdt_alloc = false;
+	int err = 0;
+
+	if (mdi->bdev == NULL) {
+		err = _get_device(&mdi->bdev, NULL, &id->uuid, mc->holder,
+				  silent, NULL);
+		if (unlikely(err))
+			return err;
+	}
+
+	mdi->offset = offset;
+	mdi->size = md_p2o(__dev_id_blocks(id));
+	mdi->index = index;
+
+	if (t1_dev) {
+		struct page *dev_page;
+		int end_of_dev_nid;
+
+		err = md_t1_info_init(mdi, silent);
+		if (unlikely(err))
+			return err;
+
+		if ((ulong)mdi->t1i.virt_addr & mc->alloc_mask) {
+			md_warn_cnd(silent, "!!! unaligned device %s\n",
+				      _bdev_name(mdi->bdev));
+			return -EINVAL;
+		}
+
+		if (!__pfn_to_section(mdi->t1i.phys_pfn)) {
+			md_err_cnd(silent, "Intel does not like pages...\n");
+			return -EINVAL;
+		}
+
+		mdt = mdi->t1i.virt_addr;
+
+		mdi->t1i.pgmap = virt_to_page(mdt)->pgmap;
+		dev_page = pfn_to_page(mdi->t1i.phys_pfn);
+		mdi->nid = page_to_nid(dev_page);
+		end_of_dev_nid = page_to_nid(dev_page + md_o2p(mdi->size - 1));
+
+		if (mdi->nid != end_of_dev_nid)
+			md_warn("pmem crosses NUMA boundaries");
+	} else {
+		mdt = (void *)__get_free_page(GFP_KERNEL);
+		if (unlikely(!mdt)) {
+			md_dbg_err("!!! failed to alloc page\n");
+			return -ENOMEM;
+		}
+
+		mdt_alloc = true;
+		err = _t2_mdt_read(mdi->bdev, mdt);
+		if (unlikely(err)) {
+			md_err_cnd(silent, "failed to read mdt from t2 => %d\n",
+				   err);
+			goto out;
+		}
+		mdi->nid = __dev_id_nid(id);
+	}
+
+	if (!md_mdt_check(mdt, main_mdt, mdi->bdev, mc)) {
+		md_err_cnd(silent, "device %s failed integrity check\n",
+			     _bdev_name(mdi->bdev));
+		err = -EINVAL;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	if (mdt_alloc)
+		free_page((ulong)mdt);
+	return err;
+}
+
+static int _map_setup(struct multi_devices *md, ulong blocks, int dev_start,
+		      struct md_dev_larray *larray)
+{
+	ulong map_size, bn_end;
+	int i, dev_index = dev_start;
+
+	map_size = blocks / larray->bn_gcd;
+	larray->map = kcalloc(map_size, sizeof(*larray->map), GFP_KERNEL);
+	if (!larray->map) {
+		md_dbg_err("failed to allocate dev map\n");
+		return -ENOMEM;
+	}
+
+	bn_end = md_o2p(md->devs[dev_index].size);
+	for (i = 0; i < map_size; ++i) {
+		if ((i * larray->bn_gcd) >= bn_end)
+			bn_end += md_o2p(md->devs[++dev_index].size);
+		larray->map[i] = &md->devs[dev_index];
+	}
+
+	return 0;
+}
+
+static int _md_init(struct multi_devices *md, struct mdt_check *mc,
+		    struct md_dev_list *dev_list, int silent)
+{
+	struct md_dev_table *main_mdt = NULL;
+	u64 total_size = 0;
+	int i, err;
+
+	for (i = 0; i < md->t1_count; ++i) {
+		struct md_dev_info *mdi = md_t1_dev(md, i);
+		struct md_dev_table *dev_mdt;
+
+		err = _init_dev_info(mdi, &dev_list->dev_ids[i], i, total_size,
+				     main_mdt, mc, true, silent);
+		if (unlikely(err))
+			return err;
+
+		/* apparently gcd(0,X)=X which is nice */
+		md->t1a.bn_gcd = gcd(md->t1a.bn_gcd, md_o2p(mdi->size));
+		total_size += mdi->size;
+
+		dev_mdt = md_t1_addr(md, i);
+		if (!main_mdt)
+			main_mdt = dev_mdt;
+
+		if (mdt_test_option(dev_mdt, MDT_F_SHADOW))
+			memcpy(mdi->t1i.virt_addr,
+			       mdi->t1i.virt_addr + mdi->size, mdi->size);
+
+		md_dbg_verbose("dev=%d %pUb %s v=%p pfn=%lu off=%lu size=%lu\n",
+				 i, &dev_list->dev_ids[i].uuid,
+				 _bdev_name(mdi->bdev), dev_mdt,
+				 mdi->t1i.phys_pfn, mdi->offset, mdi->size);
+	}
+
+	md->t1_blocks = le64_to_cpu(main_mdt->s_t1_blocks);
+	if (unlikely(md->t1_blocks != md_o2p(total_size))) {
+		md_err_cnd(silent,
+			"FS corrupted md->t1_blocks(0x%lx) != total_size(0x%llx)\n",
+			md->t1_blocks, total_size);
+		return -EIO;
+	}
+
+	err = _map_setup(md, le64_to_cpu(main_mdt->s_t1_blocks), 0, &md->t1a);
+	if (unlikely(err))
+		return err;
+
+	md_dbg_verbose("t1 devices=%d total_size=0x%llx segment_map=0x%lx\n",
+			 md->t1_count, total_size,
+			 md_o2p(total_size) / md->t1a.bn_gcd);
+
+	if (md->t2_count == 0)
+		return 0;
+
+	/* Done with t1. Counting t2s */
+	total_size = 0;
+	for (i = 0; i < md->t2_count; ++i) {
+		struct md_dev_info *mdi = md_t2_dev(md, i);
+
+		err = _init_dev_info(mdi, &dev_list->dev_ids[md->t1_count + i],
+				     md->t1_count + i, total_size, main_mdt,
+				     mc, false, silent);
+		if (unlikely(err))
+			return err;
+
+		/* apparently gcd(0,X)=X which is nice */
+		md->t2a.bn_gcd = gcd(md->t2a.bn_gcd, md_o2p(mdi->size));
+		total_size += mdi->size;
+
+		md_dbg_verbose("dev=%d %s off=%lu size=%lu\n", i,
+				 _bdev_name(mdi->bdev), mdi->offset, mdi->size);
+	}
+
+	md->t2_blocks = le64_to_cpu(main_mdt->s_t2_blocks);
+	if (unlikely(md->t2_blocks != md_o2p(total_size))) {
+		md_err_cnd(silent,
+			"FS corrupted md->t2_blocks(0x%lx) != total_size(0x%llx)\n",
+			md->t2_blocks, total_size);
+		return -EIO;
+	}
+
+	err = _map_setup(md, le64_to_cpu(main_mdt->s_t2_blocks), md->t1_count,
+			 &md->t2a);
+	if (unlikely(err))
+		return err;
+
+	md_dbg_verbose("t2 devices=%d total_size=%llu segment_map=%lu\n",
+			 md->t2_count, total_size,
+			 md_o2p(total_size) / md->t2a.bn_gcd);
+
+	return 0;
+}
+
+static int _load_dev_list(struct md_dev_list *dev_list, struct mdt_check *mc,
+			  struct block_device *bdev, const char *dev_name,
+			  int silent)
+{
+	struct md_dev_table *mdt;
+	int err;
+
+	mdt = (void *)__get_free_page(GFP_KERNEL);
+	if (unlikely(!mdt)) {
+		md_dbg_err("!!! failed to alloc page\n");
+		return -ENOMEM;
+	}
+
+	err = _t2_mdt_read(bdev, mdt);
+	if (unlikely(err)) {
+		md_err_cnd(silent, "failed to read super block from %s => %d\n",
+			     dev_name, err);
+		goto out;
+	}
+
+	if (!md_mdt_check(mdt, NULL, bdev, mc)) {
+		md_err_cnd(silent, "bad mdt in %s\n", dev_name);
+		err = -EINVAL;
+		goto out;
+	}
+
+	*dev_list = mdt->s_dev_list;
+
+out:
+	free_page((ulong)mdt);
+	return err;
+}
+
+int md_init(struct multi_devices *md, const char *dev_name,
+	    struct mdt_check *mc, const char **dev_path)
+{
+	struct md_dev_list *dev_list;
+	struct block_device *bdev;
+	short id_index;
+	bool bind_mount = false;
+	int err;
+
+	dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
+	if (unlikely(!dev_list))
+		return -ENOMEM;
+
+	err = _get_device(&bdev, dev_name, NULL, mc->holder, mc->silent,
+			  &bind_mount);
+	if (unlikely(err))
+		goto out2;
+
+	err = _load_dev_list(dev_list, mc, bdev, dev_name, mc->silent);
+	if (unlikely(err)) {
+		_bdev_put(&bdev, NULL);
+		goto out2;
+	}
+
+	id_index = le16_to_cpu(dev_list->id_index);
+	if (bind_mount) {
+		_bdev_put(&bdev, NULL);
+		md->dev_index = id_index;
+		goto out;
+	}
+
+	md->t1_count = le16_to_cpu(dev_list->t1_count);
+	md->t2_count = le16_to_cpu(dev_list->t2_count);
+	md->devs[id_index].bdev = bdev;
+
+	if ((id_index != 0)) {
+		err = _get_device(&md_t1_dev(md, 0)->bdev, NULL,
+				  &dev_list->dev_ids[0].uuid, mc->holder,
+				  mc->silent, &bind_mount);
+		if (unlikely(err))
+			goto out2;
+
+		if (bind_mount)
+			goto out;
+	}
+
+	if (md->t2_count) {
+		int t2_index = md->t1_count;
+
+		/* t2 is the primary device if given in mount, or the first
+		 * mount specified it as primary device
+		 */
+		if (id_index != md->t1_count) {
+			err = _get_device(&md_t2_dev(md, 0)->bdev, NULL,
+					  &dev_list->dev_ids[t2_index].uuid,
+					  mc->holder, mc->silent, &bind_mount);
+			if (unlikely(err))
+				goto out2;
+
+			if (bind_mount)
+				md->dev_index = t2_index;
+		}
+
+		if (t2_index <= id_index)
+			md->dev_index = t2_index;
+	}
+
+out:
+	if (md->dev_index != id_index)
+		*dev_path = _uuid_path(&dev_list->dev_ids[md->dev_index].uuid);
+	else
+		*dev_path = kstrdup(dev_name, GFP_KERNEL);
+
+	if (!bind_mount) {
+		err = _md_init(md, mc, dev_list, mc->silent);
+		if (unlikely(err))
+			goto out2;
+		_bdev_put(&md_dev_info(md, md->dev_index)->bdev, NULL);
+	} else {
+		md_fini(md, NULL);
+	}
+
+out2:
+	kfree(dev_list);
+
+	return err;
+}
+
+struct multi_devices *md_alloc(size_t size)
+{
+	uint s = max(sizeof(struct multi_devices), size);
+	struct multi_devices *md = kzalloc(s, GFP_KERNEL);
+
+	if (unlikely(!md))
+		return ERR_PTR(-ENOMEM);
+	return md;
+}
+
+/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * PORTING SECTION:
+ * Below are members that are done differently in different Linux versions.
+ * So keep separate from code
+ */
+static int _check_da_ret(struct md_dev_info *mdi, long avail, bool silent)
+{
+	if (unlikely(avail < (long)mdi->size)) {
+		if (0 < avail) {
+			md_warn_cnd(silent,
+				"Unsupported DAX device %s (range mismatch) => 0x%lx < 0x%lx\n",
+				_bdev_name(mdi->bdev), avail, mdi->size);
+			return -ERANGE;
+		}
+		md_warn_cnd(silent, "!!! %s direct_access return => %ld\n",
+			    _bdev_name(mdi->bdev), avail);
+		return avail;
+	}
+	return 0;
+}
+
+#include <linux/dax.h>
+
+int md_t1_info_init(struct md_dev_info *mdi, bool silent)
+{
+	pfn_t a_pfn_t;
+	void *addr;
+	long nrpages, avail, pgoff;
+	int id;
+
+	mdi->t1i.dax_dev = fs_dax_get_by_bdev(mdi->bdev);
+	if (unlikely(!mdi->t1i.dax_dev))
+		return -EOPNOTSUPP;
+
+	id = dax_read_lock();
+
+	bdev_dax_pgoff(mdi->bdev, 0, PAGE_SIZE, &pgoff);
+	nrpages = dax_direct_access(mdi->t1i.dax_dev, pgoff, md_o2p(mdi->size),
+				    &addr, &a_pfn_t);
+	dax_read_unlock(id);
+	if (unlikely(nrpages <= 0)) {
+		if (!nrpages)
+			nrpages = -ERANGE;
+		avail = nrpages;
+	} else {
+		avail = md_p2o(nrpages);
+	}
+
+	mdi->t1i.virt_addr = addr;
+	mdi->t1i.phys_pfn = pfn_t_to_pfn(a_pfn_t);
+
+	md_dbg_verbose("0x%lx 0x%lx pgoff=0x%lx\n",
+			 (ulong)mdi->t1i.virt_addr, mdi->t1i.phys_pfn, pgoff);
+
+	return _check_da_ret(mdi, avail, silent);
+}
+
+void md_t1_info_fini(struct md_dev_info *mdi)
+{
+	fs_put_dax(mdi->t1i.dax_dev);
+	mdi->t1i.dax_dev = NULL;
+	mdi->t1i.virt_addr = NULL;
+}
diff --git a/fs/zuf/md.h b/fs/zuf/md.h
new file mode 100644
index 000000000000..eaf7280af356
--- /dev/null
+++ b/fs/zuf/md.h
@@ -0,0 +1,318 @@ 
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Multi-Device operations.
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ *	Sagi Manole <sagim@netapp.com>
+ */
+
+#ifndef __MD_H__
+#define __MD_H__
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+
+#include "md_def.h"
+
+#ifndef __KERNEL__
+struct page;
+struct block_device;
+#endif /* ndef __KERNEL__ */
+
+struct md_t1_info {
+	void *virt_addr;
+#ifdef __KERNEL__
+	ulong phys_pfn;
+	struct dax_device *dax_dev;
+	struct dev_pagemap *pgmap;
+#endif /*def __KERNEL__*/
+};
+
+struct md_t2_info {
+	bool err_read_reported;
+	bool err_write_reported;
+};
+
+struct md_dev_info {
+	struct block_device *bdev;
+	ulong size;
+	ulong offset;
+	union {
+		struct md_t1_info	t1i;
+		struct md_t2_info	t2i;
+	};
+	int index;
+	int nid;
+};
+
+struct md_dev_larray {
+	ulong bn_gcd;
+	struct md_dev_info **map;
+};
+
+#ifndef __KERNEL__
+struct fba {
+	int fd; void *ptr;
+	size_t size;
+	void *orig_ptr;
+};
+#endif /*! __KERNEL__*/
+
+struct zus_sb_info;
+struct multi_devices {
+	int dev_index;
+	int t1_count;
+	int t2_count;
+	struct md_dev_info devs[MD_DEV_MAX];
+	struct md_dev_larray t1a;
+	struct md_dev_larray t2a;
+#ifndef __KERNEL__
+	struct zufs_ioc_pmem pmem_info; /* As received from Kernel */
+
+	void *p_pmem_addr;
+	int fd;
+	uint user_page_size;
+	struct fba pages;
+	struct zus_sb_info *sbi;
+#else
+	ulong t1_blocks;
+	ulong t2_blocks;
+#endif /*! __KERNEL__*/
+};
+
+static inline __u64 md_p2o(ulong bn)
+{
+	return (__u64)bn << PAGE_SHIFT;
+}
+
+static inline ulong md_o2p(__u64 offset)
+{
+	return offset >> PAGE_SHIFT;
+}
+
+static inline ulong md_o2p_up(__u64 offset)
+{
+	return md_o2p(offset + PAGE_SIZE - 1);
+}
+
+static inline struct md_dev_info *md_t1_dev(struct multi_devices *md, int i)
+{
+	return &md->devs[i];
+}
+
+static inline struct md_dev_info *md_t2_dev(struct multi_devices *md, int i)
+{
+	return &md->devs[md->t1_count + i];
+}
+
+static inline struct md_dev_info *md_dev_info(struct multi_devices *md, int i)
+{
+	return &md->devs[i];
+}
+
+static inline void *md_t1_addr(struct multi_devices *md, int i)
+{
+	struct md_dev_info *mdi = md_t1_dev(md, i);
+
+	return mdi->t1i.virt_addr;
+}
+
+static inline ulong md_t1_blocks(struct multi_devices *md)
+{
+#ifdef __KERNEL__
+	return md->t1_blocks;
+#else
+	return md->pmem_info.mdt.s_t1_blocks;
+#endif
+}
+
+static inline ulong md_t2_blocks(struct multi_devices *md)
+{
+#ifdef __KERNEL__
+	return md->t2_blocks;
+#else
+	return md->pmem_info.mdt.s_t2_blocks;
+#endif
+}
+
+static inline struct md_dev_table *md_zdt(struct multi_devices *md)
+{
+	return md_t1_addr(md, 0);
+}
+
+static inline struct md_dev_info *md_bn_t1_dev(struct multi_devices *md,
+						 ulong bn)
+{
+	return md->t1a.map[bn / md->t1a.bn_gcd];
+}
+
+#ifdef __KERNEL__
+static inline ulong md_pfn(struct multi_devices *md, ulong block)
+{
+	struct md_dev_info *mdi;
+	bool add_pfn = false;
+	ulong base_pfn;
+
+	if (unlikely(md_t1_blocks(md) <= block)) {
+		if (WARN_ON(!mdt_test_option(md_zdt(md), MDT_F_SHADOW)))
+			return 0;
+		block -= md_t1_blocks(md);
+		add_pfn = true;
+	}
+
+	mdi = md_bn_t1_dev(md, block);
+	if (add_pfn)
+		base_pfn = mdi->t1i.phys_pfn + md_o2p(mdi->size);
+	else
+		base_pfn = mdi->t1i.phys_pfn;
+	return base_pfn + (block - md_o2p(mdi->offset));
+}
+#endif /* def __KERNEL__ */
+
+static inline void *md_addr(struct multi_devices *md, ulong offset)
+{
+#ifdef __KERNEL__
+	struct md_dev_info *mdi = md_bn_t1_dev(md, md_o2p(offset));
+
+	return offset ? mdi->t1i.virt_addr + (offset - mdi->offset) : NULL;
+#else
+	return offset ? md->p_pmem_addr + offset : NULL;
+#endif
+}
+
+static inline void *md_baddr(struct multi_devices *md, ulong bn)
+{
+	return md_addr(md, md_p2o(bn));
+}
+
+static inline struct md_dev_info *md_bn_t2_dev(struct multi_devices *md,
+					       ulong bn)
+{
+	return md->t2a.map[bn / md->t2a.bn_gcd];
+}
+
+static inline int md_t2_bn_nid(struct multi_devices *md, ulong bn)
+{
+	struct md_dev_info *mdi = md_bn_t2_dev(md, bn);
+
+	return mdi->nid;
+}
+
+static inline ulong md_t2_local_bn(struct multi_devices *md, ulong bn)
+{
+#ifdef __KERNEL__
+	struct md_dev_info *mdi = md_bn_t2_dev(md, bn);
+
+	return bn - md_o2p(mdi->offset);
+#else
+	return bn; /* In zus we just let Kernel worry about it */
+#endif
+}
+
+static inline ulong md_t2_gcd(struct multi_devices *md)
+{
+	return md->t2a.bn_gcd;
+}
+
+static inline void *md_addr_verify(struct multi_devices *md, ulong offset)
+{
+	if (unlikely(offset > md_p2o(md_t1_blocks(md)))) {
+		md_dbg_err("offset=0x%lx > max=0x%llx\n",
+			    offset, md_p2o(md_t1_blocks(md)));
+		return NULL;
+	}
+
+	return md_addr(md, offset);
+}
+
+static inline struct page *md_bn_to_page(struct multi_devices *md, ulong bn)
+{
+#ifdef __KERNEL__
+	return pfn_to_page(md_pfn(md, bn));
+#else
+	return md->pages.ptr + bn * md->user_page_size;
+#endif
+}
+
+static inline ulong md_addr_to_offset(struct multi_devices *md, void *addr)
+{
+#ifdef __KERNEL__
+	/* TODO: Keep the device index in page-flags we need to fix the
+	 * page-ref right? for now with pages untouched we need this loop
+	 */
+	int dev_index;
+
+	for (dev_index = 0; dev_index < md->t1_count; ++dev_index) {
+		struct md_dev_info *mdi = md_t1_dev(md, dev_index);
+
+		if ((mdi->t1i.virt_addr <= addr) &&
+		    (addr < (mdi->t1i.virt_addr + mdi->size)))
+			return mdi->offset + (addr - mdi->t1i.virt_addr);
+	}
+
+	return 0;
+#else /* !__KERNEL__ */
+	return addr - md->p_pmem_addr;
+#endif
+}
+
+static inline ulong md_addr_to_bn(struct multi_devices *md, void *addr)
+{
+	return md_o2p(md_addr_to_offset(md, addr));
+}
+
+static inline ulong md_page_to_bn(struct multi_devices *md, struct page *page)
+{
+#ifdef __KERNEL__
+	return md_addr_to_bn(md, page_address(page));
+#else
+	ulong bytes = (void *)page - md->pages.ptr;
+
+	return bytes / md->user_page_size;
+#endif
+}
+
+#ifdef __KERNEL__
+/* TODO: Change API to take mdi and also support in um */
+static inline const char *_bdev_name(struct block_device *bdev)
+{
+	return dev_name(&bdev->bd_part->__dev);
+}
+#endif /*def __KERNEL__*/
+
+struct mdt_check {
+	ulong alloc_mask;
+	uint major_ver;
+	uint minor_ver;
+	__u32  magic;
+
+	void *holder;
+	bool silent;
+};
+
+/* md.c */
+bool md_mdt_check(struct md_dev_table *mdt, struct md_dev_table *main_mdt,
+		  struct block_device *bdev, struct mdt_check *mc);
+int md_t2_mdt_read(struct multi_devices *md, int dev_index,
+		   struct md_dev_table *mdt);
+int md_t2_mdt_write(struct multi_devices *md, struct md_dev_table *mdt);
+short md_calc_csum(struct md_dev_table *mdt);
+void md_fini(struct multi_devices *md, struct block_device *s_bdev);
+
+#ifdef __KERNEL__
+struct multi_devices *md_alloc(size_t size);
+int md_init(struct multi_devices *md, const char *dev_name,
+	    struct mdt_check *mc, const char **dev_path);
+int md_set_sb(struct multi_devices *md, struct block_device *s_bdev, void *sb,
+	      int silent);
+int md_t1_info_init(struct md_dev_info *mdi, bool silent);
+void md_t1_info_fini(struct md_dev_info *mdi);
+
+#else /* m1us */
+int md_init_from_pmem_info(struct multi_devices *md);
+#endif
+
+#endif
diff --git a/fs/zuf/md_def.h b/fs/zuf/md_def.h
new file mode 100644
index 000000000000..a236567adfd9
--- /dev/null
+++ b/fs/zuf/md_def.h
@@ -0,0 +1,145 @@ 
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Multi-Device operations.
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ *	Sagi Manole <sagim@netapp.com>
+ */
+#ifndef _LINUX_MD_DEF_H
+#define _LINUX_MD_DEF_H
+
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+#ifndef __KERNEL__
+
+#include <stdint.h>
+#include <endian.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#ifndef le16_to_cpu
+
+#define le16_to_cpu(x)	((__u16)le16toh(x))
+#define le32_to_cpu(x)	((__u32)le32toh(x))
+#define le64_to_cpu(x)	((__u64)le64toh(x))
+#define cpu_to_le16(x)	((__le16)htole16(x))
+#define cpu_to_le32(x)	((__le32)htole32(x))
+#define cpu_to_le64(x)	((__le64)htole64(x))
+
+#endif
+
+#ifndef __aligned
+#define	__aligned(x)			__attribute__((aligned(x)))
+#endif
+
+#ifndef __packed
+#	define __packed __attribute__((packed))
+#endif
+
+#endif /*  ndef __KERNEL__ */
+
+#define MDT_SIZE 2048
+
+#define MD_DEV_NUMA_SHIFT		60
+#define MD_DEV_BLOCKS_MASK		0x0FFFFFFFFFFFFFFF
+
+struct md_dev_id {
+	uuid_le	uuid;
+	__le64	blocks;
+} __packed;
+
+static inline __u64 __dev_id_blocks(struct md_dev_id *dev)
+{
+	return le64_to_cpu(dev->blocks) & MD_DEV_BLOCKS_MASK;
+}
+
+static inline void __dev_id_blocks_set(struct md_dev_id *dev, __u64 blocks)
+{
+	dev->blocks &= ~MD_DEV_BLOCKS_MASK;
+	dev->blocks |= blocks;
+}
+
+static inline int __dev_id_nid(struct md_dev_id *dev)
+{
+	return (int)(le64_to_cpu(dev->blocks) >> MD_DEV_NUMA_SHIFT);
+}
+
+static inline void __dev_id_nid_set(struct md_dev_id *dev, int nid)
+{
+	dev->blocks &= MD_DEV_BLOCKS_MASK;
+	dev->blocks |= (__le64)nid << MD_DEV_NUMA_SHIFT;
+}
+
+/* 64 is the nicest number to still fit when the ZDT is 2048 and 6 bits can
+ * fit in page struct for address to block translation.
+ */
+#define MD_DEV_MAX   64
+
+struct md_dev_list {
+	__le16		   id_index;	/* index of current dev in list */
+	__le16		   t1_count;	/* # of t1 devs */
+	__le16		   t2_count;	/* # of t2 devs (after t1_count) */
+	__le16		   rmem_count;	/* align to 64 bit */
+	struct md_dev_id dev_ids[MD_DEV_MAX];
+} __aligned(64);
+
+/*
+ * Structure of the on-disk multi-device table
+ * NOTE: md_dev_table is always of size MDT_SIZE. These below are the
+ *   currently defined/used members in this version.
+ *   TODO: remove the s_ from all the fields
+ */
+struct md_dev_table {
+	/* static fields. they never change after file system creation.
+	 * checksum only validates up to s_start_dynamic field below
+	 */
+	__le16		s_sum;              /* checksum of this sb */
+	__le16		s_version;          /* zdt-version */
+	__le32		s_magic;            /* magic signature */
+	uuid_le		s_uuid;		    /* 128-bit uuid */
+	__le64		s_flags;
+	__le64		s_t1_blocks;
+	__le64		s_t2_blocks;
+
+	struct md_dev_list s_dev_list;
+
+	char		s_start_dynamic[0];
+
+	/* all the dynamic fields should go here */
+	__le64		s_mtime;		/* mount time */
+	__le64		s_wtime;		/* write time */
+};
+
+/* device table s_flags */
+enum enum_mdt_flags {
+	MDT_F_SHADOW		= (1UL << 0),	/* simulate cpu cache */
+	MDT_F_POSIXACL		= (1UL << 1),	/* enable acls */
+
+	MDT_F_USER_START	= 8,	/* first 8 bit reserved for mdt */
+};
+
+static inline bool mdt_test_option(struct md_dev_table *mdt,
+				   enum enum_mdt_flags flag)
+{
+	return (mdt->s_flags & flag) != 0;
+}
+
+#define MD_MINORS_PER_MAJOR	1024
+
+static inline int mdt_major_version(struct md_dev_table *mdt)
+{
+	return le16_to_cpu(mdt->s_version) / MD_MINORS_PER_MAJOR;
+}
+
+static inline int mdt_minor_version(struct md_dev_table *mdt)
+{
+	return le16_to_cpu(mdt->s_version) % MD_MINORS_PER_MAJOR;
+}
+
+#define MDT_STATIC_SIZE(mdt) ((__u64)&mdt->s_start_dynamic - (__u64)mdt)
+
+#endif /* _LINUX_MD_DEF_H */
diff --git a/fs/zuf/t1.c b/fs/zuf/t1.c
new file mode 100644
index 000000000000..53da4c5840c7
--- /dev/null
+++ b/fs/zuf/t1.c
@@ -0,0 +1,138 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Just the special mmap of the whole t1 array to the ZUS Server
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * ZUFS-License: GPL-2.0. See module.c for LICENSE details.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/pfn_t.h>
+#include <asm/pgtable.h>
+
+#include "_pr.h"
+#include "zuf.h"
+
+/* ~~~ Functions for mmap a t1-array and page faults ~~~ */
+struct zuf_pmem *_pmem_from_f_private(struct file *file)
+{
+	struct zuf_special_file *zsf = file->private_data;
+
+	WARN_ON(zsf->type != zlfs_e_pmem);
+	return container_of(zsf, struct zuf_pmem, hdr);
+}
+
+static vm_fault_t t1_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct inode *inode = vma->vm_file->f_mapping->host;
+	ulong addr = vmf->address;
+	struct zuf_pmem *z_pmem;
+	pgoff_t size;
+	ulong bn;
+	pfn_t pfnt;
+	ulong pfn = 0;
+	vm_fault_t flt;
+
+	zuf_dbg_t1("[%ld] vm_start=0x%lx vm_end=0x%lx VA=0x%lx "
+		    "pgoff=0x%lx vmf_flags=0x%x cow_page=%p page=%p pe_size=%d\n",
+		    inode->i_ino, vma->vm_start, vma->vm_end, addr, vmf->pgoff,
+		    vmf->flags, vmf->cow_page, vmf->page, pe_size);
+
+	if (unlikely(vmf->page)) {
+		zuf_err("[%ld] vm_start=0x%lx vm_end=0x%lx VA=0x%lx "
+			"pgoff=0x%lx vmf_flags=0x%x page=%p cow_page=%p\n",
+			inode->i_ino, vma->vm_start, vma->vm_end, addr,
+			vmf->pgoff, vmf->flags, vmf->page, vmf->cow_page);
+		return VM_FAULT_SIGBUS;
+	}
+
+	size = md_o2p_up(i_size_read(inode));
+	if (unlikely(vmf->pgoff >= size)) {
+		ulong pgoff = vma->vm_pgoff + md_o2p(addr - vma->vm_start);
+
+		zuf_err("[%ld] pgoff(0x%lx)(0x%lx) >= size(0x%lx) => SIGBUS\n",
+			 inode->i_ino, vmf->pgoff, pgoff, size);
+
+		return VM_FAULT_SIGBUS;
+	}
+
+	if (vmf->cow_page)
+		/* HOWTO: prevent private mmaps */
+		return VM_FAULT_SIGBUS;
+
+	z_pmem = _pmem_from_f_private(vma->vm_file);
+
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		zuf_err("[%ld] PTE fault not expected pgoff=0x%lx addr=0x%lx\n",
+			inode->i_ino, vmf->pgoff, addr);
+		/* fall through do PMD insert anyway */
+	case PE_SIZE_PMD:
+		bn = linear_page_index(vma, addr & PMD_MASK);
+		pfn = md_pfn(&z_pmem->md, bn);
+		pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV);
+		flt = vmf_insert_pfn_pmd(vma, addr, vmf->pmd, pfnt, true);
+		zuf_dbg_t1("[%ld] PMD pfn-0x%lx addr=0x%lx bn=0x%lx pgoff=0x%lx => %d\n",
+			inode->i_ino, pfn, addr, bn, vmf->pgoff, flt);
+		break;
+	default:
+		/* FIXME: Easily support PE_SIZE_PUD Just needs to align to
+		 * PUD_MASK at zufr_get_unmapped_area(). But this is hard today
+		 * because of the 2M nvdimm lib takes for its page flag
+		 * information with NFIT. (That need not be there in any which
+		 * case.)
+		 * Which means zufr_get_unmapped_area needs to return
+		 * a align1G+2M address start. and first 1G is map PMD size.
+		 * Very ugly, sigh.
+		 * One thing I do not understand why when the vma->vm_start is
+		 * not PUD aligned and faults requests index zero. Then system
+		 * asks for PE_SIZE_PUD anyway. say my 0 index is 1G aligned
+		 * vmf_insert_pfn_pud() will always fail because the aligned
+		 * vm_addr is outside the vma.
+		 */
+		flt = VM_FAULT_FALLBACK;
+		zuf_dbg_t1("[%ld] default? pgoff=0x%lx addr=0x%lx pe_size=0x%x => %d\n",
+			   inode->i_ino, vmf->pgoff, addr, pe_size, flt);
+	}
+
+	return flt;
+}
+
+static vm_fault_t t1_fault_pte(struct vm_fault *vmf)
+{
+	return t1_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct t1_vm_ops = {
+	.huge_fault	= t1_fault,
+	.fault		= t1_fault_pte,
+};
+
+int zuf_pmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct zuf_special_file *zsf = file->private_data;
+
+	if (!zsf || zsf->type != zlfs_e_pmem)
+		return -EPERM;
+
+
+	/* FIXME:  MIXEDMAP for the support of pmem-pages (Why?)
+	 */
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	vma->vm_ops = &t1_vm_ops;
+
+	zuf_dbg_vfs("[%ld] start=0x%lx end=0x%lx flags=0x%lx page_prot=0x%lx\n",
+		     file->f_mapping->host->i_ino, vma->vm_start, vma->vm_end,
+		     vma->vm_flags, pgprot_val(vma->vm_page_prot));
+
+	return 0;
+}
+
diff --git a/fs/zuf/t2.c b/fs/zuf/t2.c
new file mode 100644
index 000000000000..7bc7b42466b9
--- /dev/null
+++ b/fs/zuf/t2.c
@@ -0,0 +1,375 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tier-2 operations.
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * ZUFS-License: GPL-2.0. See module.c for LICENSE details.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ */
+
+#include "_pr.h"
+#include "t2.h"
+
+#include <linux/bitops.h>
+#include <linux/bio.h>
+
+#include "zuf.h"
+
+#define t2_dbg(fmt, args ...) zuf_dbg_t2(fmt, ##args)
+#define t2_warn(fmt, args ...) zuf_warn(fmt, ##args)
+
+const char *_pr_rw(int rw)
+{
+	return (rw & WRITE) ? "WRITE" : "READ";
+}
+#define t2_tis_dbg(tis, fmt, args ...) \
+	zuf_dbg_t2("%s: r=%d f=0x%lx " fmt, _pr_rw(tis->rw_flags),	       \
+		    atomic_read(&tis->refcount), tis->rw_flags, ##args)
+
+#define t2_tis_dbg_rw(tis, fmt, args ...) \
+	zuf_dbg_t2_rw("%s<%p>: r=%d f=0x%lx " fmt, _pr_rw(tis->rw_flags),     \
+		    tis->priv, atomic_read(&tis->refcount), tis->rw_flags,\
+		    ##args)
+
+/* ~~~~~~~~~~~~ Async read/write ~~~~~~~~~~ */
+void t2_io_begin(struct multi_devices *md, int rw, t2_io_done_fn done,
+		 void *priv, uint n_vects, struct t2_io_state *tis)
+{
+	atomic_set(&tis->refcount, 1);
+	tis->md = md;
+	tis->done = done;
+	tis->priv = priv;
+	tis->n_vects = min(n_vects ? n_vects : 1, (uint)BIO_MAX_PAGES);
+	tis->rw_flags = rw;
+	tis->last_t2 = -1;
+	tis->cur_bio = NULL;
+	tis->index = ~0;
+	bio_list_init(&tis->delayed_bios);
+	tis->err = 0;
+	blk_start_plug(&tis->plug);
+	t2_tis_dbg_rw(tis, "done=%pS n_vects=%d\n", done, n_vects);
+}
+
+static void _tis_put(struct t2_io_state *tis)
+{
+	t2_tis_dbg_rw(tis, "done=%pS\n", tis->done);
+
+	if (test_bit(B_TIS_FREE_AFTER_WAIT, &tis->rw_flags))
+		wake_up_var(&tis->refcount);
+	else if (tis->done)
+		/* last - done may free the tis */
+		tis->done(tis, NULL, true);
+}
+
+static inline void tis_get(struct t2_io_state *tis)
+{
+	atomic_inc(&tis->refcount);
+}
+
+static inline int tis_put(struct t2_io_state *tis)
+{
+	if (atomic_dec_and_test(&tis->refcount)) {
+		_tis_put(tis);
+		return 1;
+	}
+	return 0;
+}
+
+static inline bool _err_set_reported(struct md_dev_info *mdi, bool write)
+{
+	bool *reported = write ? &mdi->t2i.err_write_reported :
+				 &mdi->t2i.err_read_reported;
+
+	if (!(*reported)) {
+		*reported = true;
+		return true;
+	}
+	return false;
+}
+
+static int _status_to_errno(blk_status_t status)
+{
+	return -EIO;
+}
+
+void t2_io_done(struct t2_io_state *tis, struct bio *bio, bool last)
+{
+	struct bio_vec *bv;
+	uint i;
+
+	if (!bio)
+		return;
+
+	bio_for_each_segment_all(bv, bio, i)
+		put_page(bv->bv_page);
+}
+
+static void _tis_bio_done(struct bio *bio)
+{
+	struct t2_io_state *tis = bio->bi_private;
+	struct md_dev_info *mdi = md_t2_dev(tis->md, 0);
+
+	t2_tis_dbg(tis, "done=%pS err=%d\n", tis->done, bio->bi_status);
+
+	if (unlikely(bio->bi_status)) {
+		zuf_dbg_err("%s: err=%d last-err=%d\n",
+			     _pr_rw(tis->rw_flags), bio->bi_status, tis->err);
+		if (_err_set_reported(mdi, 0 != (tis->rw_flags & WRITE)))
+			zuf_err("%s: err=%d\n",
+				 _pr_rw(tis->rw_flags), bio->bi_status);
+		/* Store the last one */
+		tis->err = _status_to_errno(bio->bi_status);
+	} else if (unlikely(mdi->t2i.err_write_reported ||
+			    mdi->t2i.err_read_reported)) {
+		if (tis->rw_flags & WRITE)
+			mdi->t2i.err_write_reported = false;
+		else
+			mdi->t2i.err_read_reported = false;
+	}
+
+	if (tis->done)
+		tis->done(tis, bio, false);
+	else
+		t2_io_done(tis, bio, false);
+
+	bio_put(bio);
+	tis_put(tis);
+}
+
+static bool _tis_delay(struct t2_io_state *tis)
+{
+	return 0 != (tis->rw_flags & TIS_DELAY_SUBMIT);
+}
+
+#define bio_list_for_each_safe(bio, btmp, bl)				\
+	for (bio = (bl)->head,	btmp = bio ? bio->bi_next : NULL;	\
+	     bio; bio = btmp,	btmp = bio ? bio->bi_next : NULL)
+
+static void _tis_submit_bio(struct t2_io_state *tis, bool flush, bool done)
+{
+	if (flush || done) {
+		if (_tis_delay(tis)) {
+			struct bio *btmp, *bio;
+
+			bio_list_for_each_safe(bio, btmp, &tis->delayed_bios) {
+				bio->bi_next = NULL;
+				if (bio->bi_iter.bi_sector == -1) {
+					t2_warn("!!!!!!!!!!!!!\n");
+					bio_put(bio);
+					continue;
+				}
+				t2_tis_dbg(tis, "submit bio[%d] max_v=%d\n",
+					    bio->bi_vcnt, tis->n_vects);
+				submit_bio(bio);
+			}
+			bio_list_init(&tis->delayed_bios);
+		}
+
+		if (!tis->cur_bio)
+			return;
+
+		if (tis->cur_bio->bi_iter.bi_sector != -1) {
+			t2_tis_dbg(tis, "submit bio[%d] max_v=%d\n",
+				    tis->cur_bio->bi_vcnt, tis->n_vects);
+			submit_bio(tis->cur_bio);
+			tis->cur_bio = NULL;
+			tis->index = ~0;
+		} else if (done) {
+			t2_tis_dbg(tis, "put cur_bio=%p\n", tis->cur_bio);
+			bio_put(tis->cur_bio);
+			WARN_ON(tis_put(tis));
+		}
+	} else if (tis->cur_bio && (tis->cur_bio->bi_iter.bi_sector != -1)) {
+		/* Not flushing regular progress */
+		if (_tis_delay(tis)) {
+			t2_tis_dbg(tis, "list_add cur_bio=%p\n", tis->cur_bio);
+			bio_list_add(&tis->delayed_bios, tis->cur_bio);
+		} else {
+			t2_tis_dbg(tis, "submit bio[%d] max_v=%d\n",
+				    tis->cur_bio->bi_vcnt, tis->n_vects);
+			submit_bio(tis->cur_bio);
+		}
+		tis->cur_bio = NULL;
+		tis->index = ~0;
+	}
+}
+
+/* tis->cur_bio MUST be NULL, checked by caller */
+static void _tis_alloc(struct t2_io_state *tis, struct md_dev_info *mdi,
+		       gfp_t gfp)
+{
+	struct bio *bio = bio_alloc(gfp, tis->n_vects);
+	int bio_op;
+
+	if (unlikely(!bio)) {
+		if (!_tis_delay(tis))
+			t2_warn("!!! failed to alloc bio");
+		tis->err = -ENOMEM;
+		return;
+	}
+
+	if (WARN_ON(!tis || !tis->md)) {
+		tis->err = -ENOMEM;
+		return;
+	}
+
+	/* FIXME: bio_set_op_attrs macro has a BUG which does not allow this
+	 * question inline.
+	 */
+	bio_op = (tis->rw_flags & WRITE) ? REQ_OP_WRITE : REQ_OP_READ;
+	bio_set_op_attrs(bio, bio_op, 0);
+
+	bio->bi_iter.bi_sector = -1;
+	bio->bi_end_io = _tis_bio_done;
+	bio->bi_private = tis;
+
+	if (mdi) {
+		bio_set_dev(bio, mdi->bdev);
+		tis->index = mdi->index;
+	} else {
+		tis->index = ~0;
+	}
+	tis->last_t2 = -1;
+	tis->cur_bio = bio;
+	tis_get(tis);
+	t2_tis_dbg(tis, "New bio n_vects=%d\n", tis->n_vects);
+}
+
+int t2_io_prealloc(struct t2_io_state *tis, uint n_vects)
+{
+	tis->err = 0; /* reset any -ENOMEM from a previous t2_io_add */
+
+	_tis_submit_bio(tis, true, false);
+	tis->n_vects = min(n_vects ? n_vects : 1, (uint)BIO_MAX_PAGES);
+
+	t2_tis_dbg(tis, "n_vects=%d cur_bio=%p\n", tis->n_vects, tis->cur_bio);
+
+	if (!tis->cur_bio)
+		_tis_alloc(tis, NULL, GFP_NOFS);
+	return tis->err;
+}
+
+int t2_io_add(struct t2_io_state *tis, ulong t2, struct page *page)
+{
+	struct md_dev_info *mdi;
+	ulong local_t2;
+	int ret;
+
+	if (t2 > md_t2_blocks(tis->md)) {
+		zuf_err("bad t2 (0x%lx) offset\n", t2);
+		return -EFAULT;
+	}
+	get_page(page);
+
+	mdi = md_bn_t2_dev(tis->md, t2);
+	WARN_ON(!mdi);
+
+	local_t2 = md_t2_local_bn(tis->md, t2);
+	if (((local_t2 != (tis->last_t2 + 1)) && (tis->last_t2 != -1)) ||
+	   ((0 < tis->index) && (tis->index != mdi->index)))
+		_tis_submit_bio(tis, false, false);
+
+start:
+	if (!tis->cur_bio) {
+		_tis_alloc(tis, mdi, _tis_delay(tis) ? GFP_ATOMIC : GFP_NOFS);
+		if (unlikely(tis->err)) {
+			put_page(page);
+			return tis->err;
+		}
+	} else if (tis->index == ~0) {
+		/* the bio was allocated during t2_io_prealloc */
+		tis->index = mdi->index;
+		bio_set_dev(tis->cur_bio, mdi->bdev);
+	}
+
+	if (tis->last_t2 == -1)
+		tis->cur_bio->bi_iter.bi_sector = local_t2 * T2_SECTORS_PER_PAGE;
+
+	ret = bio_add_page(tis->cur_bio, page, PAGE_SIZE, 0);
+	if (unlikely(ret != PAGE_SIZE)) {
+		t2_tis_dbg(tis, "bio_add_page=>%d bi_vcnt=%d n_vects=%d\n",
+			   ret, tis->cur_bio->bi_vcnt, tis->n_vects);
+		_tis_submit_bio(tis, false, false);
+		goto start; /* device does not support tis->n_vects */
+	}
+
+	if ((tis->cur_bio->bi_vcnt == tis->n_vects) && (tis->n_vects != 1))
+		_tis_submit_bio(tis, false, false);
+
+	t2_tis_dbg(tis, "t2=0x%lx last_t2=0x%lx local_t2=0x%lx t1=0x%lx\n",
+		   t2, tis->last_t2, local_t2, md_page_to_bn(tis->md, page));
+
+	tis->last_t2 = local_t2;
+	return 0;
+}
+
+int t2_io_end(struct t2_io_state *tis, bool wait)
+{
+	if (unlikely(!tis || !tis->md))
+		return 0; /* never initialized nothing to do */
+
+	t2_tis_dbg_rw(tis, "wait=%d\n", wait);
+
+	_tis_submit_bio(tis, true, true);
+	blk_finish_plug(&tis->plug);
+
+	if (wait)
+		set_bit(B_TIS_FREE_AFTER_WAIT, &tis->rw_flags);
+	tis_put(tis);
+
+	if (wait) {
+		wait_var_event(&tis->refcount, !atomic_read(&tis->refcount));
+		if (tis->done)
+			tis->done(tis, NULL, true);
+	}
+
+	return tis->err;
+}
+
+/* ~~~~~~~ Sync read/write ~~~~~~~ TODO: Remove soon */
+static int _sync_io_page(struct multi_devices *md, int rw, ulong bn,
+			 struct page *page)
+{
+	struct t2_io_state tis;
+	int err;
+
+	t2_io_begin(md, rw, NULL, NULL, 1, &tis);
+
+	t2_tis_dbg((&tis), "bn=0x%lx p-i=0x%lx\n", bn, page->index);
+
+	err = t2_io_add(&tis, bn, page);
+	if (unlikely(err))
+		return err;
+
+	err = submit_bio_wait(tis.cur_bio);
+	if (unlikely(err)) {
+		SetPageError(page);
+		/*
+		 * We failed to write the page out to tier-2.
+		 * Print a dire warning that things will go BAD (tm)
+		 * very quickly.
+		 */
+		zuf_err("io-error bn=0x%lx => %d\n", bn, err);
+	}
+
+	/* Same as t2_io_end+_tis_bio_done but without the kref stuff */
+	blk_finish_plug(&tis.plug);
+	put_page(page);
+	if (likely(tis.cur_bio))
+		bio_put(tis.cur_bio);
+
+	return err;
+}
+
+int t2_writepage(struct multi_devices *md, ulong bn, struct page *page)
+{
+	return _sync_io_page(md, WRITE, bn, page);
+}
+
+int t2_readpage(struct multi_devices *md, ulong bn, struct page *page)
+{
+	return _sync_io_page(md, READ, bn, page);
+}
diff --git a/fs/zuf/t2.h b/fs/zuf/t2.h
new file mode 100644
index 000000000000..cbd23dd409eb
--- /dev/null
+++ b/fs/zuf/t2.h
@@ -0,0 +1,68 @@ 
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Tier-2 Header file.
+ *
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * Authors:
+ *	Boaz Harrosh <boazh@netapp.com>
+ */
+
+#ifndef __T2_H__
+#define __T2_H__
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/kref.h>
+#include "md.h"
+
+#define T2_SECTORS_PER_PAGE	(PAGE_SIZE / 512)
+
+/* t2.c */
+
+/* Sync read/write */
+int t2_writepage(struct multi_devices *md, ulong bn, struct page *page);
+int t2_readpage(struct multi_devices *md, ulong bn, struct page *page);
+
+/* Async read/write */
+struct t2_io_state;
+typedef void (*t2_io_done_fn)(struct t2_io_state *tis, struct bio *bio,
+			      bool last);
+
+struct t2_io_state {
+	atomic_t refcount; /* counts in-flight bios */
+	struct blk_plug plug;
+
+	struct multi_devices	*md;
+	int		index;
+	t2_io_done_fn	done;
+	void		*priv;
+
+	uint		n_vects;
+	ulong		rw_flags;
+	ulong		last_t2;
+	struct bio	*cur_bio;
+	struct bio_list	delayed_bios;
+	int		err;
+};
+
+/* For rw_flags above */
+/* From Kernel: WRITE		(1U << 0) */
+#define TIS_DELAY_SUBMIT	(1U << 2)
+enum {B_TIS_FREE_AFTER_WAIT = 3};
+#define TIS_FREE_AFTER_WAIT	(1U << B_TIS_FREE_AFTER_WAIT)
+#define TIS_USER_DEF_FIRST	(1U << 8)
+
+void t2_io_begin(struct multi_devices *md, int rw, t2_io_done_fn done,
+		 void *priv, uint n_vects, struct t2_io_state *tis);
+int t2_io_prealloc(struct t2_io_state *tis, uint n_vects);
+int t2_io_add(struct t2_io_state *tis, ulong t2, struct page *page);
+int t2_io_end(struct t2_io_state *tis, bool wait);
+
+/* This is done by default if t2_io_done_fn above is NULL
+ * Can also be chain-called by users.
+ */
+void t2_io_done(struct t2_io_state *tis, struct bio *bio, bool last);
+
+#endif /*def __T2_H__*/
diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c
index 95582c0a4ba5..d94c2e6d7578 100644
--- a/fs/zuf/zuf-core.c
+++ b/fs/zuf/zuf-core.c
@@ -339,6 +339,89 @@  static int _zu_numa_map(struct file *file, void *parg)
 	return err;
 }
 
+/* ~~~~ PMEM GRAB ~~~~ */
+static int zufr_find_pmem(struct zuf_root_info *zri,
+		   uint pmem_kern_id, struct zuf_pmem **pmem_md)
+{
+	struct zuf_pmem *z_pmem;
+
+	list_for_each_entry(z_pmem, &zri->pmem_list, list) {
+		if (z_pmem->pmem_id == pmem_kern_id) {
+			*pmem_md = z_pmem;
+			return 0;
+		}
+	}
+
+	return -ENODEV;
+}
+
+/*FIXME: At pmem the struct md_dev_list for t1(s) is not properly set
+ * For now we do not fix it and re-write the mdt. So just fix the one
+ * we are about to send to Server
+ */
+void _fix_numa_ids(struct multi_devices *md, struct md_dev_list *mdl)
+{
+	int i;
+
+	for (i = 0; i < md->t1_count; ++i)
+		if (md->devs[i].nid != __dev_id_nid(&mdl->dev_ids[i]))
+			__dev_id_nid_set(&mdl->dev_ids[i], md->devs[i].nid);
+}
+
+static int _zu_grab_pmem(struct file *file, void *parg)
+{
+	struct zuf_root_info *zri = ZRI(file->f_inode->i_sb);
+	struct zufs_ioc_pmem __user *arg_pmem = parg;
+	struct zufs_ioc_pmem *zi_pmem = kzalloc(sizeof(*zi_pmem), GFP_KERNEL);
+	struct zuf_pmem *pmem_md;
+	size_t pmem_size;
+	int err;
+
+	if (unlikely(!zi_pmem))
+		return -ENOMEM;
+
+	err = get_user(zi_pmem->pmem_kern_id, &arg_pmem->pmem_kern_id);
+	if (err) {
+		zuf_err("\n");
+		goto out;
+	}
+
+	err = zufr_find_pmem(zri, zi_pmem->pmem_kern_id, &pmem_md);
+	if (err) {
+		zuf_err("!!! pmem_kern_id=%d not found\n",
+			zi_pmem->pmem_kern_id);
+		goto out;
+	}
+
+	if (pmem_md->hdr.file) {
+		zuf_err("[%u] pmem already taken\n", zi_pmem->pmem_kern_id);
+		err = -EIO;
+		goto out;
+	}
+
+	memcpy(&zi_pmem->mdt, md_zdt(&pmem_md->md), sizeof(zi_pmem->mdt));
+	_fix_numa_ids(&pmem_md->md, &zi_pmem->mdt.s_dev_list);
+
+	pmem_size = md_p2o(md_t1_blocks(&pmem_md->md));
+	if (mdt_test_option(md_zdt(&pmem_md->md), MDT_F_SHADOW))
+		pmem_size += pmem_size;
+	i_size_write(file->f_inode, pmem_size);
+	pmem_md->hdr.type = zlfs_e_pmem;
+	pmem_md->hdr.file = file;
+	file->private_data = &pmem_md->hdr;
+	zuf_dbg_core("pmem %d i_size=0x%llx GRABED %s\n",
+		     zi_pmem->pmem_kern_id, i_size_read(file->f_inode),
+		     _bdev_name(md_t1_dev(&pmem_md->md, 0)->bdev));
+
+out:
+	zi_pmem->hdr.err = err;
+	err = copy_to_user(parg, zi_pmem, sizeof(*zi_pmem));
+	if (err)
+		zuf_err("=>%d\n", err);
+	kfree(zi_pmem);
+	return err;
+}
+
 static int _map_pages(struct zufc_thread *zt, struct page **pages, uint nump,
 		      bool map_readonly)
 {
@@ -822,6 +905,8 @@  long zufc_ioctl(struct file *file, unsigned int cmd, ulong arg)
 		return _zu_mount(file, parg);
 	case ZU_IOC_NUMA_MAP:
 		return _zu_numa_map(file, parg);
+	case ZU_IOC_GRAB_PMEM:
+		return _zu_grab_pmem(file, parg);
 	case ZU_IOC_INIT_THREAD:
 		return _zu_init(file, parg);
 	case ZU_IOC_WAIT_OPT:
@@ -1065,6 +1150,8 @@  int zufc_mmap(struct file *file, struct vm_area_struct *vma)
 	switch (zsf->type) {
 	case zlfs_e_zt:
 		return zufc_zt_mmap(file, vma);
+	case zlfs_e_pmem:
+		return zuf_pmem_mmap(file, vma);
 	case zlfs_e_dpp_buff:
 		return zufc_ebuff_mmap(file, vma);
 	default:
diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h
index a33f5908155d..0689f2031ec7 100644
--- a/fs/zuf/zuf.h
+++ b/fs/zuf/zuf.h
@@ -29,6 +29,8 @@ 
 
 #include "relay.h"
 #include "_pr.h"
+#include "md.h"
+#include "t2.h"
 
 enum zlfs_e_special_file {
 	zlfs_e_zt = 1,
@@ -42,6 +44,14 @@  struct zuf_special_file {
 	struct file *file;
 };
 
+/* Our special md structure */
+struct zuf_pmem {
+	struct multi_devices md; /* must be first */
+	struct list_head list;
+	struct zuf_special_file hdr;
+	uint pmem_id;
+};
+
 /* This is the zuf-root.c mini filesystem */
 struct zuf_root_info {
 	struct __mount_thread_info {
@@ -94,6 +104,35 @@  static inline void zuf_add_fs_type(struct zuf_root_info *zri,
 	list_add(&zft->list, &zri->fst_list);
 }
 
+static inline void zuf_add_pmem(struct zuf_root_info *zri,
+				   struct multi_devices *md)
+{
+	struct zuf_pmem *z_pmem = (void *)md;
+
+	z_pmem->pmem_id = ++zri->next_pmem_id; /* Avoid 0 id */
+
+	/* Unlocked for now only one mount-thread with zus */
+	INIT_LIST_HEAD(&z_pmem->list);
+	list_add(&z_pmem->list, &zri->pmem_list);
+}
+
+static inline void zuf_rm_pmem(struct multi_devices *md)
+{
+	struct zuf_pmem *z_pmem = (void *)md;
+
+	if (z_pmem->pmem_id) /* We avoided 0 id */
+		list_del_init(&z_pmem->list);
+}
+
+static inline uint zuf_pmem_id(struct multi_devices *md)
+{
+	struct zuf_pmem *z_pmem = container_of(md, struct zuf_pmem, md);
+
+	return z_pmem->pmem_id;
+}
+
+// void zuf_del_fs_type(struct zuf_root_info *zri, struct zuf_fs_type *zft);
+
 /*
  * ZUF per-inode data in memory
  */
diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h
index 3319a70b5ccc..e0c439b6c8e9 100644
--- a/fs/zuf/zus_api.h
+++ b/fs/zuf/zus_api.h
@@ -19,6 +19,8 @@ 
 #include <stddef.h>
 #include <linux/statfs.h>
 
+#include "md_def.h"
+
 /*
  * Version rules:
  *   This is the zus-to-zuf API version. And not the Filesystem
@@ -74,6 +76,10 @@ 
  */
 #define EZUF_RETRY_DONE 540
 
+
+/* All device sizes offsets must align on 2M */
+#define ZUFS_ALLOC_MASK		(1024 * 1024 * 2 - 1)
+
 /**
  * zufs dual port memory
  * This is a special type of offset to either memory or persistent-memory,
@@ -221,6 +227,18 @@  struct zufs_ioc_numa_map {
 };
 #define ZU_IOC_NUMA_MAP	_IOWR('Z', 12, struct zufs_ioc_numa_map)
 
+struct zufs_ioc_pmem {
+	/* Set by zus */
+	struct zufs_ioc_hdr hdr;
+	__u32 pmem_kern_id;
+
+	/* Returned to zus */
+	struct md_dev_table mdt;
+
+};
+/* GRAB is never ungrabbed; umount or file close cleans it all */
+#define ZU_IOC_GRAB_PMEM	_IOWR('Z', 13, struct zufs_ioc_pmem)
+
 /* ZT init */
 enum { ZUFS_MAX_ZT_CHANNELS = 64 };