diff mbox

[RFC,v2,08/83] NOVA superblock operations.

Message ID 1520705944-6723-9-git-send-email-jix024@eng.ucsd.edu (mailing list archive)
State Changes Requested
Headers show

Commit Message

Andiry Xu March 10, 2018, 6:17 p.m. UTC
From: Andiry Xu <jix024@cs.ucsd.edu>

This is the entry point for NOVA filesystem mount and umount.
NOVA works on DAX devices. During initialization it gets the
device information, such as physical/virtual addresses and device size.
It does not access the DAX device during runtime.

During initialization NOVA also initializes the root inode.
The root inode is a reserved inode and resides at a fixed location.

The way to mount and initialize a NOVA instance is:

mount -t NOVA -o init /dev/pmem0 /mnt/NOVA

This creates a NOVA instance on /dev/pmem0 and mounts it on /mnt/NOVA.
Currently it cannot do anything except mount and umount.

Signed-off-by: Andiry Xu <jix024@cs.ucsd.edu>
---
 fs/nova/super.c | 630 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 630 insertions(+)
 create mode 100644 fs/nova/super.c
diff mbox

Patch

diff --git a/fs/nova/super.c b/fs/nova/super.c
new file mode 100644
index 0000000..552fe5d
--- /dev/null
+++ b/fs/nova/super.c
@@ -0,0 +1,630 @@ 
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/magic.h>
+#include <linux/exportfs.h>
+#include <linux/random.h>
+#include <linux/cred.h>
+#include <linux/list.h>
+#include <linux/dax.h>
+#include "nova.h"
+#include "super.h"
+
+/* Set to 1 by init_nova_fs() when arch_has_clwb() reports CLWB support. */
+int support_clwb;
+
+/*
+ * Debug output mask. Settable at module load time and through the
+ * "dbgmask=" mount option; reset to 0 by nova_put_super().
+ *
+ * FIXME: should the following variable be one per NOVA instance?
+ */
+unsigned int nova_dbgmask;
+
+/* The parameter type has to match the variable type (unsigned int). */
+module_param(nova_dbgmask, uint, 0444);
+MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
+
+/* Defined near the end of this file. */
+static struct super_operations nova_sops;
+
+/* Slab cache backing struct nova_inode_info allocations. */
+static struct kmem_cache *nova_inode_cachep;
+
+/*
+ * Report a NOVA error and apply the "errors=" mount policy.
+ *
+ * @sb:  super block the error belongs to; its mount options are consulted.
+ * @fmt: printf-style format string, followed by its arguments.
+ *
+ * The message is logged at KERN_CRIT. With ERRORS_PANIC set the kernel
+ * panics; with ERRORS_RO set the super block is flagged read-only.
+ */
+void nova_error_mng(struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	/*
+	 * Emit prefix and message in a single printk() so both end up in one
+	 * log record at KERN_CRIT instead of two unrelated records.
+	 */
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CRIT "nova error: %pV", &vaf);
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("nova: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		printk(KERN_CRIT "nova err: remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+/*
+ * Record the block size in the VFS super block, both as a byte count and
+ * as a shift.
+ *
+ * The caller has already validated @size: it lies between
+ * NOVA_MIN_BLOCK_SIZE and NOVA_MAX_BLOCK_SIZE and is a power of 2, so the
+ * index of its highest set bit is exactly log2(size).
+ */
+static void nova_set_blocksize(struct super_block *sb, unsigned long size)
+{
+	const int shift = fls(size) - 1;
+
+	sb->s_blocksize = (1 << shift);
+	sb->s_blocksize_bits = shift;
+}
+
+/*
+ * Look up the DAX device behind @sb and record its mapping in @sbi.
+ *
+ * On success the following @sbi fields are filled in: s_bdev, s_dax_dev,
+ * virt_addr, phys_addr, initsize, replica_reserved_inodes_addr and
+ * replica_sb_addr. A reference on the dax_device is held in
+ * sbi->s_dax_dev; it is dropped again here if a later step fails.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int nova_get_nvmm_info(struct super_block *sb,
+	struct nova_sb_info *sbi)
+{
+	void *virt_addr = NULL;
+	pfn_t pfn;
+	long nr_pages;
+	long size;
+	struct dax_device *dax_dev;
+	int ret;
+	int id;
+
+	ret = bdev_dax_supported(sb, PAGE_SIZE);
+	nova_dbg_verbose("%s: dax_supported = %d; bdev->super=0x%p",
+			 __func__, ret, sb->s_bdev->bd_super);
+	if (ret) {
+		nova_err(sb, "device does not support DAX\n");
+		return ret;
+	}
+
+	sbi->s_bdev = sb->s_bdev;
+
+	dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name);
+	if (!dax_dev) {
+		nova_err(sb, "Couldn't retrieve DAX device.\n");
+		return -EINVAL;
+	}
+	sbi->s_dax_dev = dax_dev;
+
+	/* dax_direct_access() must be called under the DAX read lock. */
+	id = dax_read_lock();
+	nr_pages = dax_direct_access(sbi->s_dax_dev, 0, LONG_MAX/PAGE_SIZE,
+				     &virt_addr, &pfn);
+	dax_read_unlock(id);
+
+	/* Check the raw result before scaling a possible errno to bytes. */
+	if (nr_pages <= 0) {
+		nova_err(sb, "direct_access failed\n");
+		ret = -EINVAL;
+		goto out_put;
+	}
+	size = nr_pages * PAGE_SIZE;
+
+	sbi->virt_addr = virt_addr;
+
+	if (!sbi->virt_addr) {
+		nova_err(sb, "ioremap of the nova image failed(1)\n");
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	/* pfn_t_to_phys() widens to phys_addr_t before shifting. */
+	sbi->phys_addr = pfn_t_to_phys(pfn);
+	sbi->initsize = size;
+	/* The replicas live in the reserved blocks at the end of the device. */
+	sbi->replica_reserved_inodes_addr = virt_addr + size -
+			(sbi->tail_reserved_blocks << PAGE_SHIFT);
+	sbi->replica_sb_addr = virt_addr + size - PAGE_SIZE;
+
+	nova_dbg("%s: dev %s, phys_addr 0x%llx, virt_addr %p, size %ld\n",
+		__func__, sbi->s_bdev->bd_disk->disk_name,
+		sbi->phys_addr, sbi->virt_addr, sbi->initsize);
+
+	return 0;
+
+out_put:
+	/* Drop the reference taken by fs_dax_get_by_host(). */
+	fs_put_dax(dax_dev);
+	sbi->s_dax_dev = NULL;
+	return ret;
+}
+
+/*
+ * Return the largest file size NOVA reports to the VFS: 2^63 - 1 bytes,
+ * capped at MAX_LFS_FILESIZE. @bits is currently not used.
+ */
+static loff_t nova_max_size(int bits)
+{
+	const loff_t nova_limit = (1ULL << 63) - 1;
+	loff_t res;
+
+	res = (nova_limit > MAX_LFS_FILESIZE) ? MAX_LFS_FILESIZE : nova_limit;
+
+	nova_dbg_verbose("max file size %llu bytes\n", res);
+	return res;
+}
+
+/* Mount option identifiers; each one is paired with a pattern in tokens[]. */
+enum {
+	Opt_bpi,	/* "bpi=%u" */
+	Opt_init,	/* "init" */
+	Opt_mode,	/* "mode=%o" */
+	Opt_uid,	/* "uid=%u" */
+	Opt_gid,	/* "gid=%u" */
+	Opt_dax,	/* "dax" */
+	Opt_err_cont,	/* "errors=continue" */
+	Opt_err_panic,	/* "errors=panic" */
+	Opt_err_ro,	/* "errors=remount-ro" */
+	Opt_dbgmask,	/* "dbgmask=%u" */
+	Opt_err		/* table terminator, no pattern */
+};
+
+/*
+ * Pattern table handed to match_token() by nova_parse_options(). Each row
+ * maps a mount option pattern to its Opt_* identifier; the final row with
+ * a NULL pattern terminates the table.
+ */
+static const match_table_t tokens = {
+	{ Opt_bpi,	     "bpi=%u"		  },
+	{ Opt_init,	     "init"		  },
+	{ Opt_mode,	     "mode=%o"		  },
+	{ Opt_uid,	     "uid=%u"		  },
+	{ Opt_gid,	     "gid=%u"		  },
+	{ Opt_dax,	     "dax"		  },
+	{ Opt_err_cont,	     "errors=continue"	  },
+	{ Opt_err_panic,     "errors=panic"	  },
+	{ Opt_err_ro,	     "errors=remount-ro"  },
+	{ Opt_dbgmask,	     "dbgmask=%u"	  },
+	{ Opt_err,	     NULL		  },
+};
+
+/*
+ * Parse the comma-separated mount option string and record the result in
+ * @sbi (and in the global nova_dbgmask for "dbgmask=").
+ *
+ * @options: option string, split in place by strsep(); may be NULL.
+ * @sbi:     per-superblock info to update.
+ * @remount: when true, "init" is rejected, "bpi=" is rejected if a bpi is
+ *           already set, and "uid=" must equal the uid already in @sbi.
+ *
+ * Returns 0 on success, or -EINVAL for an unknown option or a bad value.
+ */
+static int nova_parse_options(char *options, struct nova_sb_info *sbi,
+			       bool remount)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		/* Skip empty fields such as the one between ",," */
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_bpi:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			if (remount && sbi->bpi)
+				goto bad_opt;
+			sbi->bpi = option;
+			break;
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			uid = make_kuid(current_user_ns(), option);
+			/* The id may have no mapping in the caller's namespace */
+			if (!uid_valid(uid))
+				goto bad_val;
+			if (remount && !uid_eq(sbi->uid, uid))
+				goto bad_opt;
+			sbi->uid = uid;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			gid = make_kgid(current_user_ns(), option);
+			/* The id may have no mapping in the caller's namespace */
+			if (!gid_valid(gid))
+				goto bad_val;
+			sbi->gid = gid;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				goto bad_val;
+			/* Keep only the permission bits and the sticky bit */
+			sbi->mode = option & 01777U;
+			break;
+		case Opt_init:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, FORMAT);
+			break;
+		case Opt_err_panic:
+			/* The three errors= policies are mutually exclusive */
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_dax:
+			set_opt(sbi->s_mount_opt, DAX);
+			break;
+		case Opt_dbgmask:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			nova_dbgmask = option;
+			break;
+		default:
+			goto bad_opt;
+		}
+	}
+
+	return 0;
+
+bad_val:
+	nova_info("Bad value '%s' for mount option '%s'\n", args[0].from,
+	       p);
+	return -EINVAL;
+bad_opt:
+	nova_info("Bad mount option: \"%s\"\n", p);
+	return -EINVAL;
+}
+
+
+/*
+ * Check that a device of @size bytes can hold a NOVA instance.
+ *
+ * The minimum is the head and tail reserved blocks plus one more block,
+ * in units of the block size already stored in @sb (space for the super
+ * block and the root directory).
+ *
+ * Returns true when @size is large enough, false otherwise.
+ */
+static bool nova_check_size(struct super_block *sb, unsigned long size)
+{
+	unsigned long needed;
+
+	needed = (HEAD_RESERVED_BLOCKS + TAIL_RESERVED_BLOCKS + 1)
+			  << sb->s_blocksize_bits;
+
+	return size >= needed;
+}
+
+/*
+ * Write the in-DRAM super block (sbi->nova_sb) to both on-media copies.
+ *
+ * The primary copy is written and followed by a barrier before the
+ * redundant copy is written, which is followed by a second barrier.
+ */
+static inline void nova_sync_super(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_super_block *primary = nova_get_super(sb);
+	struct nova_super_block *replica = nova_get_redund_super(sb);
+
+	/* Primary super block first */
+	memcpy_to_pmem_nocache((void *)primary, (void *)sbi->nova_sb,
+		sizeof(struct nova_super_block));
+	PERSISTENT_BARRIER();
+
+	/* Then the redundant copy */
+	memcpy_to_pmem_nocache((void *)replica, (void *)sbi->nova_sb,
+		sizeof(struct nova_super_block));
+	PERSISTENT_BARRIER();
+}
+
+/*
+ * Create an empty NOVA instance of @size bytes on the device behind @sb.
+ *
+ * Sets the block size to 4K, clears the head reserved blocks, marks the
+ * block-node reserved inode, writes both super block copies and fills in
+ * the root directory inode.
+ *
+ * Returns the root inode on success, or ERR_PTR(-EINVAL) when @size is
+ * too small for a NOVA instance.
+ */
+static struct nova_inode *nova_init(struct super_block *sb,
+				      unsigned long size)
+{
+	unsigned long blocksize;
+	struct nova_inode *root_i, *pi;
+	struct nova_super_block *super;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	nova_info("creating an empty nova of size %lu\n", size);
+	sbi->num_blocks = ((unsigned long)(size) >> PAGE_SHIFT);
+
+	nova_dbgv("nova: Default block size set to 4K\n");
+	sbi->blocksize = blocksize = NOVA_DEF_BLOCK_SIZE_4K;
+	nova_set_blocksize(sb, sbi->blocksize);
+
+	/* Needs the block size set above to compute the minimum size */
+	if (!nova_check_size(sb, size)) {
+		nova_warn("Specified NOVA size too small 0x%lx.\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	nova_dbgv("max file name len %d\n", (unsigned int)NOVA_NAME_LEN);
+
+	super = nova_get_super(sb);
+
+	/* clear out super-block and inode table */
+	memset_nt(super, 0, sbi->head_reserved_blocks * sbi->blocksize);
+
+	pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
+	/* Stored little-endian, same as the root inode's nova_ino below */
+	pi->nova_ino = cpu_to_le64(NOVA_BLOCKNODE_INO);
+	nova_flush_buffer(pi, CACHELINE_SIZE, 1);
+
+	sbi->nova_sb->s_size = cpu_to_le64(size);
+	sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize);
+	sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC);
+	sbi->nova_sb->s_epoch_id = 0;
+
+	/* Copy the DRAM super block to both on-media locations */
+	nova_sync_super(sb);
+
+	root_i = nova_get_inode_by_ino(sb, NOVA_ROOT_INO);
+	nova_dbgv("%s: Allocate root inode @ 0x%p\n", __func__, root_i);
+
+	root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR);
+	root_i->i_uid = cpu_to_le32(from_kuid(&init_user_ns, sbi->uid));
+	root_i->i_gid = cpu_to_le32(from_kgid(&init_user_ns, sbi->gid));
+	root_i->i_links_count = cpu_to_le16(2);
+	root_i->i_blk_type = NOVA_BLOCK_TYPE_4K;
+	root_i->i_flags = 0;
+	root_i->i_size = cpu_to_le64(sb->s_blocksize);
+	root_i->i_atime = root_i->i_mtime = root_i->i_ctime =
+		cpu_to_le32(get_seconds());
+	root_i->nova_ino = cpu_to_le64(NOVA_ROOT_INO);
+	root_i->valid = 1;
+
+	nova_flush_buffer(root_i, sizeof(*root_i), false);
+
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	nova_info("NOVA initialization finish\n");
+	return root_i;
+}
+
+/*
+ * Fill @sbi with the defaults that apply before mount options are parsed:
+ * reserved block counts, the online CPU count, and the HUGEIOREMAP and
+ * ERRORS_CONT mount flags.
+ */
+static inline void set_default_opts(struct nova_sb_info *sbi)
+{
+	/* Layout and CPU defaults */
+	sbi->head_reserved_blocks = HEAD_RESERVED_BLOCKS;
+	sbi->tail_reserved_blocks = TAIL_RESERVED_BLOCKS;
+	sbi->cpus = num_online_cpus();
+
+	/* Default mount flags */
+	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
+	set_opt(sbi->s_mount_opt, ERRORS_CONT);
+}
+
+/*
+ * Sanity-check the on-media root inode. Only warns when its mode does not
+ * describe a directory; nothing is returned to the caller.
+ */
+static void nova_root_check(struct super_block *sb, struct nova_inode *root_pi)
+{
+	const umode_t root_mode = le16_to_cpu(root_pi->i_mode);
+
+	if (S_ISDIR(root_mode))
+		return;
+
+	nova_warn("root is not a directory!\n");
+}
+
+/*
+ * Fill in @sb for a NOVA mount; this is the callback given to mount_bdev().
+ *
+ * @sb:     super block to set up.
+ * @data:   mount option string, passed on to nova_parse_options().
+ * @silent: not used here.
+ *
+ * Allocates the nova_sb_info, maps the DAX device, parses the options,
+ * formats a new instance when "init" was given, and installs the root
+ * dentry.
+ *
+ * Returns 0 on success or a negative errno. On failure everything
+ * allocated here is released and sb->s_fs_info is cleared.
+ */
+static int nova_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct nova_sb_info *sbi = NULL;
+	struct nova_inode *root_pi;
+	struct inode *root_i = NULL;
+	unsigned long blocksize;
+	u32 random = 0;
+	int retval = -EINVAL;
+
+	BUILD_BUG_ON(sizeof(struct nova_super_block) > NOVA_SB_SIZE);
+
+	sbi = kzalloc(sizeof(struct nova_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sbi->nova_sb = kzalloc(sizeof(struct nova_super_block), GFP_KERNEL);
+	if (!sbi->nova_sb) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
+
+	sb->s_fs_info = sbi;
+	sbi->sb = sb;
+
+	/* Also sets the HUGEIOREMAP and ERRORS_CONT mount flags */
+	set_default_opts(sbi);
+
+	/* Currently the log page supports 64 journal pointer pairs */
+	if (sbi->cpus > MAX_CPUS) {
+		nova_err(sb, "NOVA needs more log pointer pages to support more than "
+			  __stringify(MAX_CPUS) " cpus.\n");
+		goto out;
+	}
+
+	retval = nova_get_nvmm_info(sb, sbi);
+	if (retval) {
+		nova_err(sb, "%s: Failed to get nvmm info.",
+			 __func__);
+		goto out;
+	}
+
+	get_random_bytes(&random, sizeof(u32));
+	atomic_set(&sbi->next_generation, random);
+
+	/* Init with default values */
+	sbi->mode = (0755);
+	sbi->uid = current_fsuid();
+	sbi->gid = current_fsgid();
+
+	mutex_init(&sbi->s_lock);
+
+	sbi->zeroed_page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!sbi->zeroed_page) {
+		retval = -ENOMEM;
+		nova_dbg("%s: sbi->zeroed_page failed.",
+			 __func__);
+		goto out;
+	}
+
+	retval = nova_parse_options(data, sbi, false);
+	if (retval) {
+		nova_err(sb, "%s: Failed to parse nova command line options.",
+			 __func__);
+		goto out;
+	}
+
+	/* Init a new nova instance */
+	if (sbi->s_mount_opt & NOVA_MOUNT_FORMAT) {
+		root_pi = nova_init(sb, sbi->initsize);
+		if (IS_ERR(root_pi)) {
+			/*
+			 * retval is 0 here from the option parsing above;
+			 * without this the mount would report success.
+			 */
+			retval = PTR_ERR(root_pi);
+			nova_err(sb, "%s: root_pi error.",
+				 __func__);
+
+			goto out;
+		}
+		goto setup_sb;
+	}
+
+	/*
+	 * NOTE(review): sbi->nova_sb is only zero-allocated above and is
+	 * filled in by nova_init() on the format path; nothing in this
+	 * function loads it from the device for a plain mount, so
+	 * s_blocksize and s_magic read as 0 here - confirm where the
+	 * on-media super block is meant to be read.
+	 */
+	blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize);
+	nova_set_blocksize(sb, blocksize);
+
+	nova_dbg_verbose("blocksize %lu\n", blocksize);
+
+	/* Read the root inode */
+	root_pi = nova_get_inode_by_ino(sb, NOVA_ROOT_INO);
+
+	/* Check that the root inode is in a sane state */
+	nova_root_check(sb, root_pi);
+
+	/* Set it all up.. */
+setup_sb:
+	sb->s_magic = le32_to_cpu(sbi->nova_sb->s_magic);
+	sb->s_op = &nova_sops;
+	sb->s_maxbytes = nova_max_size(sb->s_blocksize_bits);
+	sb->s_time_gran = 1000000000; // 1 second.
+	sb->s_xattr = NULL;
+	sb->s_flags |= MS_NOSEC;
+
+	root_i = nova_iget(sb, NOVA_ROOT_INO);
+	if (IS_ERR(root_i)) {
+		retval = PTR_ERR(root_i);
+		nova_err(sb, "%s: failed to get root inode",
+			 __func__);
+
+		goto out;
+	}
+
+	sb->s_root = d_make_root(root_i);
+	if (!sb->s_root) {
+		nova_err(sb, "get nova root inode failed\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	/* Drop the DAX device reference, if one is still held (NULL is fine) */
+	fs_put_dax(sbi->s_dax_dev);
+	sbi->s_dax_dev = NULL;
+
+	kfree(sbi->zeroed_page);
+	sbi->zeroed_page = NULL;
+
+	kfree(sbi->nova_sb);
+	kfree(sbi);
+	/* Do not leave a pointer to the freed sbi in the super block */
+	sb->s_fs_info = NULL;
+	nova_dbg("%s failed: return %d\n", __func__, retval);
+	return retval;
+}
+
+/*
+ * Release everything nova_fill_super() attached to @sb: the DAX device
+ * reference, the zeroed page, the DRAM super block copy and the
+ * nova_sb_info itself. Also resets the global nova_dbgmask to 0.
+ */
+static void nova_put_super(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	sbi->virt_addr = NULL;
+
+	/* Drop the reference taken by fs_dax_get_by_host() at mount time */
+	fs_put_dax(sbi->s_dax_dev);
+	sbi->s_dax_dev = NULL;
+
+	kfree(sbi->zeroed_page);
+	nova_dbgmask = 0;
+
+	kfree(sbi->nova_sb);
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+}
+
+/*
+ * super_operations.alloc_inode: allocate a nova_inode_info from the inode
+ * cache and hand back its embedded VFS inode, or NULL if the allocation
+ * fails.
+ */
+static struct inode *nova_alloc_inode(struct super_block *sb)
+{
+	struct nova_inode_info *info;
+
+	info = kmem_cache_alloc(nova_inode_cachep, GFP_NOFS);
+
+	return info ? &info->vfs_inode : NULL;
+}
+
+/*
+ * RCU callback queued by nova_destroy_inode(): returns the nova_inode_info
+ * that contains the inode to the inode cache.
+ */
+static void nova_i_callback(struct rcu_head *head)
+{
+	struct inode *vfs_inode = container_of(head, struct inode, i_rcu);
+
+	nova_dbg_verbose("%s: ino %lu\n", __func__, vfs_inode->i_ino);
+	kmem_cache_free(nova_inode_cachep, NOVA_I(vfs_inode));
+}
+
+/*
+ * super_operations.destroy_inode: the inode is not freed here; the free
+ * is deferred to nova_i_callback() through call_rcu().
+ */
+static void nova_destroy_inode(struct inode *inode)
+{
+	nova_dbgv("%s: %lu\n", __func__, inode->i_ino);
+	call_rcu(&inode->i_rcu, nova_i_callback);
+}
+
+/*
+ * Constructor passed to kmem_cache_create() for the inode cache: @foo is a
+ * struct nova_inode_info whose embedded VFS inode gets its one-time
+ * initialization.
+ */
+static void init_once(void *foo)
+{
+	struct nova_inode_info *vi = foo;
+
+	inode_init_once(&vi->vfs_inode);
+}
+
+/*
+ * Create the slab cache used for struct nova_inode_info objects.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+static int __init init_inodecache(void)
+{
+	nova_inode_cachep = kmem_cache_create("nova_inode_cache",
+					       sizeof(struct nova_inode_info),
+					       0, (SLAB_RECLAIM_ACCOUNT |
+						   SLAB_MEM_SPREAD), init_once);
+
+	return nova_inode_cachep ? 0 : -ENOMEM;
+}
+
+/* Tear down the inode slab cache created by init_inodecache(). */
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before
+	 * we destroy cache. Inodes are freed from nova_i_callback(),
+	 * which is queued with call_rcu().
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(nova_inode_cachep);
+}
+
+
+/*
+ * Super block operations installed on sb->s_op by nova_fill_super().
+ *
+ * The super block writes are all done "on the fly", so the
+ * super block is never in a "dirty" state, so there's no need
+ * for write_super.
+ *
+ * Only inode allocation/destruction and put_super are provided here.
+ */
+static struct super_operations nova_sops = {
+	.alloc_inode	= nova_alloc_inode,
+	.destroy_inode	= nova_destroy_inode,
+	.put_super	= nova_put_super,
+};
+
+/*
+ * file_system_type.mount: hand the request to mount_bdev(), which calls
+ * nova_fill_super() to set up the super block.
+ */
+static struct dentry *nova_mount(struct file_system_type *fs_type,
+				  int flags, const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, nova_fill_super);
+}
+
+/*
+ * Filesystem type registered by init_nova_fs(); "NOVA" is the name given
+ * to mount -t.
+ */
+static struct file_system_type nova_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "NOVA",
+	.mount		= nova_mount,
+	.kill_sb	= kill_block_super,
+	/* Mounted through mount_bdev(): a backing block device is required */
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+/*
+ * Module entry point: record whether CLWB is available, create the inode
+ * cache and register the filesystem type.
+ *
+ * Returns 0 on success or the error from the failing step; the inode
+ * cache is destroyed again if registration fails.
+ */
+static int __init init_nova_fs(void)
+{
+	int err;
+
+	nova_dbg("%s: %d cpus online\n", __func__, num_online_cpus());
+	if (arch_has_clwb())
+		support_clwb = 1;
+
+	nova_info("Arch new instructions support: CLWB %s\n",
+			support_clwb ? "YES" : "NO");
+
+	err = init_inodecache();
+	if (err)
+		return err;
+
+	err = register_filesystem(&nova_fs_type);
+	if (err)
+		destroy_inodecache();
+
+	return err;
+}
+
+/*
+ * Module exit: undo init_nova_fs() in reverse order - unregister the
+ * filesystem type first, then destroy the inode cache.
+ */
+static void __exit exit_nova_fs(void)
+{
+	unregister_filesystem(&nova_fs_type);
+	destroy_inodecache();
+}
+
+/* Module metadata and entry/exit point registration. */
+MODULE_AUTHOR("Andiry Xu <jix024@cs.ucsd.edu>");
+MODULE_DESCRIPTION("NOVA: NOn-Volatile memory Accelerated File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_nova_fs)
+module_exit(exit_nova_fs)