diff mbox series

[2/4] btrfs: add numdevs= mount option.

Message ID 162848132773.25823.8504921416553051353.stgit@noble.brown (mailing list archive)
State New, archived
Headers show
Series Attempt to make progress with btrfs dev number strangeness. | expand

Commit Message

NeilBrown Aug. 9, 2021, 3:55 a.m. UTC
btrfs currently allocates multiple anonymous bdev numbers to hide the
fact that inode numbers are not unique across "subvolumes".
Each subvol gets a different device number.

As described in a previous patch, this is incomplete, doesn't scale, and
should be deprecated.  This patch is another step to deprecation.

With mount option "-o numdevs=many", which is the default, the current
behaviour is preserved.

With mount option "-o numdevs=1", the st_dev reported by stat() is
exactly the number that appears in /proc/$PID/mountinfo (i.e.
sb->s_dev).  This will prevent "du -x", "find -xdev" and similar tools
from keeping within a subvol, but is otherwise quite functional.

If numdevs=1 and inumbits=0, then there will often be inode number
reuse, so that combination is forbidden and the default fo inumbits
changes to BITS_PER_LONG*7/8.  With larger inumbits (close to
BITS_PER_LONG), inode number reuse is still possible, but only with
large or old filesystems.

With mount option "-o numdevs=2", precisely two anon device numbers are
allocated.  Each subvol gets the number that its parent isn't using.
When subvols are moved, the device number reported will change if needed
to differentiate from its parent.
If a subvol with dependent subvols is moved and the device numbers need
to change, the numbers in dependent subvols that are currently in cache
will NOT change.  Fixing this is a stretch goal.

Using numdevs=2 removes any problems with exhausting the number of
available anon devs, and preserves the functionality of "du -x" and
similar.  It may be a useful option for sites that experience exhaustion
problems.

numdevs=1 is, at this stage, most useful for exploring the consequences
of fully deprecating the use of multiple device numbers.  It may also be
useful for site that find they have no dependency on multiple device
numbers.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/btrfs/ctree.h   |   17 +++++++++++++++--
 fs/btrfs/disk-io.c |   24 +++++++++++++++++++++---
 fs/btrfs/inode.c   |   29 ++++++++++++++++++++++++++++-
 fs/btrfs/ioctl.c   |    6 ++++--
 fs/btrfs/super.c   |   30 ++++++++++++++++++++++++++++++
 5 files changed, 98 insertions(+), 8 deletions(-)

Comments

kernel test robot Aug. 9, 2021, 7:50 a.m. UTC | #1
Hi NeilBrown,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kdave/for-next]
[also build test WARNING on ext3/fsnotify linus/master v5.14-rc5 next-20210806]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/Attempt-to-make-progress-with-btrfs-dev-number-strangeness/20210809-120046
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
config: x86_64-randconfig-c001-20210809 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c5c3cdb9c92895a63993cee70d2dd776ff9519c3)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/c5bae87ed5b72b9fd999fa935f477483da001f63
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review NeilBrown/Attempt-to-make-progress-with-btrfs-dev-number-strangeness/20210809-120046
        git checkout c5bae87ed5b72b9fd999fa935f477483da001f63
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> fs/btrfs/ioctl.c:740:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (fs_info->num_devs == BTRFS_MANY_DEVS)
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/btrfs/ioctl.c:742:6: note: uninitialized use occurs here
           if (ret < 0)
               ^~~
   fs/btrfs/ioctl.c:740:2: note: remove the 'if' if its condition is always true
           if (fs_info->num_devs == BTRFS_MANY_DEVS)
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/btrfs/ioctl.c:725:9: note: initialize the variable 'ret' to silence this warning
           int ret;
                  ^
                   = 0
   1 warning generated.


vim +740 fs/btrfs/ioctl.c

   716	
   717	static int create_snapshot(struct btrfs_root *root, struct inode *dir,
   718				   struct dentry *dentry, bool readonly,
   719				   struct btrfs_qgroup_inherit *inherit)
   720	{
   721		struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
   722		struct inode *inode;
   723		struct btrfs_pending_snapshot *pending_snapshot;
   724		struct btrfs_trans_handle *trans;
   725		int ret;
   726	
   727		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
   728			return -EINVAL;
   729	
   730		if (atomic_read(&root->nr_swapfiles)) {
   731			btrfs_warn(fs_info,
   732				   "cannot snapshot subvolume with active swapfile");
   733			return -ETXTBSY;
   734		}
   735	
   736		pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
   737		if (!pending_snapshot)
   738			return -ENOMEM;
   739	
 > 740		if (fs_info->num_devs == BTRFS_MANY_DEVS)
   741			ret = get_anon_bdev(&pending_snapshot->anon_dev);
   742		if (ret < 0)
   743			goto free_pending;
   744		pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
   745				GFP_KERNEL);
   746		pending_snapshot->path = btrfs_alloc_path();
   747		if (!pending_snapshot->root_item || !pending_snapshot->path) {
   748			ret = -ENOMEM;
   749			goto free_pending;
   750		}
   751	
   752		btrfs_init_block_rsv(&pending_snapshot->block_rsv,
   753				     BTRFS_BLOCK_RSV_TEMP);
   754		/*
   755		 * 1 - parent dir inode
   756		 * 2 - dir entries
   757		 * 1 - root item
   758		 * 2 - root ref/backref
   759		 * 1 - root of snapshot
   760		 * 1 - UUID item
   761		 */
   762		ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
   763						&pending_snapshot->block_rsv, 8,
   764						false);
   765		if (ret)
   766			goto free_pending;
   767	
   768		pending_snapshot->dentry = dentry;
   769		pending_snapshot->root = root;
   770		pending_snapshot->readonly = readonly;
   771		pending_snapshot->dir = dir;
   772		pending_snapshot->inherit = inherit;
   773	
   774		trans = btrfs_start_transaction(root, 0);
   775		if (IS_ERR(trans)) {
   776			ret = PTR_ERR(trans);
   777			goto fail;
   778		}
   779	
   780		spin_lock(&fs_info->trans_lock);
   781		list_add(&pending_snapshot->list,
   782			 &trans->transaction->pending_snapshots);
   783		spin_unlock(&fs_info->trans_lock);
   784	
   785		ret = btrfs_commit_transaction(trans);
   786		if (ret)
   787			goto fail;
   788	
   789		ret = pending_snapshot->error;
   790		if (ret)
   791			goto fail;
   792	
   793		ret = btrfs_orphan_cleanup(pending_snapshot->snap);
   794		if (ret)
   795			goto fail;
   796	
   797		inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
   798		if (IS_ERR(inode)) {
   799			ret = PTR_ERR(inode);
   800			goto fail;
   801		}
   802	
   803		d_instantiate(dentry, inode);
   804		ret = 0;
   805		pending_snapshot->anon_dev = 0;
   806	fail:
   807		/* Prevent double freeing of anon_dev */
   808		if (ret && pending_snapshot->snap)
   809			pending_snapshot->snap->anon_dev = 0;
   810		btrfs_put_root(pending_snapshot->snap);
   811		btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
   812	free_pending:
   813		if (pending_snapshot->anon_dev)
   814			free_anon_bdev(pending_snapshot->anon_dev);
   815		kfree(pending_snapshot->root_item);
   816		btrfs_free_path(pending_snapshot->path);
   817		kfree(pending_snapshot);
   818	
   819		return ret;
   820	}
   821	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ef557db3a8b..2caedb8c8c6d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -988,6 +988,14 @@  struct btrfs_fs_info {
 	u32 stripesize;
 
 	unsigned short inumbits;
+	/* num_devs can be:
+	 * 1 - all files in all trees use sb->s_dev
+	 * 2 - file trees alternate between using sb->s_dev and
+	 *     secondary_anon_dev.
+	 * 3 (BTTSF_MANY_DEVS) - Each subtree uses a unique ->anon_dev
+	 */
+	unsigned short num_devs;
+	dev_t secondary_anon_dev;
 
 	/* Block groups and devices containing active swapfiles. */
 	spinlock_t swapfile_pins_lock;
@@ -1035,6 +1043,8 @@  struct btrfs_fs_info {
 #endif
 };
 
+#define BTRFS_MANY_DEVS	(3)
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -1176,10 +1186,13 @@  struct btrfs_root {
 	 */
 	struct radix_tree_root delayed_nodes_tree;
 	/*
-	 * right now this just gets used so that a root has its own devid
-	 * for stat.  It may be used for more later
+	 * If fs_info->num_devs == 3 (BTRFS_MANY_DEVS) anon_dev holds a device
+	 * number to be reported by ->getattr().
+	 * If fs_info->num_devs == 2, anon_dev is 0 and use_secondary_dev
+	 * is true when this root uses the secondary, not primary, dev.
 	 */
 	dev_t anon_dev;
+	bool use_secondary_dev;
 
 	spinlock_t root_item_lock;
 	refcount_t refs;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f3bfa042d66..5127e2689756 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1516,7 +1516,8 @@  static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 	 * userspace, the id pool is limited to 1M
 	 */
 	if (is_fstree(root->root_key.objectid) &&
-	    btrfs_root_refs(&root->root_item) > 0) {
+	    btrfs_root_refs(&root->root_item) > 0 &&
+	    root->fs_info->num_devs == BTRFS_MANY_DEVS) {
 		if (!anon_dev) {
 			ret = get_anon_bdev(&root->anon_dev);
 			if (ret)
@@ -3332,8 +3333,12 @@  int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 * "-o inumbits" can over-ride this default.
 	 * BITS_PER_LONG * 7 / 8 is a good value to use
 	 */
-	if (fs_info->inumbits > BITS_PER_LONG)
-		fs_info->inumbits = 0;
+	if (fs_info->inumbits > BITS_PER_LONG) {
+		if (fs_info->num_devs == 1)
+			fs_info->inumbits = BITS_PER_LONG * 7 / 8;
+		else
+			fs_info->inumbits = 0;
+	}
 
 	features = btrfs_super_incompat_flags(disk_super) &
 		~BTRFS_FEATURE_INCOMPAT_SUPP;
@@ -3379,6 +3384,15 @@  int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
 
+	if (fs_info->num_devs == 0)
+		/* set default value */
+		fs_info->num_devs = BTRFS_MANY_DEVS;
+
+	if (fs_info->num_devs == 2) {
+		err = get_anon_bdev(&fs_info->secondary_anon_dev);
+		if (err)
+			goto fail_alloc;
+	}
 	/*
 	 * mixed block groups end up with duplicate but slightly offset
 	 * extent buffers for the same range.  It leads to corruptions
@@ -4446,6 +4460,10 @@  void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 	btrfs_close_devices(fs_info->fs_devices);
+
+	if (fs_info->secondary_anon_dev)
+		free_anon_bdev(fs_info->secondary_anon_dev);
+	fs_info->secondary_anon_dev = 0;
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 860cb5045123..30fa64cbe6dc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5966,6 +5966,8 @@  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 			iput(inode);
 			inode = ERR_PTR(ret);
 		}
+		if (fs_info->num_devs == 2)
+			sub_root->use_secondary_dev = !root->use_secondary_dev;
 	}
 
 	return inode;
@@ -9204,7 +9206,15 @@  static int btrfs_getattr(struct user_namespace *mnt_userns,
 				  STATX_ATTR_NODUMP);
 
 	generic_fillattr(&init_user_ns, inode, stat);
-	stat->dev = BTRFS_I(inode)->root->anon_dev;
+	/* If we don't set stat->dev here, sb->s_dev will be used */
+	switch (btrfs_sb(inode->i_sb)->num_devs) {
+	case 2:
+		if (BTRFS_I(inode)->root->use_secondary_dev)
+			stat->dev = btrfs_sb(inode->i_sb)->secondary_anon_dev;
+		break;
+	case BTRFS_MANY_DEVS:
+		stat->dev = BTRFS_I(inode)->root->anon_dev;
+	}
 
 	spin_lock(&BTRFS_I(inode)->lock);
 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
@@ -9390,6 +9400,15 @@  static int btrfs_rename_exchange(struct inode *old_dir,
 	if (new_inode->i_nlink == 1)
 		BTRFS_I(new_inode)->dir_index = new_idx;
 
+	if (fs_info->num_devs == 2 &&
+	    root->use_secondary_dev != dest->use_secondary_dev) {
+		BTRFS_I(old_inode)->root->use_secondary_dev =
+				!dest->use_secondary_dev;
+		BTRFS_I(new_inode)->root->use_secondary_dev =
+				!root->use_secondary_dev;
+		// FIXME any subvols beneeath 'old_inode' or 'new_inode'
+		// that are in cache are now wrong.
+	}
 	if (root_log_pinned) {
 		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
 				   new_dentry->d_parent);
@@ -9656,6 +9675,14 @@  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out_fail;
 	}
 
+	if (fs_info->num_devs == 2 &&
+	    root->use_secondary_dev != dest->use_secondary_dev) {
+		BTRFS_I(old_inode)->root->use_secondary_dev =
+				!dest->use_secondary_dev;
+		// FIXME any subvols beneeath 'old_inode' that are
+		// in cache are now wrong.
+	}
+
 	if (old_inode->i_nlink == 1)
 		BTRFS_I(old_inode)->dir_index = index;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e008a9ceb827..a246f91b4df4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -522,7 +522,8 @@  static noinline int create_subvol(struct inode *dir,
 	if (ret)
 		goto fail_free;
 
-	ret = get_anon_bdev(&anon_dev);
+	if (fs_info->num_devs == BTRFS_MANY_DEVS)
+		ret = get_anon_bdev(&anon_dev);
 	if (ret < 0)
 		goto fail_free;
 
@@ -729,7 +730,8 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!pending_snapshot)
 		return -ENOMEM;
 
-	ret = get_anon_bdev(&pending_snapshot->anon_dev);
+	if (fs_info->num_devs == BTRFS_MANY_DEVS)
+		ret = get_anon_bdev(&pending_snapshot->anon_dev);
 	if (ret < 0)
 		goto free_pending;
 	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5f3350e2f7ec..b1aecb834234 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -361,6 +361,7 @@  enum {
 	Opt_discard_mode,
 	Opt_inumbits,
 	Opt_norecovery,
+	Opt_numdevs,
 	Opt_ratio,
 	Opt_rescan_uuid_tree,
 	Opt_skip_balance,
@@ -431,6 +432,7 @@  static const match_table_t tokens = {
 	{Opt_inumbits, "inumbits=%u"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_norecovery, "norecovery"},
+	{Opt_numdevs, "numdevs=%s"},
 	{Opt_ratio, "metadata_ratio=%u"},
 	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
 	{Opt_skip_balance, "skip_balance"},
@@ -849,8 +851,35 @@  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 				ret = -EINVAL;
 				goto out;
 			}
+			if (intarg == 0 && info->num_devs == 1) {
+				btrfs_err(info,
+					  "inumbits=0 not permitted when numdevs=1");
+				ret = -EINVAL;
+				goto out;
+			}
 			info->inumbits = intarg;
 			break;
+		case Opt_numdevs:
+			if (info->num_devs) {
+				; /* silently ignore attempts to change this */
+			} else if (strcmp(args[0].from, "many") == 0) {
+				info->num_devs = BTRFS_MANY_DEVS;
+			} else if (strcmp(args[0].from, "1") == 0) {
+				if (info->inumbits == 0) {
+					btrfs_err(info,
+"numdevs=1 not permitted with inumbits=0");
+					ret = -EINVAL;
+				}
+				info->num_devs = 1;
+			} else if (strcmp(args[0].from, "2") == 0) {
+				info->num_devs = 2;
+			} else {
+				btrfs_err(info,
+					  "numdevs must be \"1\", \"2\", or \"many\".");
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
 		case Opt_ratio:
 			ret = match_int(&args[0], &intarg);
 			if (ret)
@@ -1559,6 +1588,7 @@  static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (btrfs_test_opt(info, REF_VERIFY))
 		seq_puts(seq, ",ref_verify");
 	seq_printf(seq, ",inumbits=%u", info->inumbits);
+	seq_printf(seq, ",numdevs=%u", info->num_devs);
 	seq_printf(seq, ",subvolid=%llu",
 		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
 	subvol_name = btrfs_get_subvol_name_from_objectid(info,