diff mbox series

[PoC,v2,02/10] btrfs: scrub: introduce place holder for btrfs_scrub_fs()

Message ID f2b6d30ae49294645b6c9487723e81e68fe09b24.1664353497.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: scrub: introduce a new family of ioctl, scrub_fs | expand

Commit Message

Qu Wenruo Sept. 28, 2022, 8:35 a.m. UTC
The new function btrfs_scrub_fs() will do the exclusive checking against
regular scrub and dev-replace, then return -EOPNOTSUPP as a place
holder.

Also to let regular scrub/dev-replace to be exclusive against
btrfs_scrub_fs(), also introduce btrfs_fs_info::scrub_fs_running member.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ctree.h |   4 ++
 fs/btrfs/ioctl.c |  41 +++++++++++++++++-
 fs/btrfs/scrub.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 727595eee973..07542a55d77d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -905,6 +905,7 @@  struct btrfs_fs_info {
 	/* private scrub information */
 	struct mutex scrub_lock;
 	atomic_t scrubs_running;
+	atomic_t scrub_fs_running;
 	atomic_t scrub_pause_req;
 	atomic_t scrubs_paused;
 	atomic_t scrub_cancel_req;
@@ -4026,6 +4027,9 @@  int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace);
+int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+		   struct btrfs_scrub_fs_progress *progress,
+		   bool readonly);
 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6944216e1425..00870b20bd5a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4112,6 +4112,45 @@  static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static long btrfs_ioctl_scrub_fs(struct file *file, void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+	struct btrfs_ioctl_scrub_fs_args *sfsa;
+	bool readonly = false;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	sfsa = memdup_user(arg, sizeof(*sfsa));
+	if (IS_ERR(sfsa))
+		return PTR_ERR(sfsa);
+
+	if (sfsa->flags & ~BTRFS_SCRUB_FS_FLAG_SUPP) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if (sfsa->flags & BTRFS_SCRUB_FS_FLAG_READONLY)
+		readonly = true;
+
+	if (!readonly) {
+		ret = mnt_want_write_file(file);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_scrub_fs(fs_info, sfsa->start, sfsa->end, &sfsa->progress,
+			     readonly);
+	if (copy_to_user(arg, sfsa, sizeof(*sfsa)))
+		ret = -EFAULT;
+
+	if (!readonly)
+		mnt_drop_write_file(file);
+out:
+	kfree(sfsa);
+	return ret;
+}
+
 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
 				      void __user *arg)
 {
@@ -5509,7 +5548,7 @@  long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SCRUB_PROGRESS:
 		return btrfs_ioctl_scrub_progress(fs_info, argp);
 	case BTRFS_IOC_SCRUB_FS:
-		return -EOPNOTSUPP;
+		return btrfs_ioctl_scrub_fs(file, argp);
 	case BTRFS_IOC_SCRUB_FS_CANCEL:
 		return -EOPNOTSUPP;
 	case BTRFS_IOC_SCRUB_FS_PROGRESS:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f260c53829e5..d3d64f048c7b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4297,6 +4297,15 @@  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
+
+	/* Conflict with scrub_fs ioctls. */
+	if (atomic_read(&fs_info->scrub_fs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		ret = -EINPROGRESS;
+		goto out;
+	}
+
 	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
 		mutex_unlock(&fs_info->scrub_lock);
@@ -4418,6 +4427,102 @@  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	return ret;
 }
 
+/*
+ * Unlike btrfs_scrub_dev(), this function works completely in logical bytenr
+ * level, and has the following advantage:
+ *
+ * - Better error reporting
+ *   The new btrfs_scrub_fs_progress has better classified errors, more
+ *   members to include parity errors.
+ *
+ * - Always scrub one block group at one time
+ *   btrfs_scrub_dev() works by starting one scrub for each device.
+ *   This can cause asynchronised progress, and mark multiple block groups
+ *   RO, reducing the avaialbe space unnecessarily.
+ *
+ * - Less IO for RAID56
+ *   Instead of treating RAID56 data and P/Q stripes differently, here we only
+ *   scrub a full stripe at most once.
+ *   Instead of the 2x read for data stripes (one for scrubbing the data stripe itself,
+ *   the other one from scrubbing the P/Q stripe).
+ *
+ * - No bio formshaping and streamlined code
+ *   Always submit bio for all involved mirrors (or data/p/q stripes for
+ *   RAID56), wait for the IO, then run the check.
+ *
+ *   Thus there are at most nr_mirrors (nr_stripes for RAID56) bios on-the-fly,
+ *   and for each device, there is always at most one bio for scrub.
+ *
+ *   This would greatly simplify all involved code.
+ *
+ * - No need to support dev-replace
+ *   Thus we can have simpler code.
+ *
+ * Unfortunately this ioctl has the following disadvantage so far:
+ *
+ * - No resume after unmount
+ *   We may need extra on-disk format to save the progress.
+ *   Thus we may need a new RO compat flags for the resume ability.
+ *
+ * - Conflicts with dev-replace/scrub
+ *
+ * - Needs kernel support.
+ *
+ * - Not fully finished
+ */
+int btrfs_scrub_fs(struct btrfs_fs_info *fs_info, u64 start, u64 end,
+		   struct btrfs_scrub_fs_progress *progress,
+		   bool readonly)
+{
+	int ret;
+
+	if (btrfs_fs_closing(fs_info))
+		return -EAGAIN;
+
+	if (btrfs_is_zoned(fs_info))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Metadata and data unit should be able to be contained inside one
+	 * stripe.
+	 */
+	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
+	ASSERT(fs_info->sectorsize <= BTRFS_STRIPE_LEN);
+
+	mutex_lock(&fs_info->scrub_lock);
+	/* This function conflicts with scrub/dev-replace. */
+	if (atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -EINPROGRESS;
+	}
+
+	/* And there can only be one running btrfs_scrub_fs(). */
+	if (atomic_read(&fs_info->scrub_fs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -EINPROGRESS;
+	}
+
+	__scrub_blocked_if_needed(fs_info);
+	atomic_inc(&fs_info->scrub_fs_running);
+
+	/* This is to allow existing scrub pause to be reused. */
+	atomic_inc(&fs_info->scrubs_running);
+	btrfs_info(fs_info, "scrub_fs: started");
+	mutex_unlock(&fs_info->scrub_lock);
+
+	/* Place holder for real workload. */
+	ret = -EOPNOTSUPP;
+
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrub_fs_running);
+	btrfs_info(fs_info, "scrub_fs: finished with status: %d", ret);
+	mutex_unlock(&fs_info->scrub_lock);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	return ret;
+}
+
 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);