diff mbox series

[2/3] btrfs: introduce super_failure_tolerance sysfs interface

Message ID 9cc262a52ddb23a7948c8338b660449ec8598914.1695535440.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: introduce 3 debug sysfs interface to tweak the error handling behavior | expand

Commit Message

Qu Wenruo Sept. 24, 2023, 6:14 a.m. UTC
Currently btrfs has a questionable tolerance on how many devices can
fail their super blocks writeback, it allows "num_devices - 1" to
fail.

This can already be problematic for multi-device btrfses, but
unfortunately I don't have anything better for now.

Instead this patch would allow debug builds to configure the tolerance
by the new sysfs interface:

  /sys/fs/btrfs/<uuid>/debug/super_failure_tolerance

This value is s8, for values >= 0 it's the tolerance number directly.
E.g. if the value is 0, we do not allow any device to fail its super
block writeback.
If the value is 2, and the fs only has 2 devices, it means we allow all
devices to fail their super block writeback (aka, very dangerous).

If the value is negative, then the tolerance is num_devices plus this
value.
E.g. if the value is -1 (default), and we have 2 devices, it means the
tolerance is 1 (at most one device can fail).
If the value is -2, and we have 1 device, this means we allow all
devices to fail (again, very dangerous).

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/disk-io.c | 27 ++++++++++++++++++++++++---
 fs/btrfs/fs.h      | 18 ++++++++++++++++++
 fs/btrfs/sysfs.c   | 30 ++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8eb968e9e5e..062e28ac94b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2723,6 +2723,7 @@  void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);
 	spin_lock_init(&fs_info->eb_leak_lock);
 	fs_info->allow_backup_super_failure = true;
+	fs_info->super_failure_tolerance = -1;
 #endif
 	extent_map_tree_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
@@ -4033,6 +4034,26 @@  int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
 	return min_tolerated;
 }
 
+/* Max number of devices allowed to fail their super block writeback. */
+static int calculate_max_super_errors(struct btrfs_fs_info *fs_info)
+{
+	int num_devs = btrfs_super_num_devices(fs_info->super_copy);
+	int tolerance_value = READ_ONCE(fs_info->super_failure_tolerance);
+
+	if (tolerance_value >= 0)
+		return tolerance_value;
+
+	ASSERT(num_devs >= 0);
+	/*
+	 * Negative: relative to the device count. Allow all devices to fail
+	 * only if abs(@tolerance_value) > @num_devs; on equality return 0 so
+	 * the default -1 still means "num_devices - 1" on a single device.
+	 */
+	if (-tolerance_value > num_devs)
+		return INT_MAX;
+	return num_devs + tolerance_value;
+}
+
 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 {
 	struct list_head *head;
@@ -4060,7 +4081,7 @@  int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	head = &fs_info->fs_devices->devices;
-	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
+	max_errors = calculate_max_super_errors(fs_info);
 
 	if (do_barriers) {
 		ret = barrier_all_devices(fs_info);
@@ -4138,8 +4159,8 @@  int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		btrfs_handle_fs_error(fs_info, -EIO,
-				      "%d errors while writing supers",
-				      total_errors);
+			"failed to write supers: errors %d tolerance %d",
+				      total_errors, max_errors);
 		return -EIO;
 	}
 	return 0;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 2dff41cb463d..7608a1cf612f 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -688,6 +688,24 @@  struct btrfs_fs_info {
 
 	/* If we allow backup superblocks writeback to fail. */
 	bool allow_backup_super_failure;
+
+	/*
+	 * Tolerance on how many devices can fail their superblock writeback.
+	 *
+	 * If the value >= 0, then the value itself is the tolerance.
+	 * If the value < 0, then the tolerance is (num_devices + value).
+	 *
+	 * Default value is -1.
+	 *
+	 * E.g. 0 means we do not accept any device to fail its super blocks writeback.
+	 *
+	 * If there are 3 devices and the value is -1, then it means we allow up to 2
+	 * devices to fail their super blocks writeback.
+	 *
+	 * If there are 3 devices and the value is -4 or lower, we would allow all
+	 * devices to fail their super blocks writeback, which can be very DANGEROUS!
+	 */
+	s8 super_failure_tolerance;
 	u8 qgroup_drop_subtree_thres;
 
 	/*
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 852090622a76..bd9f574c2471 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -644,6 +644,35 @@  static ssize_t allow_backup_super_failure_store(struct kobject *debug_kobj,
 BTRFS_ATTR_RW(debug, allow_backup_super_failure, allow_backup_super_failure_show,
 	      allow_backup_super_failure_store);
 
+/* Show the raw setting; negative values are relative to the device count. */
+static ssize_t super_failure_tolerance_show(struct kobject *debug_kobj,
+					    struct kobj_attribute *a,
+					    char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(debug_kobj->parent);
+
+	ASSERT(fs_info);
+	return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->super_failure_tolerance));
+}
+
+/* Parse a base-10 s8 from @buf and store it as the new tolerance setting. */
+static ssize_t super_failure_tolerance_store(struct kobject *debug_kobj,
+					     struct kobj_attribute *a,
+					     const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(debug_kobj->parent);
+	s8 new_number;
+
+	ASSERT(fs_info);
+
+	/* s8, as kstrtos8() expects; negative values are valid settings. */
+	if (kstrtos8(buf, 10, &new_number))
+		return -EINVAL;
+	WRITE_ONCE(fs_info->super_failure_tolerance, new_number);
+	return len;
+}
+BTRFS_ATTR_RW(debug, super_failure_tolerance, super_failure_tolerance_show,
+	      super_failure_tolerance_store);
 /*
  * Per-filesystem runtime debugging exported via sysfs.
  *
@@ -657,6 +686,7 @@  BTRFS_ATTR_RW(debug, allow_backup_super_failure, allow_backup_super_failure_show
  */
 static const struct attribute *btrfs_debug_mount_attrs[] = {
 	BTRFS_ATTR_PTR(debug, allow_backup_super_failure),
+	BTRFS_ATTR_PTR(debug, super_failure_tolerance),
 	NULL,
 };