@@ -2723,6 +2723,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
fs_info->allow_backup_super_failure = true;
+ fs_info->super_failure_tolerance = -1;
#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
@@ -4033,6 +4034,26 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
return min_tolerated;
}
+/*
+ * Return how many devices may fail their super block writeback before
+ * write_all_supers() reports -EIO.
+ *
+ * A non-negative fs_info->super_failure_tolerance is used directly.
+ * A negative value is relative to the device count recorded in the super
+ * block: the result is num_devices + value, so the default of -1 keeps the
+ * previous "num_devices - 1" behavior.  If that sum would be negative,
+ * INT_MAX is returned, i.e. every device is allowed to fail.
+ */
+static int calculate_max_super_errors(struct btrfs_fs_info *fs_info)
+{
+	int num_devs = btrfs_super_num_devices(fs_info->super_copy);
+	int tolerance_value = READ_ONCE(fs_info->super_failure_tolerance);
+
+	if (tolerance_value >= 0)
+		return tolerance_value;
+
+	ASSERT(num_devs >= 0);
+
+	/*
+	 * Now tolerance_value is negative, check if abs(@tolerance_value) is
+	 * > @num_devs.  If so we allow all devices to fail.
+	 *
+	 * This must be a strict comparison: with the default of -1 on a
+	 * single-device filesystem the tolerance has to be 0, not unlimited.
+	 */
+	if (-tolerance_value > num_devs)
+		return INT_MAX;
+	return num_devs + tolerance_value;
+}
+
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
struct list_head *head;
@@ -4060,7 +4081,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
head = &fs_info->fs_devices->devices;
- max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
+ max_errors = calculate_max_super_errors(fs_info);
if (do_barriers) {
ret = barrier_all_devices(fs_info);
@@ -4138,8 +4159,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
btrfs_handle_fs_error(fs_info, -EIO,
- "%d errors while writing supers",
- total_errors);
+ "failed to write supers: errors %d tolerance %d",
+ total_errors, max_errors);
return -EIO;
}
return 0;
@@ -688,6 +688,24 @@ struct btrfs_fs_info {
/* If we allow backup superblocks writeback to fail. */
bool allow_backup_super_failure;
+
+ /*
+ * Tolerance on how many devices can fail their super block writeback.
+ *
+ * If the value >= 0, then the value itself is the tolerance.
+ * If the value < 0, then the tolerance is (num_devices + value).
+ *
+ * Default value is -1.
+ *
+ * E.g. 0 means we do not accept any device failing its super block writeback.
+ *
+ * If there are 3 devices and the value is -1, then it means we allow up to 2
+ * devices to fail their super block writeback.
+ *
+ * If there are 3 devices and the value is -4, we would allow all devices
+ * to fail their super block writeback, which can be very DANGEROUS!
+ */
+ s8 super_failure_tolerance;
u8 qgroup_drop_subtree_thres;
/*
@@ -644,6 +644,35 @@ static ssize_t allow_backup_super_failure_store(struct kobject *debug_kobj,
BTRFS_ATTR_RW(debug, allow_backup_super_failure, allow_backup_super_failure_show,
allow_backup_super_failure_store);
+/* Show the current super block writeback failure tolerance as a decimal. */
+static ssize_t super_failure_tolerance_show(struct kobject *debug_kobj,
+					    struct kobj_attribute *a,
+					    char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(debug_kobj->parent);
+	int tolerance;
+
+	ASSERT(fs_info);
+
+	/* Snapshot once; a concurrent store may update the value. */
+	tolerance = READ_ONCE(fs_info->super_failure_tolerance);
+	return sysfs_emit(buf, "%d\n", tolerance);
+}
+
+/*
+ * Parse a decimal s8 from @buf and store it as the new super block writeback
+ * failure tolerance of the filesystem.
+ *
+ * Return @len on success, -EINVAL if kstrtos8() rejects the input.
+ */
+static ssize_t super_failure_tolerance_store(struct kobject *debug_kobj,
+					     struct kobj_attribute *a,
+					     const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(debug_kobj->parent);
+	/* Must be s8 to match both kstrtos8() and the fs_info member. */
+	s8 new_number;
+	int ret;
+
+	ASSERT(fs_info);
+
+	ret = kstrtos8(buf, 10, &new_number);
+	if (ret)
+		return -EINVAL;
+	WRITE_ONCE(fs_info->super_failure_tolerance, new_number);
+	return len;
+}
+BTRFS_ATTR_RW(debug, super_failure_tolerance, super_failure_tolerance_show,
+ super_failure_tolerance_store);
/*
* Per-filesystem runtime debugging exported via sysfs.
*
@@ -657,6 +686,7 @@ BTRFS_ATTR_RW(debug, allow_backup_super_failure, allow_backup_super_failure_show
*/
static const struct attribute *btrfs_debug_mount_attrs[] = {
BTRFS_ATTR_PTR(debug, allow_backup_super_failure),
+ BTRFS_ATTR_PTR(debug, super_failure_tolerance),
NULL,
};
Currently btrfs has a questionable tolerance on how many devices can fail their super block writeback: it allows "num_devices - 1" to fail. This can already be problematic for multi-device btrfs filesystems, but unfortunately I don't have anything better for now. Instead, this patch allows debug builds to configure the tolerance through a new sysfs interface: /sys/fs/btrfs/<uuid>/debug/super_failure_tolerance This value is an s8; for values >= 0 it is the tolerance number directly. E.g. if the value is 0, we do not allow any device to fail its super block writeback. If the value is 2, and the fs only has 2 devices, it means we allow all devices to fail their super block writeback (aka, very dangerous). If the value is negative, then the tolerance is num_devices plus this value. E.g. if the value is -1 (default), and we have 2 devices, it means the tolerance is 1 (at most one device can fail). If the value is -2, and we have 1 device, this means we allow all devices to fail (again, very dangerous). Signed-off-by: Qu Wenruo <wqu@suse.com> --- fs/btrfs/disk-io.c | 27 ++++++++++++++++++++++++--- fs/btrfs/fs.h | 18 ++++++++++++++++++ fs/btrfs/sysfs.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-)