From patchwork Thu Nov 7 06:27:08 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Qu Wenruo X-Patchwork-Id: 11231917 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 1A0EA112B for ; Thu, 7 Nov 2019 06:27:19 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 0041C21D6C for ; Thu, 7 Nov 2019 06:27:18 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726681AbfKGG1S (ORCPT ); Thu, 7 Nov 2019 01:27:18 -0500 Received: from mx2.suse.de ([195.135.220.15]:37588 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1725938AbfKGG1S (ORCPT ); Thu, 7 Nov 2019 01:27:18 -0500 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id E7D9FAF8E for ; Thu, 7 Nov 2019 06:27:15 +0000 (UTC) From: Qu Wenruo To: linux-btrfs@vger.kernel.org Subject: [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Date: Thu, 7 Nov 2019 14:27:08 +0800 Message-Id: <20191107062710.67964-2-wqu@suse.com> X-Mailer: git-send-email 2.24.0 In-Reply-To: <20191107062710.67964-1-wqu@suse.com> References: <20191107062710.67964-1-wqu@suse.com> MIME-Version: 1.0 Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org In __btrfs_alloc_chunk() we need to iterate through all rw devices and gather their hole sizes. This logic can be refactored into a new function, gather_dev_holes(), which gathers hole info from a list_head. This provides the basis for the later degraded chunk feature. 
Signed-off-by: Qu Wenruo
---
 fs/btrfs/volumes.c | 142 ++++++++++++++++++++++++++-------------------
 1 file changed, 87 insertions(+), 55 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdd7af424033..eee5fc1d11f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4898,17 +4898,97 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 	btrfs_set_fs_incompat(info, RAID56);
 }
 
+/*
+ * Gather allocatable holes from the devices linked on @list.
+ *
+ * Each device that passes the state checks and still has unallocated bytes
+ * is probed with find_free_dev_extent() for stripe_size * dev_stripes bytes.
+ * Devices offering at least BTRFS_STRIPE_LEN * dev_stripes are recorded in
+ * devices_info[], starting at slot 0, up to max_nr_devs entries.
+ *
+ * Returns 0 and advances *index by the number of recorded devices.  Any
+ * find_free_dev_extent() error other than -ENOSPC is returned as-is and
+ * *index is left unchanged.
+ */
+static int gather_dev_holes(struct btrfs_fs_info *info,
+			    struct btrfs_device_info *devices_info,
+			    int *index, struct list_head *list,
+			    int max_nr_devs, u64 stripe_size, int dev_stripes)
+{
+	struct btrfs_device *device;
+	int ret;
+	int ndevs = 0;
+
+	list_for_each_entry(device, list, dev_alloc_list) {
+		u64 max_avail;
+		u64 dev_offset;
+		u64 total_avail;
+
+		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+			WARN(1, KERN_ERR
+			       "BTRFS: read-only device in alloc_list\n");
+			continue;
+		}
+
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+			      &device->dev_state) ||
+		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+			continue;
+
+		if (device->total_bytes > device->bytes_used)
+			total_avail = device->total_bytes - device->bytes_used;
+		else
+			total_avail = 0;
+
+		/* If there is no space on this device, skip it. */
+		if (total_avail == 0)
+			continue;
+
+		ret = find_free_dev_extent(device,
+					   stripe_size * dev_stripes,
+					   &dev_offset, &max_avail);
+		/* Propagate real errors; only -ENOSPC is tolerated here. */
+		if (ret && ret != -ENOSPC)
+			return ret;
+
+		if (ret == 0)
+			max_avail = stripe_size * dev_stripes;
+
+		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+			if (btrfs_test_opt(info, ENOSPC_DEBUG))
+				btrfs_debug(info,
+			"%s: devid %llu has no free space, have=%llu want=%u",
+					    __func__, device->devid, max_avail,
+					    BTRFS_STRIPE_LEN * dev_stripes);
+			continue;
+		}
+
+		if (ndevs == max_nr_devs) {
+			WARN(1, "%s: found more than %d devices\n", __func__,
+			     max_nr_devs);
+			break;
+		}
+		ret = 0;
+		devices_info[ndevs].dev_offset = dev_offset;
+		devices_info[ndevs].max_avail = max_avail;
+		devices_info[ndevs].total_avail = total_avail;
+		devices_info[ndevs].dev = device;
+		++ndevs;
+	}
+	*index += ndevs;
+	return 0;
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       u64 start, u64 type)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
-	struct btrfs_device *device;
 	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
 	struct btrfs_device_info *devices_info = NULL;
-	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
 	int data_stripes;	/* number of stripes that count for
 				   block group size */
@@ -4983,59 +5063,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * about the available holes on each device.
*/ ndevs = 0; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 max_avail; - u64 dev_offset; - - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - WARN(1, KERN_ERR - "BTRFS: read-only device in alloc_list\n"); - continue; - } - - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - if (device->total_bytes > device->bytes_used) - total_avail = device->total_bytes - device->bytes_used; - else - total_avail = 0; - - /* If there is no space on this device, skip it. */ - if (total_avail == 0) - continue; - - ret = find_free_dev_extent(device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); - if (ret && ret != -ENOSPC) - goto error; - - if (ret == 0) - max_avail = max_stripe_size * dev_stripes; - - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { - if (btrfs_test_opt(info, ENOSPC_DEBUG)) - btrfs_debug(info, - "%s: devid %llu has no free space, have=%llu want=%u", - __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * dev_stripes); - continue; - } - - if (ndevs == fs_devices->rw_devices) { - WARN(1, "%s: found more than %llu devices\n", - __func__, fs_devices->rw_devices); - break; - } - devices_info[ndevs].dev_offset = dev_offset; - devices_info[ndevs].max_avail = max_avail; - devices_info[ndevs].total_avail = total_avail; - devices_info[ndevs].dev = device; - ++ndevs; - } + ret = gather_dev_holes(info, devices_info, &ndevs, + &fs_devices->alloc_list, fs_devices->rw_devices, + max_stripe_size, dev_stripes); + if (ret < 0) + goto error; /* * now sort the devices by hole size / available space From patchwork Thu Nov 7 06:27:09 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Qu Wenruo X-Patchwork-Id: 11231919 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org 
(Postfix) with ESMTP id 7FAE8112B for ; Thu, 7 Nov 2019 06:27:20 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 676E021D79 for ; Thu, 7 Nov 2019 06:27:20 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726778AbfKGG1T (ORCPT ); Thu, 7 Nov 2019 01:27:19 -0500 Received: from mx2.suse.de ([195.135.220.15]:37594 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726467AbfKGG1T (ORCPT ); Thu, 7 Nov 2019 01:27:19 -0500 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id 0F6EFAE4D for ; Thu, 7 Nov 2019 06:27:17 +0000 (UTC) From: Qu Wenruo To: linux-btrfs@vger.kernel.org Subject: [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Date: Thu, 7 Nov 2019 14:27:09 +0800 Message-Id: <20191107062710.67964-3-wqu@suse.com> X-Mailer: git-send-email 2.24.0 In-Reply-To: <20191107062710.67964-1-wqu@suse.com> References: <20191107062710.67964-1-wqu@suse.com> MIME-Version: 1.0 Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org This enables btrfs to iterate missing devices separately, without iterating all fs_devices. This provides the basis for later degraded chunk enhancement. The change includes: - Add missing devices to btrfs_fs_devices::missing_list This happens at add_missing_dev() and other locations where missing_devices get increased. - Remove missing devices from btrfs_fs_devices::missing_list This needs to cover all locations where missing_devices get decreased. 
Signed-off-by: Qu Wenruo --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++------ fs/btrfs/volumes.h | 6 ++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eee5fc1d11f0..a462d8de5d2a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -324,6 +324,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->missing_list); INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -1089,6 +1090,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + list_del_init(&device->dev_alloc_list); } } @@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) fs_devices->open_devices--; + list_del_init(&device->dev_alloc_list); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); + device->devid != BTRFS_DEV_REPLACE_DEVID) fs_devices->rw_devices--; - } if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; @@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, device->fs_devices->rw_devices--; mutex_unlock(&fs_info->chunk_mutex); } + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + mutex_lock(&fs_info->chunk_mutex); + list_del_init(&device->dev_alloc_list); + device->fs_devices->missing_devices--; + mutex_unlock(&fs_info->chunk_mutex); + } mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); @@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (cur_devices != fs_devices) fs_devices->total_devices--; - if 
(test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - cur_devices->missing_devices--; - btrfs_assign_next_active_device(device, NULL); if (device->bdev) { @@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + mutex_lock(&fs_info->chunk_mutex); + list_add(&device->dev_alloc_list, + &fs_devices->missing_list); + device->fs_devices->missing_devices++; + mutex_unlock(&fs_info->chunk_mutex); + } goto out; } @@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); + INIT_LIST_HEAD(&seed_devices->missing_list); mutex_init(&seed_devices->device_list_mutex); mutex_lock(&fs_devices->device_list_mutex); @@ -6640,6 +6652,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, fs_devices->num_devices++; set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + list_add(&device->dev_alloc_list, &fs_devices->missing_list); fs_devices->missing_devices++; return device; @@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf, */ device->fs_devices->missing_devices++; set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + list_add(&device->dev_alloc_list, &fs_devices->missing_list); } /* Move the device to its own fs_devices */ @@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf, device->fs_devices->missing_devices--; fs_devices->missing_devices++; + list_move(&device->dev_alloc_list, &fs_devices->missing_list); device->fs_devices = fs_devices; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a7da1f3e3627..9cef4dc4b5be 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; + /* + * Devices which can't be 
found. Protected by chunk_mutex. + * This acts as a fallback allocation list for certain degraded mounts. + */ + struct list_head missing_list; + struct btrfs_fs_devices *seed; int seeding; From patchwork Thu Nov 7 06:27:10 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Qu Wenruo X-Patchwork-Id: 11231921 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id E7D0F112B for ; Thu, 7 Nov 2019 06:27:21 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id CF93221D79 for ; Thu, 7 Nov 2019 06:27:21 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726806AbfKGG1V (ORCPT ); Thu, 7 Nov 2019 01:27:21 -0500 Received: from mx2.suse.de ([195.135.220.15]:37600 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1725938AbfKGG1U (ORCPT ); Thu, 7 Nov 2019 01:27:20 -0500 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id 29C09B300 for ; Thu, 7 Nov 2019 06:27:18 +0000 (UTC) From: Qu Wenruo To: linux-btrfs@vger.kernel.org Subject: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fulfill a chunk Date: Thu, 7 Nov 2019 14:27:10 +0800 Message-Id: <20191107062710.67964-4-wqu@suse.com> X-Mailer: git-send-email 2.24.0 In-Reply-To: <20191107062710.67964-1-wqu@suse.com> References: <20191107062710.67964-1-wqu@suse.com> MIME-Version: 1.0 Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org [PROBLEM] A btrfs degraded mount will fall back to the SINGLE profile if there are not enough devices: # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1 # wipefs -fa /dev/test/scratch2 # mount -o degraded /dev/test/scratch1 
/mnt/btrfs # fallocate -l 1G /mnt/btrfs/foobar # btrfs ins dump-tree -t chunk /dev/test/scratch1 item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80 length 536870912 owner 2 stripe_len 65536 type DATA New data chunks will fall back to SINGLE. If the user doesn't balance those SINGLE chunks, then even with the missing devices replaced, the fs is no longer fully RAID1, and a missing device can break the tolerance. [CAUSE] The cause is pretty simple: when mounted degraded, missing devices can't be used for chunk allocation. Thus btrfs has to fall back to the SINGLE profile. [ENHANCEMENT] To avoid this problem, this patch will: - Make all profile reducers/updaters consider missing devices as part of num_devices - Make the chunk allocator fall back to missing_list as a last resort If we have enough rw_devices, then we go through the regular chunk allocation code. This avoids allocating degraded chunks. E.g. for a 3-device RAID1 degraded mount, we will use the 2 existing devices to allocate the chunk, avoiding a degraded chunk. But if we don't have enough rw_devices, then we check missing devices to allocate degraded chunks. E.g. for a 2-device RAID1 degraded mount, we have to allocate degraded chunks to keep the RAID1 profile. 
Signed-off-by: Qu Wenruo Reviewed-by: Qu Wenruo --- fs/btrfs/block-group.c | 10 +++++++--- fs/btrfs/volumes.c | 18 +++++++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bf7e3f23bba7..1686fd31679b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -52,11 +52,13 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) */ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) { - u64 num_devices = fs_info->fs_devices->rw_devices; + u64 num_devices; u64 target; u64 raid_type; u64 allowed = 0; + num_devices = fs_info->fs_devices->rw_devices + + fs_info->fs_devices->missing_devices; /* * See if restripe for this chunk_type is in progress, if so try to * reduce to the target profile @@ -1986,7 +1988,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) if (stripped) return extended_to_chunk(stripped); - num_devices = fs_info->fs_devices->rw_devices; + num_devices = fs_info->fs_devices->rw_devices + + fs_info->fs_devices->missing_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; @@ -2981,7 +2984,8 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; if (!num_dev) - num_dev = fs_info->fs_devices->rw_devices; + num_dev = fs_info->fs_devices->rw_devices + + fs_info->fs_devices->missing_devices; return num_dev; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a462d8de5d2a..4dee1974ceb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5052,8 +5052,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), - GFP_NOFS); + devices_info = 
kcalloc(fs_devices->rw_devices + + fs_devices->missing_devices, + sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -5067,7 +5068,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size, dev_stripes); if (ret < 0) goto error; - + /* + * If rw devices can't fulfill the request, fall back to missing devices + * as a last resort. + */ + if (ndevs < devs_min) { + ret = gather_dev_holes(info, devices_info + ndevs, &ndevs, + &fs_devices->missing_list, + fs_devices->missing_devices, + max_stripe_size, dev_stripes); + if (ret < 0) + goto error; + } /* * now sort the devices by hole size / available space */