Message ID | cfd97a36f22e60f29878598673be01f69208fea9.1543935982.git.dsterba@suse.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Scrub allocations vs reclaim fix | expand |
On 4.12.18 г. 17:11 ч., David Sterba wrote: > The scrub context is allocated with GFP_KERNEL and called from > btrfs_scrub_dev under the fs_info::device_list_mutex. This is not safe > regarding reclaim that could try to flush filesystem data in order to > get the memory. And the device_list_mutex is held during superblock > commit, so this would cause a lockup. > > Move the alocation and initialization before any changes that require > the mutex. > > Signed-off-by: David Sterba <dsterba@suse.com> > --- > fs/btrfs/scrub.c | 30 ++++++++++++++++++------------ > 1 file changed, 18 insertions(+), 12 deletions(-) > > diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c > index ffcab263e057..051d14c9f013 100644 > --- a/fs/btrfs/scrub.c > +++ b/fs/btrfs/scrub.c > @@ -3834,13 +3834,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > return -EINVAL; > } > > + /* Allocate outside of device_list_mutex */ > + sctx = scrub_setup_ctx(fs_info, is_dev_replace); > + if (IS_ERR(sctx)) > + return PTR_ERR(sctx); > > mutex_lock(&fs_info->fs_devices->device_list_mutex); > dev = btrfs_find_device(fs_info, devid, NULL, NULL); > if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && > !is_dev_replace)) { > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > - return -ENODEV; > + ret = -ENODEV; > + goto out_free_ctx; > } > > if (!is_dev_replace && !readonly && > @@ -3848,7 +3853,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", > rcu_str_deref(dev->name)); > - return -EROFS; > + ret = -EROFS; > + goto out_free_ctx; > } > > mutex_lock(&fs_info->scrub_lock); > @@ -3856,7 +3862,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { > mutex_unlock(&fs_info->scrub_lock); > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > - return -EIO; > + ret = -EIO; > + goto out_free_ctx; > } > > btrfs_dev_replace_read_lock(&fs_info->dev_replace); > @@ -3866,7 +3873,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > btrfs_dev_replace_read_unlock(&fs_info->dev_replace); > mutex_unlock(&fs_info->scrub_lock); > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > - return -EINPROGRESS; > + ret = -EINPROGRESS; > + goto out_free_ctx; > } > btrfs_dev_replace_read_unlock(&fs_info->dev_replace); > > @@ -3874,16 +3882,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > if (ret) { > mutex_unlock(&fs_info->scrub_lock); > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > - return ret; > + goto out_free_ctx; Don't we suffer the same issue when calling scrub_workers_get since in it we do btrfs_alloc_workqueue which also calls kzalloc with GFP_KERNEL? > } > > - sctx = scrub_setup_ctx(fs_info, is_dev_replace); > - if (IS_ERR(sctx)) { > - mutex_unlock(&fs_info->scrub_lock); > - mutex_unlock(&fs_info->fs_devices->device_list_mutex); > - scrub_workers_put(fs_info); > - return PTR_ERR(sctx); > - } > sctx->readonly = readonly; > dev->scrub_ctx = sctx; > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > @@ -3936,6 +3937,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > > scrub_put_ctx(sctx); > > + return ret; > + > +out_free_ctx: > + scrub_free_ctx(sctx); > + > return ret; > } > >
On Tue, Dec 04, 2018 at 05:22:19PM +0200, Nikolay Borisov wrote: > > @@ -3874,16 +3882,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, > > if (ret) { > > mutex_unlock(&fs_info->scrub_lock); > > mutex_unlock(&fs_info->fs_devices->device_list_mutex); > > - return ret; > > + goto out_free_ctx; > > Don't we suffer the same issue when calling scrub_workers_get since in > it we do btrfs_alloc_workqueue which also calls kzalloc with GFP_KERNEL? Yes, that's right. I instrumented only the allocations in scrub.c to see if the nofs and lock_not_held assertions work at all so this one did not get caught directly. As scrub_workers_get still needs the scrub_lock, fixing it by moving does not work and would need more restructuring.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ffcab263e057..051d14c9f013 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3834,13 +3834,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; + ret = -ENODEV; + goto out_free_ctx; } if (!is_dev_replace && !readonly && @@ -3848,7 +3853,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); - return -EROFS; + ret = -EROFS; + goto out_free_ctx; } mutex_lock(&fs_info->scrub_lock); @@ -3856,7 +3862,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EIO; + ret = -EIO; + goto out_free_ctx; } btrfs_dev_replace_read_lock(&fs_info->dev_replace); @@ -3866,7 +3873,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_dev_replace_read_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EINPROGRESS; + ret = -EINPROGRESS; + goto out_free_ctx; } btrfs_dev_replace_read_unlock(&fs_info->dev_replace); @@ -3874,16 +3882,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (ret) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return ret; + goto out_free_ctx; } - sctx = scrub_setup_ctx(fs_info, is_dev_replace); - if (IS_ERR(sctx)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); - return PTR_ERR(sctx); - } sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3936,6 +3937,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_put_ctx(sctx); + return ret; + +out_free_ctx: + scrub_free_ctx(sctx); + return ret; }
The scrub context is allocated with GFP_KERNEL and called from btrfs_scrub_dev under the fs_info::device_list_mutex. This is not safe regarding reclaim that could try to flush filesystem data in order to get the memory. And the device_list_mutex is held during superblock commit, so this would cause a lockup. Move the alocation and initialization before any changes that require the mutex. Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/scrub.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-)