Message ID | 20190111171759.19920-1-fdmanana@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2] Btrfs: avoid deadlock with memory reclaim due to allocation of devices | expand |
On 01/12/2019 01:17 AM, fdmanana@kernel.org wrote: > From: Filipe Manana <fdmanana@suse.com> > > In a few places we are allocating a device using the GFP_KERNEL flag when > it is not safe to do so, because if reclaim is triggered it can cause a > transaction commit while we are holding the device list mutex. This mutex > is required in the transaction commit path (at write_all_supers() and > btrfs_update_commit_device_size()). > > So fix this by setting up a nofs memory allocation context in those cases. > > Fixes: 78f2c9e6dbb14 ("btrfs: device add and remove: use GFP_KERNEL") > Fixes: e0ae999414238 ("btrfs: preallocate device flush bio") > Signed-off-by: Filipe Manana <fdmanana@suse.com> > --- > > V2: Change the approach to fix the problem by setting up nofs contextes > where needed. > > fs/btrfs/volumes.c | 33 ++++++++++++++++++++++++++++++--- > 1 file changed, 30 insertions(+), 3 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 2576b1a379c9..663566baae78 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -14,6 +14,7 @@ > #include <linux/semaphore.h> > #include <linux/uuid.h> > #include <linux/list_sort.h> > +#include <linux/sched/mm.h> > #include "ctree.h" > #include "extent_map.h" > #include "disk-io.h" > @@ -988,20 +989,29 @@ static noinline struct btrfs_device *device_list_add(const char *path, > } > > if (!device) { > + unsigned int nofs_flag; > + > if (fs_devices->opened) { > mutex_unlock(&fs_devices->device_list_mutex); > return ERR_PTR(-EBUSY); > } > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ I wonder if there is a bug due to GFP_KERNEL in device_list_add()? as device_list_add() can only be called only when the FSID is not yet mounted. OR if its done for the sake of consistency when calling\ btrfs_alloc_device(). Thanks, Anand > + nofs_flag = memalloc_nofs_save(); > device = btrfs_alloc_device(NULL, &devid, > disk_super->dev_item.uuid); > if (IS_ERR(device)) { > + memalloc_nofs_restore(nofs_flag); > mutex_unlock(&fs_devices->device_list_mutex); > /* we can safely leave the fs_devices entry around */ > return device; > } > > - name = rcu_string_strdup(path, GFP_NOFS); > + name = rcu_string_strdup(path, GFP_KERNEL); > + memalloc_nofs_restore(nofs_flag); > if (!name) { > btrfs_free_device(device); > mutex_unlock(&fs_devices->device_list_mutex); > @@ -1137,11 +1147,19 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) > /* We have held the volume lock, it is safe to get the devices. */ > list_for_each_entry(orig_dev, &orig->devices, dev_list) { > struct rcu_string *name; > + unsigned int nofs_flag; > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ > + nofs_flag = memalloc_nofs_save(); > device = btrfs_alloc_device(NULL, &orig_dev->devid, > orig_dev->uuid); > - if (IS_ERR(device)) > + if (IS_ERR(device)) { > + memalloc_nofs_restore(nofs_flag); > goto error; > + } > > /* > * This is ok to do without rcu read locked because we hold the > @@ -1151,12 +1169,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) > name = rcu_string_strdup(orig_dev->name->str, > GFP_KERNEL); > if (!name) { > + memalloc_nofs_restore(nofs_flag); > btrfs_free_device(device); > goto error; > } > rcu_assign_pointer(device->name, name); > } > > + memalloc_nofs_restore(nofs_flag); > list_add(&device->dev_list, &fs_devices->devices); > device->fs_devices = fs_devices; > fs_devices->num_devices++; > @@ -1262,6 +1282,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) > struct btrfs_fs_devices *fs_devices = device->fs_devices; > struct btrfs_device *new_device; > struct rcu_string *name; > + unsigned int nofs_flag; > > if (device->bdev) > fs_devices->open_devices--; > @@ -1277,17 +1298,23 @@ static void btrfs_close_one_device(struct btrfs_device *device) > > btrfs_close_bdev(device); > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ > + nofs_flag = memalloc_nofs_save(); > new_device = btrfs_alloc_device(NULL, &device->devid, > device->uuid); > BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ > > /* Safe because we are under uuid_mutex */ > if (device->name) { > - name = rcu_string_strdup(device->name->str, GFP_NOFS); > + name = rcu_string_strdup(device->name->str, GFP_KERNEL); > BUG_ON(!name); /* -ENOMEM */ > rcu_assign_pointer(new_device->name, name); > } > > + memalloc_nofs_restore(nofs_flag); > list_replace_rcu(&device->dev_list, &new_device->dev_list); > new_device->fs_devices = device->fs_devices; > >
On Mon, Jan 14, 2019 at 04:21:43PM +0800, Anand Jain wrote: > > > On 01/12/2019 01:17 AM, fdmanana@kernel.org wrote: > > From: Filipe Manana <fdmanana@suse.com> > > > > In a few places we are allocating a device using the GFP_KERNEL flag when > > it is not safe to do so, because if reclaim is triggered it can cause a > > transaction commit while we are holding the device list mutex. This mutex > > is required in the transaction commit path (at write_all_supers() and > > btrfs_update_commit_device_size()). > > > > So fix this by setting up a nofs memory allocation context in those cases. > > > > Fixes: 78f2c9e6dbb14 ("btrfs: device add and remove: use GFP_KERNEL") > > Fixes: e0ae999414238 ("btrfs: preallocate device flush bio") > > Signed-off-by: Filipe Manana <fdmanana@suse.com> > > --- > > > > V2: Change the approach to fix the problem by setting up nofs contextes > > where needed. > > > > fs/btrfs/volumes.c | 33 ++++++++++++++++++++++++++++++--- > > 1 file changed, 30 insertions(+), 3 deletions(-) > > > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > > index 2576b1a379c9..663566baae78 100644 > > --- a/fs/btrfs/volumes.c > > +++ b/fs/btrfs/volumes.c > > @@ -14,6 +14,7 @@ > > #include <linux/semaphore.h> > > #include <linux/uuid.h> > > #include <linux/list_sort.h> > > +#include <linux/sched/mm.h> > > #include "ctree.h" > > #include "extent_map.h" > > #include "disk-io.h" > > @@ -988,20 +989,29 @@ static noinline struct btrfs_device *device_list_add(const char *path, > > } > > > > if (!device) { > > + unsigned int nofs_flag; > > + > > if (fs_devices->opened) { > > mutex_unlock(&fs_devices->device_list_mutex); > > return ERR_PTR(-EBUSY); > > } > > > > + /* > > + * Setup nofs context because we are holding the device list > > + * mutex, which is required for a transaction commit. > > + */ > > I wonder if there is a bug due to GFP_KERNEL in device_list_add()? > as device_list_add() can only be called only when the FSID is not yet > mounted. OR if its done for the sake of consistency when calling\ > btrfs_alloc_device(). It still could be called but a new device will not be allocated, all is done either via scan or during mount. A missing device has an entry in fs_devices. We can keep th NOFS protection around that to make it future-proof, as it's not trivial to see if this is always called from safe context or not.
On 01/19/2019 02:07 AM, David Sterba wrote: > On Mon, Jan 14, 2019 at 04:21:43PM +0800, Anand Jain wrote: >> >> >> On 01/12/2019 01:17 AM, fdmanana@kernel.org wrote: >>> From: Filipe Manana <fdmanana@suse.com> >>> >>> In a few places we are allocating a device using the GFP_KERNEL flag when >>> it is not safe to do so, because if reclaim is triggered it can cause a >>> transaction commit while we are holding the device list mutex. This mutex >>> is required in the transaction commit path (at write_all_supers() and >>> btrfs_update_commit_device_size()). >>> >>> So fix this by setting up a nofs memory allocation context in those cases. >>> >>> Fixes: 78f2c9e6dbb14 ("btrfs: device add and remove: use GFP_KERNEL") >>> Fixes: e0ae999414238 ("btrfs: preallocate device flush bio") >>> Signed-off-by: Filipe Manana <fdmanana@suse.com> >>> --- >>> >>> V2: Change the approach to fix the problem by setting up nofs contextes >>> where needed. >>> >>> fs/btrfs/volumes.c | 33 ++++++++++++++++++++++++++++++--- >>> 1 file changed, 30 insertions(+), 3 deletions(-) >>> >>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >>> index 2576b1a379c9..663566baae78 100644 >>> --- a/fs/btrfs/volumes.c >>> +++ b/fs/btrfs/volumes.c >>> @@ -14,6 +14,7 @@ >>> #include <linux/semaphore.h> >>> #include <linux/uuid.h> >>> #include <linux/list_sort.h> >>> +#include <linux/sched/mm.h> >>> #include "ctree.h" >>> #include "extent_map.h" >>> #include "disk-io.h" >>> @@ -988,20 +989,29 @@ static noinline struct btrfs_device *device_list_add(const char *path, >>> } >>> >>> if (!device) { >>> + unsigned int nofs_flag; >>> + >>> if (fs_devices->opened) { >>> mutex_unlock(&fs_devices->device_list_mutex); >>> return ERR_PTR(-EBUSY); >>> } >>> >>> + /* >>> + * Setup nofs context because we are holding the device list >>> + * mutex, which is required for a transaction commit. >>> + */ >> >> I wonder if there is a bug due to GFP_KERNEL in device_list_add()? >> as device_list_add() can only be called only when the FSID is not yet >> mounted. OR if its done for the sake of consistency when calling\ >> btrfs_alloc_device(). > > It still could be called but a new device will not be allocated, all is > done either via scan or during mount. A missing device has an entry in > fs_devices. > We can keep th NOFS protection around that to make it future-proof, as > it's not trivial to see if this is always called from safe context or > not. Makes sense to me. Thanks.
On 01/12/2019 01:17 AM, fdmanana@kernel.org wrote: > From: Filipe Manana <fdmanana@suse.com> > > In a few places we are allocating a device using the GFP_KERNEL flag when > it is not safe to do so, because if reclaim is triggered it can cause a > transaction commit while we are holding the device list mutex. This mutex > is required in the transaction commit path (at write_all_supers() and > btrfs_update_commit_device_size()). > > So fix this by setting up a nofs memory allocation context in those cases. > > Fixes: 78f2c9e6dbb14 ("btrfs: device add and remove: use GFP_KERNEL") > Fixes: e0ae999414238 ("btrfs: preallocate device flush bio") > Signed-off-by: Filipe Manana <fdmanana@suse.com> > --- > > V2: Change the approach to fix the problem by setting up nofs contextes > where needed. So remaining functions which still does GFP_KERNEL are.. btrfs_init_dev_replace_tgtdev() btrfs_init_new_device() add_missing_dev() Normally most of the device allocations are already done at device_list_add(), except for the special three functions above, so can we simplify the fix to [1] as there isn't much toll if the above three were also under GFP_NOFS. [1] -------------------------------------------- diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b2c1d26f577e..3c350ab6535b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -387,7 +387,7 @@ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_NOFS); if (!dev) return ERR_PTR(-ENOMEM); @@ -395,7 +395,7 @@ static struct btrfs_device *__alloc_device(void) * Preallocate a bio that's always going to be used for flushing device * barriers and matches the device lifespan */ - dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + dev->flush_bio = bio_alloc_bioset(GFP_NOFS, 0, NULL); if (!dev->flush_bio) { kfree(dev); return ERR_PTR(-ENOMEM); -------------------------------------------- Thanks. > fs/btrfs/volumes.c | 33 ++++++++++++++++++++++++++++++--- > 1 file changed, 30 insertions(+), 3 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 2576b1a379c9..663566baae78 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -14,6 +14,7 @@ > #include <linux/semaphore.h> > #include <linux/uuid.h> > #include <linux/list_sort.h> > +#include <linux/sched/mm.h> > #include "ctree.h" > #include "extent_map.h" > #include "disk-io.h" > @@ -988,20 +989,29 @@ static noinline struct btrfs_device *device_list_add(const char *path, > } > > if (!device) { > + unsigned int nofs_flag; > + > if (fs_devices->opened) { > mutex_unlock(&fs_devices->device_list_mutex); > return ERR_PTR(-EBUSY); > } > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ > + nofs_flag = memalloc_nofs_save(); > device = btrfs_alloc_device(NULL, &devid, > disk_super->dev_item.uuid); > if (IS_ERR(device)) { > + memalloc_nofs_restore(nofs_flag); > mutex_unlock(&fs_devices->device_list_mutex); > /* we can safely leave the fs_devices entry around */ > return device; > } > > - name = rcu_string_strdup(path, GFP_NOFS); > + name = rcu_string_strdup(path, GFP_KERNEL); > + memalloc_nofs_restore(nofs_flag); > if (!name) { > btrfs_free_device(device); > mutex_unlock(&fs_devices->device_list_mutex); > @@ -1137,11 +1147,19 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) > /* We have held the volume lock, it is safe to get the devices. */ > list_for_each_entry(orig_dev, &orig->devices, dev_list) { > struct rcu_string *name; > + unsigned int nofs_flag; > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ > + nofs_flag = memalloc_nofs_save(); > device = btrfs_alloc_device(NULL, &orig_dev->devid, > orig_dev->uuid); > - if (IS_ERR(device)) > + if (IS_ERR(device)) { > + memalloc_nofs_restore(nofs_flag); > goto error; > + } > > /* > * This is ok to do without rcu read locked because we hold the > @@ -1151,12 +1169,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) > name = rcu_string_strdup(orig_dev->name->str, > GFP_KERNEL); > if (!name) { > + memalloc_nofs_restore(nofs_flag); > btrfs_free_device(device); > goto error; > } > rcu_assign_pointer(device->name, name); > } > > + memalloc_nofs_restore(nofs_flag); > list_add(&device->dev_list, &fs_devices->devices); > device->fs_devices = fs_devices; > fs_devices->num_devices++; > @@ -1262,6 +1282,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) > struct btrfs_fs_devices *fs_devices = device->fs_devices; > struct btrfs_device *new_device; > struct rcu_string *name; > + unsigned int nofs_flag; > > if (device->bdev) > fs_devices->open_devices--; > @@ -1277,17 +1298,23 @@ static void btrfs_close_one_device(struct btrfs_device *device) > > btrfs_close_bdev(device); > > + /* > + * Setup nofs context because we are holding the device list > + * mutex, which is required for a transaction commit. > + */ > + nofs_flag = memalloc_nofs_save(); > new_device = btrfs_alloc_device(NULL, &device->devid, > device->uuid); > BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ > > /* Safe because we are under uuid_mutex */ > if (device->name) { > - name = rcu_string_strdup(device->name->str, GFP_NOFS); > + name = rcu_string_strdup(device->name->str, GFP_KERNEL); > BUG_ON(!name); /* -ENOMEM */ > rcu_assign_pointer(new_device->name, name); > } > > + memalloc_nofs_restore(nofs_flag); > list_replace_rcu(&device->dev_list, &new_device->dev_list); > new_device->fs_devices = device->fs_devices; > >
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2576b1a379c9..663566baae78 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -988,20 +989,29 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + /* + * Setup nofs context because we are holding the device list + * mutex, which is required for a transaction commit. + */ + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid); if (IS_ERR(device)) { + memalloc_nofs_restore(nofs_flag); mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); + name = rcu_string_strdup(path, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!name) { btrfs_free_device(device); mutex_unlock(&fs_devices->device_list_mutex); @@ -1137,11 +1147,19 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; + unsigned int nofs_flag; + /* + * Setup nofs context because we are holding the device list + * mutex, which is required for a transaction commit. + */ + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid); - if (IS_ERR(device)) + if (IS_ERR(device)) { + memalloc_nofs_restore(nofs_flag); goto error; + } /* * This is ok to do without rcu read locked because we hold the @@ -1151,12 +1169,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + memalloc_nofs_restore(nofs_flag); btrfs_free_device(device); goto error; } rcu_assign_pointer(device->name, name); } + memalloc_nofs_restore(nofs_flag); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; @@ -1262,6 +1282,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) struct btrfs_fs_devices *fs_devices = device->fs_devices; struct btrfs_device *new_device; struct rcu_string *name; + unsigned int nofs_flag; if (device->bdev) fs_devices->open_devices--; @@ -1277,17 +1298,23 @@ static void btrfs_close_one_device(struct btrfs_device *device) btrfs_close_bdev(device); + /* + * Setup nofs context because we are holding the device list + * mutex, which is required for a transaction commit. + */ + nofs_flag = memalloc_nofs_save(); new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ /* Safe because we are under uuid_mutex */ if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); + name = rcu_string_strdup(device->name->str, GFP_KERNEL); BUG_ON(!name); /* -ENOMEM */ rcu_assign_pointer(new_device->name, name); } + memalloc_nofs_restore(nofs_flag); list_replace_rcu(&device->dev_list, &new_device->dev_list); new_device->fs_devices = device->fs_devices;