
btrfs: push relocation recovery into a helper thread

Message ID 0d7559af-b5ea-f725-1859-1c561984622f@suse.com (mailing list archive)
State New, archived

Commit Message

Jeff Mahoney April 17, 2018, 6:45 p.m. UTC
On a file system with many snapshots and qgroups enabled, an interrupted
balance can end up taking a long time to mount due to recovering the
relocations during mount.  It does this in the task performing the mount,
which can't be interrupted and may prevent mounting (and systems booting)
for a long time as well.  The thing is that as part of balance, this
runs in the background all the time.  This patch pushes the recovery
into a helper thread and allows the mount to continue normally.  We hold
off on resuming any paused balance operation until after the relocation
has completed, disallow any new balance operations if it's running, and
wait for it on umount and remounting read-only.

This doesn't do anything to address the relocation recovery operation
taking a long time but does allow the file system to mount.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/ctree.h      |    7 +++
 fs/btrfs/disk-io.c    |    7 ++-
 fs/btrfs/relocation.c |   92 +++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/super.c      |    5 +-
 fs/btrfs/volumes.c    |    6 +++
 5 files changed, 97 insertions(+), 20 deletions(-)

Comments

David Sterba April 23, 2018, 9:43 p.m. UTC | #1
On Tue, Apr 17, 2018 at 02:45:33PM -0400, Jeff Mahoney wrote:
> On a file system with many snapshots and qgroups enabled, an interrupted
> balance can end up taking a long time to mount due to recovering the
> relocations during mount.  It does this in the task performing the mount,
> which can't be interrupted and may prevent mounting (and systems booting)
> for a long time as well.  The thing is that as part of balance, this
> runs in the background all the time.  This patch pushes the recovery
> into a helper thread and allows the mount to continue normally.  We hold
> off on resuming any paused balance operation until after the relocation
> has completed, disallow any new balance operations if it's running, and
> wait for it on umount and remounting read-only.

The overall logic sounds ok.

So, this can also stall the umount, right? Eg. if I start mount,
relocation goes to background, then unmount will have to wait for
completion.

Balance pause is requested at umount time, something similar should be
possible for relocation recovery. The fs_state bit CLOSING could be
checked somewhere in the loop.
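
Something like this, just a sketch (assuming btrfs_fs_closing() is the
right helper for testing the CLOSING state; not tested, not part of the
posted patch):

	while (!list_empty(&reloc_roots)) {
		if (btrfs_fs_closing(fs_info)) {
			/*
			 * umount has started; leave the remaining reloc
			 * roots in place so the next mount resumes them
			 */
			break;
		}
		/* ... merge the next reloc root as before ... */
	}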

> This doesn't do anything to address the relocation recovery operation
> taking a long time but does allow the file system to mount.
> 
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/ctree.h      |    7 +++
>  fs/btrfs/disk-io.c    |    7 ++-
>  fs/btrfs/relocation.c |   92 +++++++++++++++++++++++++++++++++++++++++---------
>  fs/btrfs/super.c      |    5 +-
>  fs/btrfs/volumes.c    |    6 +++
>  5 files changed, 97 insertions(+), 20 deletions(-)
> 
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1052,6 +1052,10 @@ struct btrfs_fs_info {
>  	struct btrfs_work qgroup_rescan_work;
>  	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */
>  
> +	/* relocation recovery items */
> +	bool relocation_recovery_started;
> +	struct completion relocation_recovery_completion;

This seems to copy the pattern of qgroup rescan, the completion +
workqueue. I'm planning to move this scheme to an fs_state bit instead
of the bool and a wait_for_completion with a global workqueue, but for
now we can leave it as it is here.

> +
>  	/* filesystem state */
>  	unsigned long fs_state;
>  
> @@ -3671,7 +3675,8 @@ int btrfs_init_reloc_root(struct btrfs_t
>  			  struct btrfs_root *root);
>  int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
>  			    struct btrfs_root *root);
> -int btrfs_recover_relocation(struct btrfs_root *root);
> +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
> +void btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info);
>  int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
>  int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
>  			  struct btrfs_root *root, struct extent_buffer *buf,
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2999,7 +2999,7 @@ retry_root_backup:
>  			goto fail_qgroup;
>  
>  		mutex_lock(&fs_info->cleaner_mutex);
> -		ret = btrfs_recover_relocation(tree_root);
> +		ret = btrfs_recover_relocation(fs_info);
>  		mutex_unlock(&fs_info->cleaner_mutex);
>  		if (ret < 0) {
>  			btrfs_warn(fs_info, "failed to recover relocation: %d",
> @@ -3017,7 +3017,8 @@ retry_root_backup:
>  	if (IS_ERR(fs_info->fs_root)) {
>  		err = PTR_ERR(fs_info->fs_root);
>  		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
> -		goto fail_qgroup;
> +		close_ctree(fs_info);
> +		return err;
>  	}
>  
>  	if (sb_rdonly(sb))
> @@ -3778,6 +3779,8 @@ void close_ctree(struct btrfs_fs_info *f
>  	/* wait for the qgroup rescan worker to stop */
>  	btrfs_qgroup_wait_for_completion(fs_info, false);
>  
> +	btrfs_wait_for_relocation_completion(fs_info);
> +
>  	/* wait for the uuid_scan task to finish */
>  	down(&fs_info->uuid_tree_rescan_sem);
>  	/* avoid complains from lockdep et al., set sem back to initial state */
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -22,6 +22,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/rbtree.h>
>  #include <linux/slab.h>
> +#include <linux/kthread.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "transaction.h"
> @@ -4492,14 +4493,61 @@ static noinline_for_stack int mark_garba
>  }
>  
>  /*
> - * recover relocation interrupted by system crash.
> - *
>   * this function resumes merging reloc trees with corresponding fs trees.
>   * this is important for keeping the sharing of tree blocks
>   */
> -int btrfs_recover_relocation(struct btrfs_root *root)
> +static int
> +btrfs_resume_relocation(void *data)

Please keep the type and function name on one line.

>  {
> -	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_fs_info *fs_info = data;
> +	struct btrfs_trans_handle *trans;
> +	struct reloc_control *rc = fs_info->reloc_ctl;
> +	int err, ret;
> +
> +	btrfs_info(fs_info, "resuming relocation");
> +
> +	BUG_ON(!rc);
> +
> +	mutex_lock(&fs_info->cleaner_mutex);
> +
> +	merge_reloc_roots(rc);
> +
> +	unset_reloc_control(rc);
> +
> +	trans = btrfs_join_transaction(rc->extent_root);
> +	if (IS_ERR(trans))
> +		err = PTR_ERR(trans);
> +	else {
> +		ret = btrfs_commit_transaction(trans);
> +		if (ret < 0)
> +			err = ret;
> +	}
> +
> +	kfree(rc);
> +
> +	if (err == 0) {
> +		struct btrfs_root *fs_root;
> +
> +		/* cleanup orphan inode in data relocation tree */
> +		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
> +		if (IS_ERR(fs_root))
> +			err = PTR_ERR(fs_root);
> +		else
> +			err = btrfs_orphan_cleanup(fs_root);
> +	}
> +	mutex_unlock(&fs_info->cleaner_mutex);
> +	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

The part that sets the bit is in the caller, btrfs_recover_relocation,
but this can race if the kthread is too fast.

btrfs_recover_relocation
	start kthread with btrfs_resume_relocation
		clear_bit
	set_bit
	...

now we're stuck with the EXCL_OP set without any operation actually running.

The bit can be set right before the kthread is started and cleared
inside.
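
A sketch of that ordering (illustration only, not what the patch does;
error handling beyond clearing the bit is left out):

	/* claim the exclusive op before the thread can possibly clear it */
	set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	init_completion(&fs_info->relocation_recovery_completion);

	tsk = kthread_run(btrfs_resume_relocation, fs_info,
			  "btrfs-reloc-recovery");
	if (IS_ERR(tsk)) {
		/* the thread never ran, so nothing else will clear the bit */
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
		err = PTR_ERR(tsk);
		goto out_free;
	}
	fs_info->relocation_recovery_started = true;
	return 0;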

> +	complete_all(&fs_info->relocation_recovery_completion);
> +	return err;
> +}
> +
> +/*
> + * recover relocation interrupted by system crash.
> + * this function locates the relocation trees
> + */
> +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_root *tree_root = fs_info->tree_root;
>  	LIST_HEAD(reloc_roots);
>  	struct btrfs_key key;
>  	struct btrfs_root *fs_root;
> @@ -4508,9 +4556,12 @@ int btrfs_recover_relocation(struct btrf
>  	struct extent_buffer *leaf;
>  	struct reloc_control *rc = NULL;
>  	struct btrfs_trans_handle *trans;
> +	struct task_struct *tsk;
>  	int ret;
>  	int err = 0;
>  
> +	WARN_ON(!rwsem_is_locked(&fs_info->sb->s_umount));
> +
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> @@ -4521,8 +4572,7 @@ int btrfs_recover_relocation(struct btrf
>  	key.offset = (u64)-1;
>  
>  	while (1) {
> -		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
> -					path, 0, 0);
> +		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
>  		if (ret < 0) {
>  			err = ret;
>  			goto out;
> @@ -4540,7 +4590,7 @@ int btrfs_recover_relocation(struct btrf
>  		    key.type != BTRFS_ROOT_ITEM_KEY)
>  			break;
>  
> -		reloc_root = btrfs_read_fs_root(root, &key);
> +		reloc_root = btrfs_read_fs_root(tree_root, &key);
>  		if (IS_ERR(reloc_root)) {
>  			err = PTR_ERR(reloc_root);
>  			goto out;
> @@ -4620,16 +4670,21 @@ int btrfs_recover_relocation(struct btrf
>  	if (err)
>  		goto out_free;
>  
> -	merge_reloc_roots(rc);
> -
> -	unset_reloc_control(rc);
> -
> -	trans = btrfs_join_transaction(rc->extent_root);
> -	if (IS_ERR(trans)) {
> -		err = PTR_ERR(trans);
> +	tsk = kthread_run(btrfs_resume_relocation, fs_info,
> +			  "relocation-recovery");

Would be good to name it 'btrfs-reloc-recovery', i.e. with btrfs in the
name so it's easily greppable from the process list.

> +	if (IS_ERR(tsk)) {
> +		err = PTR_ERR(tsk);
>  		goto out_free;
>  	}
> -	err = btrfs_commit_transaction(trans);
> +
> +	fs_info->relocation_recovery_started = true;
> +
> +	/* protected from racing with resume thread by the cleaner_mutex */
> +	init_completion(&fs_info->relocation_recovery_completion);
> +
> +	set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

The 2nd comment above refers to this.

> +	return 0;
> +
>  out_free:
>  	kfree(rc);
>  out:
> @@ -4649,6 +4704,13 @@ out:
>  	return err;
>  }
>  
> +void
> +btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info)

type function(...)

> +{
> +	if (fs_info->relocation_recovery_started)
> +		wait_for_completion(&fs_info->relocation_recovery_completion);
> +}
> +
>  /*
>   * helper to add ordered checksum for data relocation.
>   *
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1767,7 +1767,6 @@ static inline void btrfs_remount_cleanup
>  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> -	struct btrfs_root *root = fs_info->tree_root;
>  	unsigned old_flags = sb->s_flags;
>  	unsigned long old_opts = fs_info->mount_opt;
>  	unsigned long old_compress_type = fs_info->compress_type;
> @@ -1834,6 +1833,8 @@ static int btrfs_remount(struct super_bl
>  		btrfs_scrub_cancel(fs_info);
>  		btrfs_pause_balance(fs_info);
>  
> +		btrfs_wait_for_relocation_completion(fs_info);
> +
>  		ret = btrfs_commit_super(fs_info);
>  		if (ret)
>  			goto restore;
> @@ -1867,7 +1868,7 @@ static int btrfs_remount(struct super_bl
>  
>  		/* recover relocation */
>  		mutex_lock(&fs_info->cleaner_mutex);
> -		ret = btrfs_recover_relocation(root);
> +		ret = btrfs_recover_relocation(fs_info);
>  		mutex_unlock(&fs_info->cleaner_mutex);
>  		if (ret)
>  			goto restore;
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4034,6 +4034,12 @@ static int balance_kthread(void *data)
>  	struct btrfs_fs_info *fs_info = data;
>  	int ret = 0;
>  
> +	if (fs_info->relocation_recovery_started) {
> +		btrfs_info(fs_info,
> +			   "waiting for relocation recovery before resuming balance");
> +		wait_for_completion(&fs_info->relocation_recovery_completion);
> +	}
> +
>  	mutex_lock(&fs_info->volume_mutex);
>  	mutex_lock(&fs_info->balance_mutex);
>  
> 
> -- 
> Jeff Mahoney
> SUSE Labs
Jeff Mahoney April 24, 2018, 6:03 p.m. UTC | #2
On 4/23/18 5:43 PM, David Sterba wrote:
> On Tue, Apr 17, 2018 at 02:45:33PM -0400, Jeff Mahoney wrote:
>> On a file system with many snapshots and qgroups enabled, an interrupted
>> balance can end up taking a long time to mount due to recovering the
>> relocations during mount.  It does this in the task performing the mount,
>> which can't be interrupted and may prevent mounting (and systems booting)
>> for a long time as well.  The thing is that as part of balance, this
>> runs in the background all the time.  This patch pushes the recovery
>> into a helper thread and allows the mount to continue normally.  We hold
>> off on resuming any paused balance operation until after the relocation
>> has completed, disallow any new balance operations if it's running, and
>> wait for it on umount and remounting read-only.
> 
> The overall logic sounds ok.

Thanks for the review.  I've fixed the style issues in my patch and
removed them from the quote below.

> So, this can also stall the umount, right? Eg. if I start mount,
> relocation goes to background, then unmount will have to wait for
> completion.

Yep, I didn't try to solve that problem since the file system wouldn't
even mount before.  It makes sense to let umount interrupt it, though.
That's a change that would probably speed up btrfs balance cancel as well.

> Balance pause is requested at umount time, something similar should be
> possible for relocation recovery. The fs_state bit CLOSING could be
> checked somewhere in the loop.

An earlier version had that check at the top of the loop in
merge_reloc_roots, but I think a better spot would be at the top of the
loop in merge_reloc_root.

>> This doesn't do anything to address the relocation recovery operation
>> taking a long time but does allow the file system to mount.
>>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>> ---
>>  fs/btrfs/ctree.h      |    7 +++
>>  fs/btrfs/disk-io.c    |    7 ++-
>>  fs/btrfs/relocation.c |   92 +++++++++++++++++++++++++++++++++++++++++---------
>>  fs/btrfs/super.c      |    5 +-
>>  fs/btrfs/volumes.c    |    6 +++
>>  5 files changed, 97 insertions(+), 20 deletions(-)
>>
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1052,6 +1052,10 @@ struct btrfs_fs_info {
>>  	struct btrfs_work qgroup_rescan_work;
>>  	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */
>>  
>> +	/* relocation recovery items */
>> +	bool relocation_recovery_started;
>> +	struct completion relocation_recovery_completion;
> 
> This seems to copy the pattern of qgroup rescan, the completion +
> workqueue. I'm planning to move this scheme to an fs_state bit instead
> of the bool and a wait_for_completion with a global workqueue, but for
> now we can leave it as it is here.

Such that we just put these jobs on a workqueue instead?
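
Roughly like this, I guess (just a sketch: relocation_recovery_work
would be a new fs_info member, and I'm using the generic workqueue API
here rather than the btrfs_work helpers):

static void btrfs_resume_relocation_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info,
			     relocation_recovery_work);

	/* same body as btrfs_resume_relocation() in the patch */
	complete_all(&fs_info->relocation_recovery_completion);
}

	/* and in btrfs_recover_relocation(), instead of kthread_run(): */
	INIT_WORK(&fs_info->relocation_recovery_work,
		  btrfs_resume_relocation_work);
	schedule_work(&fs_info->relocation_recovery_work);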

>> +	if (err == 0) {
>> +		struct btrfs_root *fs_root;
>> +
>> +		/* cleanup orphan inode in data relocation tree */
>> +		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
>> +		if (IS_ERR(fs_root))
>> +			err = PTR_ERR(fs_root);
>> +		else
>> +			err = btrfs_orphan_cleanup(fs_root);
>> +	}
>> +	mutex_unlock(&fs_info->cleaner_mutex);
>> +	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
> 
> The part that sets the bit is in the caller, btrfs_recover_relocation,
> but this can race if the kthread is too fast.
> 
> btrfs_recover_relocation
> 	start kthread with btrfs_resume_relocation
> 		clear_bit
> 	set_bit
> 	...
> 
> now we're stuck with the EXCL_OP set without any operation actually running.
> 
> The bit can be set right before the kthread is started and cleared
> inside.

There's no opportunity to race since the thread can't run until
btrfs_recover_relocation returns and releases the cleaner mutex.

>> @@ -4620,16 +4670,21 @@ int btrfs_recover_relocation(struct btrf
>>  	if (err)
>>  		goto out_free;
>>  
>> -	merge_reloc_roots(rc);
>> -
>> -	unset_reloc_control(rc);
>> -
>> -	trans = btrfs_join_transaction(rc->extent_root);
>> -	if (IS_ERR(trans)) {
>> -		err = PTR_ERR(trans);
>> +	tsk = kthread_run(btrfs_resume_relocation, fs_info,
>> +			  "relocation-recovery");
> 
> Would be good to name it 'btrfs-reloc-recovery', i.e. with btrfs in the
> name so it's easily greppable from the process list.

Right.  In an earlier version, I was using a btrfs_worker so that was
added automatically.

>> +	if (IS_ERR(tsk)) {
>> +		err = PTR_ERR(tsk);
>>  		goto out_free;
>>  	}
>> -	err = btrfs_commit_transaction(trans);
>> +
>> +	fs_info->relocation_recovery_started = true;
>> +
>> +	/* protected from racing with resume thread by the cleaner_mutex */
>> +	init_completion(&fs_info->relocation_recovery_completion);
>> +
>> +	set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
> 
> The 2nd comment above refers to this.

The response too.

-Jeff
Jeff Mahoney May 11, 2018, 4:20 a.m. UTC | #3
On 4/17/18 2:45 PM, Jeff Mahoney wrote:
> On a file system with many snapshots and qgroups enabled, an interrupted
> balance can end up taking a long time to mount due to recovering the
> relocations during mount.  It does this in the task performing the mount,
> which can't be interrupted and may prevent mounting (and systems booting)
> for a long time as well.  The thing is that as part of balance, this
> runs in the background all the time.  This patch pushes the recovery
> into a helper thread and allows the mount to continue normally.  We hold
> off on resuming any paused balance operation until after the relocation
> has completed, disallow any new balance operations if it's running, and
> wait for it on umount and remounting read-only.
> 
> This doesn't do anything to address the relocation recovery operation
> taking a long time but does allow the file system to mount.

I'm abandoning this patch.  The right fix is to fix qgroups.  The
workload that I was targeting takes seconds to recover without qgroups
in the way.

-Jeff

> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/ctree.h      |    7 +++
>  fs/btrfs/disk-io.c    |    7 ++-
>  fs/btrfs/relocation.c |   92 +++++++++++++++++++++++++++++++++++++++++---------
>  fs/btrfs/super.c      |    5 +-
>  fs/btrfs/volumes.c    |    6 +++
>  5 files changed, 97 insertions(+), 20 deletions(-)
> 
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1052,6 +1052,10 @@ struct btrfs_fs_info {
>  	struct btrfs_work qgroup_rescan_work;
>  	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */
>  
> +	/* relocation recovery items */
> +	bool relocation_recovery_started;
> +	struct completion relocation_recovery_completion;
> +
>  	/* filesystem state */
>  	unsigned long fs_state;
>  
> @@ -3671,7 +3675,8 @@ int btrfs_init_reloc_root(struct btrfs_t
>  			  struct btrfs_root *root);
>  int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
>  			    struct btrfs_root *root);
> -int btrfs_recover_relocation(struct btrfs_root *root);
> +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
> +void btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info);
>  int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
>  int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
>  			  struct btrfs_root *root, struct extent_buffer *buf,
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2999,7 +2999,7 @@ retry_root_backup:
>  			goto fail_qgroup;
>  
>  		mutex_lock(&fs_info->cleaner_mutex);
> -		ret = btrfs_recover_relocation(tree_root);
> +		ret = btrfs_recover_relocation(fs_info);
>  		mutex_unlock(&fs_info->cleaner_mutex);
>  		if (ret < 0) {
>  			btrfs_warn(fs_info, "failed to recover relocation: %d",
> @@ -3017,7 +3017,8 @@ retry_root_backup:
>  	if (IS_ERR(fs_info->fs_root)) {
>  		err = PTR_ERR(fs_info->fs_root);
>  		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
> -		goto fail_qgroup;
> +		close_ctree(fs_info);
> +		return err;
>  	}
>  
>  	if (sb_rdonly(sb))
> @@ -3778,6 +3779,8 @@ void close_ctree(struct btrfs_fs_info *f
>  	/* wait for the qgroup rescan worker to stop */
>  	btrfs_qgroup_wait_for_completion(fs_info, false);
>  
> +	btrfs_wait_for_relocation_completion(fs_info);
> +
>  	/* wait for the uuid_scan task to finish */
>  	down(&fs_info->uuid_tree_rescan_sem);
>  	/* avoid complains from lockdep et al., set sem back to initial state */
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -22,6 +22,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/rbtree.h>
>  #include <linux/slab.h>
> +#include <linux/kthread.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "transaction.h"
> @@ -4492,14 +4493,61 @@ static noinline_for_stack int mark_garba
>  }
>  
>  /*
> - * recover relocation interrupted by system crash.
> - *
>   * this function resumes merging reloc trees with corresponding fs trees.
>   * this is important for keeping the sharing of tree blocks
>   */
> -int btrfs_recover_relocation(struct btrfs_root *root)
> +static int
> +btrfs_resume_relocation(void *data)
>  {
> -	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_fs_info *fs_info = data;
> +	struct btrfs_trans_handle *trans;
> +	struct reloc_control *rc = fs_info->reloc_ctl;
> +	int err, ret;
> +
> +	btrfs_info(fs_info, "resuming relocation");
> +
> +	BUG_ON(!rc);
> +
> +	mutex_lock(&fs_info->cleaner_mutex);
> +
> +	merge_reloc_roots(rc);
> +
> +	unset_reloc_control(rc);
> +
> +	trans = btrfs_join_transaction(rc->extent_root);
> +	if (IS_ERR(trans))
> +		err = PTR_ERR(trans);
> +	else {
> +		ret = btrfs_commit_transaction(trans);
> +		if (ret < 0)
> +			err = ret;
> +	}
> +
> +	kfree(rc);
> +
> +	if (err == 0) {
> +		struct btrfs_root *fs_root;
> +
> +		/* cleanup orphan inode in data relocation tree */
> +		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
> +		if (IS_ERR(fs_root))
> +			err = PTR_ERR(fs_root);
> +		else
> +			err = btrfs_orphan_cleanup(fs_root);
> +	}
> +	mutex_unlock(&fs_info->cleaner_mutex);
> +	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
> +	complete_all(&fs_info->relocation_recovery_completion);
> +	return err;
> +}
> +
> +/*
> + * recover relocation interrupted by system crash.
> + * this function locates the relocation trees
> + */
> +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_root *tree_root = fs_info->tree_root;
>  	LIST_HEAD(reloc_roots);
>  	struct btrfs_key key;
>  	struct btrfs_root *fs_root;
> @@ -4508,9 +4556,12 @@ int btrfs_recover_relocation(struct btrf
>  	struct extent_buffer *leaf;
>  	struct reloc_control *rc = NULL;
>  	struct btrfs_trans_handle *trans;
> +	struct task_struct *tsk;
>  	int ret;
>  	int err = 0;
>  
> +	WARN_ON(!rwsem_is_locked(&fs_info->sb->s_umount));
> +
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> @@ -4521,8 +4572,7 @@ int btrfs_recover_relocation(struct btrf
>  	key.offset = (u64)-1;
>  
>  	while (1) {
> -		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
> -					path, 0, 0);
> +		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
>  		if (ret < 0) {
>  			err = ret;
>  			goto out;
> @@ -4540,7 +4590,7 @@ int btrfs_recover_relocation(struct btrf
>  		    key.type != BTRFS_ROOT_ITEM_KEY)
>  			break;
>  
> -		reloc_root = btrfs_read_fs_root(root, &key);
> +		reloc_root = btrfs_read_fs_root(tree_root, &key);
>  		if (IS_ERR(reloc_root)) {
>  			err = PTR_ERR(reloc_root);
>  			goto out;
> @@ -4620,16 +4670,21 @@ int btrfs_recover_relocation(struct btrf
>  	if (err)
>  		goto out_free;
>  
> -	merge_reloc_roots(rc);
> -
> -	unset_reloc_control(rc);
> -
> -	trans = btrfs_join_transaction(rc->extent_root);
> -	if (IS_ERR(trans)) {
> -		err = PTR_ERR(trans);
> +	tsk = kthread_run(btrfs_resume_relocation, fs_info,
> +			  "relocation-recovery");
> +	if (IS_ERR(tsk)) {
> +		err = PTR_ERR(tsk);
>  		goto out_free;
>  	}
> -	err = btrfs_commit_transaction(trans);
> +
> +	fs_info->relocation_recovery_started = true;
> +
> +	/* protected from racing with resume thread by the cleaner_mutex */
> +	init_completion(&fs_info->relocation_recovery_completion);
> +
> +	set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
> +	return 0;
> +
>  out_free:
>  	kfree(rc);
>  out:
> @@ -4649,6 +4704,13 @@ out:
>  	return err;
>  }
>  
> +void
> +btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info)
> +{
> +	if (fs_info->relocation_recovery_started)
> +		wait_for_completion(&fs_info->relocation_recovery_completion);
> +}
> +
>  /*
>   * helper to add ordered checksum for data relocation.
>   *
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1767,7 +1767,6 @@ static inline void btrfs_remount_cleanup
>  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> -	struct btrfs_root *root = fs_info->tree_root;
>  	unsigned old_flags = sb->s_flags;
>  	unsigned long old_opts = fs_info->mount_opt;
>  	unsigned long old_compress_type = fs_info->compress_type;
> @@ -1834,6 +1833,8 @@ static int btrfs_remount(struct super_bl
>  		btrfs_scrub_cancel(fs_info);
>  		btrfs_pause_balance(fs_info);
>  
> +		btrfs_wait_for_relocation_completion(fs_info);
> +
>  		ret = btrfs_commit_super(fs_info);
>  		if (ret)
>  			goto restore;
> @@ -1867,7 +1868,7 @@ static int btrfs_remount(struct super_bl
>  
>  		/* recover relocation */
>  		mutex_lock(&fs_info->cleaner_mutex);
> -		ret = btrfs_recover_relocation(root);
> +		ret = btrfs_recover_relocation(fs_info);
>  		mutex_unlock(&fs_info->cleaner_mutex);
>  		if (ret)
>  			goto restore;
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4034,6 +4034,12 @@ static int balance_kthread(void *data)
>  	struct btrfs_fs_info *fs_info = data;
>  	int ret = 0;
>  
> +	if (fs_info->relocation_recovery_started) {
> +		btrfs_info(fs_info,
> +			   "waiting for relocation recovery before resuming balance");
> +		wait_for_completion(&fs_info->relocation_recovery_completion);
> +	}
> +
>  	mutex_lock(&fs_info->volume_mutex);
>  	mutex_lock(&fs_info->balance_mutex);
>  
>

Patch

--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1052,6 +1052,10 @@  struct btrfs_fs_info {
 	struct btrfs_work qgroup_rescan_work;
 	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */
 
+	/* relocation recovery items */
+	bool relocation_recovery_started;
+	struct completion relocation_recovery_completion;
+
 	/* filesystem state */
 	unsigned long fs_state;
 
@@ -3671,7 +3675,8 @@  int btrfs_init_reloc_root(struct btrfs_t
 			  struct btrfs_root *root);
 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
+void btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct extent_buffer *buf,
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2999,7 +2999,7 @@  retry_root_backup:
 			goto fail_qgroup;
 
 		mutex_lock(&fs_info->cleaner_mutex);
-		ret = btrfs_recover_relocation(tree_root);
+		ret = btrfs_recover_relocation(fs_info);
 		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret < 0) {
 			btrfs_warn(fs_info, "failed to recover relocation: %d",
@@ -3017,7 +3017,8 @@  retry_root_backup:
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
-		goto fail_qgroup;
+		close_ctree(fs_info);
+		return err;
 	}
 
 	if (sb_rdonly(sb))
@@ -3778,6 +3779,8 @@  void close_ctree(struct btrfs_fs_info *f
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);
 
+	btrfs_wait_for_relocation_completion(fs_info);
+
 	/* wait for the uuid_scan task to finish */
 	down(&fs_info->uuid_tree_rescan_sem);
 	/* avoid complains from lockdep et al., set sem back to initial state */
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -22,6 +22,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -4492,14 +4493,61 @@  static noinline_for_stack int mark_garba
 }
 
 /*
- * recover relocation interrupted by system crash.
- *
  * this function resumes merging reloc trees with corresponding fs trees.
  * this is important for keeping the sharing of tree blocks
  */
-int btrfs_recover_relocation(struct btrfs_root *root)
+static int
+btrfs_resume_relocation(void *data)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_trans_handle *trans;
+	struct reloc_control *rc = fs_info->reloc_ctl;
+	int err, ret;
+
+	btrfs_info(fs_info, "resuming relocation");
+
+	BUG_ON(!rc);
+
+	mutex_lock(&fs_info->cleaner_mutex);
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root);
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else {
+		ret = btrfs_commit_transaction(trans);
+		if (ret < 0)
+			err = ret;
+	}
+
+	kfree(rc);
+
+	if (err == 0) {
+		struct btrfs_root *fs_root;
+
+		/* cleanup orphan inode in data relocation tree */
+		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+		if (IS_ERR(fs_root))
+			err = PTR_ERR(fs_root);
+		else
+			err = btrfs_orphan_cleanup(fs_root);
+	}
+	mutex_unlock(&fs_info->cleaner_mutex);
+	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+	complete_all(&fs_info->relocation_recovery_completion);
+	return err;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ * this function locates the relocation trees
+ */
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *tree_root = fs_info->tree_root;
 	LIST_HEAD(reloc_roots);
 	struct btrfs_key key;
 	struct btrfs_root *fs_root;
@@ -4508,9 +4556,12 @@  int btrfs_recover_relocation(struct btrf
 	struct extent_buffer *leaf;
 	struct reloc_control *rc = NULL;
 	struct btrfs_trans_handle *trans;
+	struct task_struct *tsk;
 	int ret;
 	int err = 0;
 
+	WARN_ON(!rwsem_is_locked(&fs_info->sb->s_umount));
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4521,8 +4572,7 @@  int btrfs_recover_relocation(struct btrf
 	key.offset = (u64)-1;
 
 	while (1) {
-		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
-					path, 0, 0);
+		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
 		if (ret < 0) {
 			err = ret;
 			goto out;
@@ -4540,7 +4590,7 @@  int btrfs_recover_relocation(struct btrf
 		    key.type != BTRFS_ROOT_ITEM_KEY)
 			break;
 
-		reloc_root = btrfs_read_fs_root(root, &key);
+		reloc_root = btrfs_read_fs_root(tree_root, &key);
 		if (IS_ERR(reloc_root)) {
 			err = PTR_ERR(reloc_root);
 			goto out;
@@ -4620,16 +4670,21 @@  int btrfs_recover_relocation(struct btrf
 	if (err)
 		goto out_free;
 
-	merge_reloc_roots(rc);
-
-	unset_reloc_control(rc);
-
-	trans = btrfs_join_transaction(rc->extent_root);
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
+	tsk = kthread_run(btrfs_resume_relocation, fs_info,
+			  "relocation-recovery");
+	if (IS_ERR(tsk)) {
+		err = PTR_ERR(tsk);
 		goto out_free;
 	}
-	err = btrfs_commit_transaction(trans);
+
+	fs_info->relocation_recovery_started = true;
+
+	/* protected from racing with resume thread by the cleaner_mutex */
+	init_completion(&fs_info->relocation_recovery_completion);
+
+	set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+	return 0;
+
 out_free:
 	kfree(rc);
 out:
@@ -4649,6 +4704,13 @@  out:
 	return err;
 }
 
+void
+btrfs_wait_for_relocation_completion(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->relocation_recovery_started)
+		wait_for_completion(&fs_info->relocation_recovery_completion);
+}
+
 /*
  * helper to add ordered checksum for data relocation.
  *
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1767,7 +1767,6 @@  static inline void btrfs_remount_cleanup
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	struct btrfs_root *root = fs_info->tree_root;
 	unsigned old_flags = sb->s_flags;
 	unsigned long old_opts = fs_info->mount_opt;
 	unsigned long old_compress_type = fs_info->compress_type;
@@ -1834,6 +1833,8 @@  static int btrfs_remount(struct super_bl
 		btrfs_scrub_cancel(fs_info);
 		btrfs_pause_balance(fs_info);
 
+		btrfs_wait_for_relocation_completion(fs_info);
+
 		ret = btrfs_commit_super(fs_info);
 		if (ret)
 			goto restore;
@@ -1867,7 +1868,7 @@  static int btrfs_remount(struct super_bl
 
 		/* recover relocation */
 		mutex_lock(&fs_info->cleaner_mutex);
-		ret = btrfs_recover_relocation(root);
+		ret = btrfs_recover_relocation(fs_info);
 		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret)
 			goto restore;
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4034,6 +4034,12 @@  static int balance_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;
 
+	if (fs_info->relocation_recovery_started) {
+		btrfs_info(fs_info,
+			   "waiting for relocation recovery before resuming balance");
+		wait_for_completion(&fs_info->relocation_recovery_completion);
+	}
+
 	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);