diff mbox

Btrfs: make sure logged extents complete in the current transaction

Message ID 1416349181-15023-1-git-send-email-jbacik@fb.com (mailing list archive)
State Superseded
Headers show

Commit Message

Josef Bacik Nov. 18, 2014, 10:19 p.m. UTC
Liu Bo pointed out that my previous fix would lose the generation update in the
scenario I described.  It is actually much worse than that, we could lose the
entire extent if we lose power right after the transaction commits.  Consider
the following

write extent 0-4k
log extent in log tree
commit transaction
	< power fail happens here
ordered extent completes

We would lose the 0-4k extent because it hasn't updated the actual fs tree, and
the transaction commit will reset the log so it isn't replayed.  If we lose
power before the transaction commit we are save, otherwise we are not.

Fix this by keeping track of all extents we logged in this transaction.  Then
when we go to commit the transaction make sure we wait for all of those ordered
extents to complete before proceeding.  This will make sure that if we lose
power after the transaction commit we still have our data.  This also fixes the
problem of the improperly updated extent generation.  Thanks,

cc: stable@vger.kernel.org
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/ordered-data.c |  6 ++++--
 fs/btrfs/ordered-data.h |  6 +++++-
 fs/btrfs/transaction.c  | 33 +++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h  |  2 ++
 fs/btrfs/tree-log.c     |  6 +++---
 5 files changed, 47 insertions(+), 6 deletions(-)

Comments

Liu Bo Nov. 19, 2014, 8:28 a.m. UTC | #1
On Tue, Nov 18, 2014 at 05:19:41PM -0500, Josef Bacik wrote:
> Liu Bo pointed out that my previous fix would lose the generation update in the
> scenario I described.  It is actually much worse than that, we could lose the
> entire extent if we lose power right after the transaction commits.  Consider
> the following
> 
> write extent 0-4k
> log extent in log tree
> commit transaction
> 	< power fail happens here
> ordered extent completes
> 
> We would lose the 0-4k extent because it hasn't updated the actual fs tree, and
> the transaction commit will reset the log so it isn't replayed.  If we lose
> power before the transaction commit we are save, otherwise we are not.
> 
> Fix this by keeping track of all extents we logged in this transaction.  Then
> when we go to commit the transaction make sure we wait for all of those ordered
> extents to complete before proceeding.  This will make sure that if we lose
> power after the transaction commit we still have our data.  This also fixes the
> problem of the improperly updated extent generation.  Thanks,

This looks saner.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

thanks,
-liubo

> 
> cc: stable@vger.kernel.org
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/ordered-data.c |  6 ++++--
>  fs/btrfs/ordered-data.h |  6 +++++-
>  fs/btrfs/transaction.c  | 33 +++++++++++++++++++++++++++++++++
>  fs/btrfs/transaction.h  |  2 ++
>  fs/btrfs/tree-log.c     |  6 +++---
>  5 files changed, 47 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index ac734ec..7c2dd7a 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
>  	INIT_LIST_HEAD(&entry->work_list);
>  	init_completion(&entry->completion);
>  	INIT_LIST_HEAD(&entry->log_list);
> +	INIT_LIST_HEAD(&entry->trans_list);
>  
>  	trace_btrfs_ordered_extent_add(inode, entry);
>  
> @@ -472,7 +473,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
>  	spin_unlock_irq(&log->log_extents_lock[index]);
>  }
>  
> -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
> +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
> +			       struct btrfs_root *log, u64 transid)
>  {
>  	struct btrfs_ordered_extent *ordered;
>  	int index = transid % 2;
> @@ -497,7 +499,7 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
>  		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
>  						   &ordered->flags));
>  
> -		btrfs_put_ordered_extent(ordered);
> +		list_add_tail(&ordered->trans_list, &trans->ordered);
>  		spin_lock_irq(&log->log_extents_lock[index]);
>  	}
>  	spin_unlock_irq(&log->log_extents_lock[index]);
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index d81a274..171a841 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -121,6 +121,9 @@ struct btrfs_ordered_extent {
>  	/* If we need to wait on this to be done */
>  	struct list_head log_list;
>  
> +	/* If the transaction needs to wait on this ordered extent */
> +	struct list_head trans_list;
> +
>  	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
>  	wait_queue_head_t wait;
>  
> @@ -197,7 +200,8 @@ void btrfs_get_logged_extents(struct inode *inode,
>  void btrfs_put_logged_extents(struct list_head *logged_list);
>  void btrfs_submit_logged_extents(struct list_head *logged_list,
>  				 struct btrfs_root *log);
> -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
> +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
> +			       struct btrfs_root *log, u64 transid);
>  void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
>  int __init ordered_data_init(void);
>  void ordered_data_exit(void);
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index dcaae36..63c6d05 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -220,6 +220,7 @@ loop:
>  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
>  	INIT_LIST_HEAD(&cur_trans->pending_chunks);
>  	INIT_LIST_HEAD(&cur_trans->switch_commits);
> +	INIT_LIST_HEAD(&cur_trans->pending_ordered);
>  	list_add_tail(&cur_trans->list, &fs_info->trans_list);
>  	extent_io_tree_init(&cur_trans->dirty_pages,
>  			     fs_info->btree_inode->i_mapping);
> @@ -488,6 +489,7 @@ again:
>  	h->sync = false;
>  	INIT_LIST_HEAD(&h->qgroup_ref_list);
>  	INIT_LIST_HEAD(&h->new_bgs);
> +	INIT_LIST_HEAD(&h->ordered);
>  
>  	smp_mb();
>  	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
> @@ -719,6 +721,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>  	if (!list_empty(&trans->new_bgs))
>  		btrfs_create_pending_block_groups(trans, root);
>  
> +	if (!list_empty(&trans->ordered)) {
> +		spin_lock(&info->trans_lock);
> +		list_splice(&trans->ordered, &cur_trans->pending_ordered);
> +		spin_unlock(&info->trans_lock);
> +	}
> +
>  	trans->delayed_ref_updates = 0;
>  	if (!trans->sync) {
>  		must_run_delayed_refs =
> @@ -1652,6 +1660,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
>  		btrfs_wait_ordered_roots(fs_info, -1);
>  }
>  
> +static inline void
> +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
> +			   struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_ordered_extent *ordered;
> +
> +	spin_lock(&fs_info->trans_lock);
> +	while (!list_empty(&cur_trans->pending_ordered)) {
> +		ordered = list_first_entry(&cur_trans->pending_ordered,
> +					   struct btrfs_ordered_extent,
> +					   trans_list);
> +		list_del_init(&ordered->trans_list);
> +		spin_unlock(&fs_info->trans_lock);
> +
> +		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
> +						   &ordered->flags));
> +		btrfs_put_ordered_extent(ordered);
> +		spin_lock(&fs_info->trans_lock);
> +	}
> +	spin_unlock(&fs_info->trans_lock);
> +}
> +
>  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  			     struct btrfs_root *root)
>  {
> @@ -1702,6 +1732,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  	}
>  
>  	spin_lock(&root->fs_info->trans_lock);
> +	list_splice(&trans->ordered, &cur_trans->pending_ordered);
>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
>  		spin_unlock(&root->fs_info->trans_lock);
>  		atomic_inc(&cur_trans->use_count);
> @@ -1754,6 +1785,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  
>  	btrfs_wait_delalloc_flush(root->fs_info);
>  
> +	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
> +
>  	btrfs_scrub_pause(root);
>  	/*
>  	 * Ok now we need to make sure to block out any other joins while we
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index d8f40e1..1ba9c3e 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -56,6 +56,7 @@ struct btrfs_transaction {
>  	wait_queue_head_t commit_wait;
>  	struct list_head pending_snapshots;
>  	struct list_head pending_chunks;
> +	struct list_head pending_ordered;
>  	struct list_head switch_commits;
>  	struct btrfs_delayed_ref_root delayed_refs;
>  	int aborted;
> @@ -105,6 +106,7 @@ struct btrfs_trans_handle {
>  	 */
>  	struct btrfs_root *root;
>  	struct seq_list delayed_ref_elem;
> +	struct list_head ordered;
>  	struct list_head qgroup_ref_list;
>  	struct list_head new_bgs;
>  };
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 70f99b1..337e4bc 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -2600,7 +2600,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>  	if (atomic_read(&log_root_tree->log_commit[index2])) {
>  		blk_finish_plug(&plug);
>  		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
> -		btrfs_wait_logged_extents(log, log_transid);
> +		btrfs_wait_logged_extents(trans, log, log_transid);
>  		wait_log_commit(trans, log_root_tree,
>  				root_log_ctx.log_transid);
>  		mutex_unlock(&log_root_tree->log_mutex);
> @@ -2645,7 +2645,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>  	btrfs_wait_marked_extents(log_root_tree,
>  				  &log_root_tree->dirty_log_pages,
>  				  EXTENT_NEW | EXTENT_DIRTY);
> -	btrfs_wait_logged_extents(log, log_transid);
> +	btrfs_wait_logged_extents(trans, log, log_transid);
>  
>  	btrfs_set_super_log_root(root->fs_info->super_for_commit,
>  				log_root_tree->node->start);
> @@ -3766,7 +3766,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
>  	fi = btrfs_item_ptr(leaf, path->slots[0],
>  			    struct btrfs_file_extent_item);
>  
> -	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
> +	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
>  					       &token);
>  	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
>  		btrfs_set_token_file_extent_type(leaf, fi,
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec..7c2dd7a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@  static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	INIT_LIST_HEAD(&entry->work_list);
 	init_completion(&entry->completion);
 	INIT_LIST_HEAD(&entry->log_list);
+	INIT_LIST_HEAD(&entry->trans_list);
 
 	trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -472,7 +473,8 @@  void btrfs_submit_logged_extents(struct list_head *logged_list,
 	spin_unlock_irq(&log->log_extents_lock[index]);
 }
 
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid)
 {
 	struct btrfs_ordered_extent *ordered;
 	int index = transid % 2;
@@ -497,7 +499,7 @@  void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		btrfs_put_ordered_extent(ordered);
+		list_add_tail(&ordered->trans_list, &trans->ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274..171a841 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -121,6 +121,9 @@  struct btrfs_ordered_extent {
 	/* If we need to wait on this to be done */
 	struct list_head log_list;
 
+	/* If the transaction needs to wait on this ordered extent */
+	struct list_head trans_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
 
@@ -197,7 +200,8 @@  void btrfs_get_logged_extents(struct inode *inode,
 void btrfs_put_logged_extents(struct list_head *logged_list);
 void btrfs_submit_logged_extents(struct list_head *logged_list,
 				 struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae36..63c6d05 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@  loop:
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
+	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
@@ -488,6 +489,7 @@  again:
 	h->sync = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
+	INIT_LIST_HEAD(&h->ordered);
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +721,12 @@  static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
+	if (!list_empty(&trans->ordered)) {
+		spin_lock(&info->trans_lock);
+		list_splice(&trans->ordered, &cur_trans->pending_ordered);
+		spin_unlock(&info->trans_lock);
+	}
+
 	trans->delayed_ref_updates = 0;
 	if (!trans->sync) {
 		must_run_delayed_refs =
@@ -1652,6 +1660,28 @@  static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 		btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+			   struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -1702,6 +1732,7 @@  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&root->fs_info->trans_lock);
+	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
 		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1785,8 @@  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_wait_delalloc_flush(root->fs_info);
 
+	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
 	btrfs_scrub_pause(root);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1..1ba9c3e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@  struct btrfs_transaction {
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
 	struct list_head pending_chunks;
+	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
@@ -105,6 +106,7 @@  struct btrfs_trans_handle {
 	 */
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
+	struct list_head ordered;
 	struct list_head qgroup_ref_list;
 	struct list_head new_bgs;
 };
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 70f99b1..337e4bc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2600,7 +2600,7 @@  int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-		btrfs_wait_logged_extents(log, log_transid);
+		btrfs_wait_logged_extents(trans, log, log_transid);
 		wait_log_commit(trans, log_root_tree,
 				root_log_ctx.log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2645,7 +2645,7 @@  int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_wait_marked_extents(log_root_tree,
 				  &log_root_tree->dirty_log_pages,
 				  EXTENT_NEW | EXTENT_DIRTY);
-	btrfs_wait_logged_extents(log, log_transid);
+	btrfs_wait_logged_extents(trans, log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
@@ -3766,7 +3766,7 @@  static int log_one_extent(struct btrfs_trans_handle *trans,
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 
-	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
 					       &token);
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 		btrfs_set_token_file_extent_type(leaf, fi,