diff mbox

[RFC] Btrfs: fix full backref problem when inserting shared block reference

Message ID 20120809180405.GC32103@shiny (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Mason Aug. 9, 2012, 6:04 p.m. UTC
On Wed, Aug 08, 2012 at 09:10:17PM -0600, Miao Xie wrote:
> If we create several snapshots at the same time, the following BUG_ON() will be
> triggered.
> 
> 	kernel BUG at fs/btrfs/extent-tree.c:6047!
> 
> Steps to reproduce:
>  # mkfs.btrfs <partition>
>  # mount <partition> <mnt>
>  # cd <mnt>
>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
>  # for ((i=0; i<4; i++))
>  > do
>  > mkdir $i
>  > for ((j=0; j<200; j++))
>  > do
>  > btrfs sub snap . $i/$j
>  > done &
>  > done

snapshot creation has a critical section.  Once we copy a given root to
its snapshot, we're not allowed to change it until the transaction
is fully committed.

This means that if you're taking a snapshot of root A and storing the
directory item of the snapshot in root A, you can only do it once per
transaction with getting into trouble.

Looks like the code doesn't enforce this though.  Dave, could you please
give this a try:

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Miao Xie Aug. 10, 2012, 10:38 a.m. UTC | #1
On 	thu, 9 Aug 2012 14:04:05 -0400, Chris Mason wrote:
> On Wed, Aug 08, 2012 at 09:10:17PM -0600, Miao Xie wrote:
>> If we create several snapshots at the same time, the following BUG_ON() will be
>> triggered.
>>
>> 	kernel BUG at fs/btrfs/extent-tree.c:6047!
>>
>> Steps to reproduce:
>>  # mkfs.btrfs <partition>
>>  # mount <partition> <mnt>
>>  # cd <mnt>
>>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
>>  # for ((i=0; i<4; i++))
>>  > do
>>  > mkdir $i
>>  > for ((j=0; j<200; j++))
>>  > do
>>  > btrfs sub snap . $i/$j
>>  > done &
>>  > done
> 
> snapshot creation has a critical section.  Once we copy a given root to
> its snapshot, we're not allowed to change it until the transaction
> is fully committed.

I knew this critical section. But I think we can kick it away by forcing the
snapshoted tree to do COW.

BTW, I will take a vacation next week, so I can not reply it until the week after next.
If it is not urgent, I will continue looking into this problem after I come back.

Thanks
Miao

> 
> This means that if you're taking a snapshot of root A and storing the
> directory item of the snapshot in root A, you can only do it once per
> transaction with getting into trouble.
> 
> Looks like the code doesn't enforce this though.  Dave, could you please
> give this a try:
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index fcc8c21..9e7c621 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1324,6 +1324,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
>  	u64 search_start;
>  	int ret;
>  
> +	WARN_ON(root->danger_transid == trans->transid);
> +
>  	if (trans->transaction != root->fs_info->running_transaction) {
>  		printk(KERN_CRIT "trans %llu running %llu\n",
>  		       (unsigned long long)trans->transid,
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0d195b5..35b5603 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1490,6 +1490,12 @@ struct btrfs_root {
>  	u64 objectid;
>  	u64 last_trans;
>  
> +	/*
> +	 * the last transaction that took a snapshot of this
> +	 * root.  We're only allowed one snapshot per root per transaction
> +	 */
> +	u64 snapshot_trans;
> +
>  	/* data allocations are done in sectorsize units */
>  	u32 sectorsize;
>  
> @@ -1550,6 +1556,13 @@ struct btrfs_root {
>  
>  	int force_cow;
>  
> +	/*
> +	 * this marks the critical section of snapshot creation.  If we
> +	 * make any changes to a root after this critical section starts,
> +	 * we corrupt the FS.  It is checked by btrfs_cow_block
> +	 */
> +	u64 danger_transid;
> +
>  	spinlock_t root_times_lock;
>  };
>  
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 9df50fa..b20d835 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -524,13 +524,28 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
>  		pending_snapshot->inherit = *inherit;
>  		*inherit = NULL;	/* take responsibility to free it */
>  	}
> -
> +again:
>  	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
>  	if (IS_ERR(trans)) {
>  		ret = PTR_ERR(trans);
>  		goto fail;
>  	}
>  
> +	/* we're only allowed to snapshot a given root once per transaction */
> +	spin_lock(&root->fs_info->trans_lock);
> +	if (root->snapshot_trans == trans->transid) {
> +		spin_unlock(&root->fs_info->trans_lock);
> +		ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
> +		if (ret)
> +			goto fail;
> +		goto again;
> +	}
> +
> +	root->snapshot_trans = trans->transid;
> +
> +	spin_unlock(&root->fs_info->trans_lock);
> +
> +
>  	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
>  	BUG_ON(ret);
>  
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 27c2600..4507421 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -1093,6 +1093,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
>  
>  	/* see comments in should_cow_block() */
>  	root->force_cow = 1;
> +	root->danger_transid = trans->transid;
>  	smp_wmb();
>  
>  	btrfs_set_root_node(new_root_item, tmp);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Mason Aug. 10, 2012, 11:56 a.m. UTC | #2
On Fri, Aug 10, 2012 at 04:38:47AM -0600, Miao Xie wrote:
> On 	thu, 9 Aug 2012 14:04:05 -0400, Chris Mason wrote:
> > On Wed, Aug 08, 2012 at 09:10:17PM -0600, Miao Xie wrote:
> >> If we create several snapshots at the same time, the following BUG_ON() will be
> >> triggered.
> >>
> >> 	kernel BUG at fs/btrfs/extent-tree.c:6047!
> >>
> >> Steps to reproduce:
> >>  # mkfs.btrfs <partition>
> >>  # mount <partition> <mnt>
> >>  # cd <mnt>
> >>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
> >>  # for ((i=0; i<4; i++))
> >>  > do
> >>  > mkdir $i
> >>  > for ((j=0; j<200; j++))
> >>  > do
> >>  > btrfs sub snap . $i/$j
> >>  > done &
> >>  > done
> > 
> > snapshot creation has a critical section.  Once we copy a given root to
> > its snapshot, we're not allowed to change it until the transaction
> > is fully committed.
> 
> I knew this critical section. But I think we can kick it away by forcing the
> snapshoted tree to do COW.

Yes, it should be possible.

> 
> BTW, I will take a vacation next week, so I can not reply it until the week after next.
> If it is not urgent, I will continue looking into this problem after I come back.

No problem, we can take the current patch for now and improve add back
the ability to do multiple snapshots per root later.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Lyakas Jan. 30, 2013, 6:23 p.m. UTC | #3
Hi Miao,
I was following this thread in the past, but I did not understand it
fully, maybe you can explain?

>> >>  # mkfs.btrfs <partition>
>> >>  # mount <partition> <mnt>
>> >>  # cd <mnt>
>> >>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
>> >>  # for ((i=0; i<4; i++))
>> >>  > do
>> >>  > mkdir $i
>> >>  > for ((j=0; j<200; j++))
>> >>  > do
>> >>  > btrfs sub snap . $i/$j
>> >>  > done &
>> >>  > done
>> >
>> > snapshot creation has a critical section.  Once we copy a given root to
>> > its snapshot, we're not allowed to change it until the transaction
>> > is fully committed.

Is the limitation that if we are creating a snap B of root A, and
placing the root of B somewhere into the tree of A, then we can do
this only once per transaction? Does this limitation still exist or
your fix fixes it?

Also, according to your reproducer, each "btrfs sub snap" will
start/join a transaction, but then it will call
btrfs_commit_transaction() and not btrfs_commit_transaction_async(),
so it will wait until the transaction commits. So how it may happen
that you create more than one snap in the same transaction with your
reproducer?

The reason I am asking, is that I want to try to write code that
creates several snaps in one transaction and only then commits. Should
this be possible or there is some limitation, like I mentioned above?

Thanks for your help,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miao Xie Jan. 31, 2013, 2:42 a.m. UTC | #4
On Wed, 30 Jan 2013 20:23:22 +0200, Alex Lyakas wrote:
> Hi Miao,
> I was following this thread in the past, but I did not understand it
> fully, maybe you can explain?
> 
>>>>>  # mkfs.btrfs <partition>
>>>>>  # mount <partition> <mnt>
>>>>>  # cd <mnt>
>>>>>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
>>>>>  # for ((i=0; i<4; i++))
>>>>>  > do
>>>>>  > mkdir $i
>>>>>  > for ((j=0; j<200; j++))
>>>>>  > do
>>>>>  > btrfs sub snap . $i/$j
>>>>>  > done &
>>>>>  > done
>>>>
>>>> snapshot creation has a critical section.  Once we copy a given root to
>>>> its snapshot, we're not allowed to change it until the transaction
>>>> is fully committed.
> 
> Is the limitation that if we are creating a snap B of root A, and
> placing the root of B somewhere into the tree of A, then we can do
> this only once per transaction? Does this limitation still exist or
> your fix fixes it?

The limitation is the snapshoted subvolume can not be changed until the transaction
is committed. That is we can not insert anything(including the root of B and the
directory item/index of B) into the tree of A after snap B is created.

This limitation was fixed.

> Also, according to your reproducer, each "btrfs sub snap" will
> start/join a transaction, but then it will call
> btrfs_commit_transaction() and not btrfs_commit_transaction_async(),
> so it will wait until the transaction commits. So how it may happen
> that you create more than one snap in the same transaction with your
> reproducer?

run several tasks, and each task create snapshots repeatedly in its own
directory.
(If we create snapshots in the same directory, the i_mutex of the directory
 will make the process serialized)

> The reason I am asking, is that I want to try to write code that
> creates several snaps in one transaction and only then commits. Should
> this be possible or there is some limitation, like I mentioned above?

As far as I know, it is possible, there is no limitation now.

Thanks
Miao
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Lyakas Jan. 31, 2013, 1:06 p.m. UTC | #5
Thanks for your comments, Miao.

On Thu, Jan 31, 2013 at 4:42 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> On Wed, 30 Jan 2013 20:23:22 +0200, Alex Lyakas wrote:
>> Hi Miao,
>> I was following this thread in the past, but I did not understand it
>> fully, maybe you can explain?
>>
>>>>>>  # mkfs.btrfs <partition>
>>>>>>  # mount <partition> <mnt>
>>>>>>  # cd <mnt>
>>>>>>  # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
>>>>>>  # for ((i=0; i<4; i++))
>>>>>>  > do
>>>>>>  > mkdir $i
>>>>>>  > for ((j=0; j<200; j++))
>>>>>>  > do
>>>>>>  > btrfs sub snap . $i/$j
>>>>>>  > done &
>>>>>>  > done
>>>>>
>>>>> snapshot creation has a critical section.  Once we copy a given root to
>>>>> its snapshot, we're not allowed to change it until the transaction
>>>>> is fully committed.
>>
>> Is the limitation that if we are creating a snap B of root A, and
>> placing the root of B somewhere into the tree of A, then we can do
>> this only once per transaction? Does this limitation still exist or
>> your fix fixes it?
>
> The limitation is the snapshoted subvolume can not be changed until the transaction
> is committed. That is we can not insert anything(including the root of B and the
> directory item/index of B) into the tree of A after snap B is created.
>
> This limitation was fixed.
>
>> Also, according to your reproducer, each "btrfs sub snap" will
>> start/join a transaction, but then it will call
>> btrfs_commit_transaction() and not btrfs_commit_transaction_async(),
>> so it will wait until the transaction commits. So how it may happen
>> that you create more than one snap in the same transaction with your
>> reproducer?
>
> run several tasks, and each task create snapshots repeatedly in its own
> directory.
> (If we create snapshots in the same directory, the i_mutex of the directory
>  will make the process serialized)

So I missed the fact that btrfs_start_transaction() can actually join
an existing transaction. So if several threads ask to create snaps in
parallel, we may hit this.

>
>> The reason I am asking, is that I want to try to write code that
>> creates several snaps in one transaction and only then commits. Should
>> this be possible or there is some limitation, like I mentioned above?
>
> As far as I know, it is possible, there is no limitation now.
>
> Thanks
> Miao
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fcc8c21..9e7c621 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1324,6 +1324,8 @@  noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	u64 search_start;
 	int ret;
 
+	WARN_ON(root->danger_transid == trans->transid);
+
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %llu running %llu\n",
 		       (unsigned long long)trans->transid,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d195b5..35b5603 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1490,6 +1490,12 @@  struct btrfs_root {
 	u64 objectid;
 	u64 last_trans;
 
+	/*
+	 * the last transaction that took a snapshot of this
+	 * root.  We're only allowed one snapshot per root per transaction
+	 */
+	u64 snapshot_trans;
+
 	/* data allocations are done in sectorsize units */
 	u32 sectorsize;
 
@@ -1550,6 +1556,13 @@  struct btrfs_root {
 
 	int force_cow;
 
+	/*
+	 * this marks the critical section of snapshot creation.  If we
+	 * make any changes to a root after this critical section starts,
+	 * we corrupt the FS.  It is checked by btrfs_cow_block
+	 */
+	u64 danger_transid;
+
 	spinlock_t root_times_lock;
 };
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa..b20d835 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -524,13 +524,28 @@  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		pending_snapshot->inherit = *inherit;
 		*inherit = NULL;	/* take responsibility to free it */
 	}
-
+again:
 	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto fail;
 	}
 
+	/* we're only allowed to snapshot a given root once per transaction */
+	spin_lock(&root->fs_info->trans_lock);
+	if (root->snapshot_trans == trans->transid) {
+		spin_unlock(&root->fs_info->trans_lock);
+		ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
+		if (ret)
+			goto fail;
+		goto again;
+	}
+
+	root->snapshot_trans = trans->transid;
+
+	spin_unlock(&root->fs_info->trans_lock);
+
+
 	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
 	BUG_ON(ret);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c2600..4507421 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1093,6 +1093,7 @@  static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
+	root->danger_transid = trans->transid;
 	smp_wmb();
 
 	btrfs_set_root_node(new_root_item, tmp);