[RFC] btrfs: csum: Introduce partial csum for tree block.
diff mbox

Message ID 1434078015-8868-1-git-send-email-quwenruo@cn.fujitsu.com
State New
Headers show

Commit Message

Qu Wenruo June 12, 2015, 3 a.m. UTC
Introduce the new partial csum mechanism for tree block.

[Old tree block csum]
0     4     8    12    16    20    24    28    32
-------------------------------------------------
|csum |   unused, all 0				|
-------------------------------------------------
Csum is the crc32 of the whole tree block data.

[New tree block csum]
-------------------------------------------------
|csum0|csum1|csum2|csum3|csum4|csum5|csum6|csum7|
-------------------------------------------------
Where csum0 is the same as the old one, crc32 of the whole tree block
data.

But csum1~csum7 will store the crc32 of each eighth part.
Take example of 16K leafsize, then:
csum1: crc32 of BTRFS_CSUM_SIZE~4K
csum2: crc32 of 4K~6K
...
csum7: crc32 of 14K~16K

This provides the ability for btrfs not only to detect corruption but
also to know where corruption is.
Further improve the robustness of btrfs.

Although the best practice is to introduce a new csum type and put every
eighth crc32 into its corresponding place, the benefit is not worth
breaking the backward compatibility.
So keep csum0 and modify csum1's range to keep backward compatibility.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
TODO:
  Add scrub support to use partial csum to rebuild metadata.
  This is a little hard as current scrub doesn't treat a tree block as a whole.
  Needs a lot of other improvements for scrub.
---
 fs/btrfs/disk-io.c | 75 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 25 deletions(-)

Comments

Liu Bo June 12, 2015, 2:10 p.m. UTC | #1
On Fri, Jun 12, 2015 at 11:00:15AM +0800, Qu Wenruo wrote:
> Introduce the new partial csum mechanism for tree block.
> 
> [Old tree block csum]
> 0     4     8    12    16    20    24    28    32
> -------------------------------------------------
> |csum |   unused, all 0				|
> -------------------------------------------------
> Csum is the crc32 of the whole tree block data.
> 
> [New tree block csum]
> -------------------------------------------------
> |csum0|csum1|csum2|csum3|csum4|csum5|csum6|csum7|
> -------------------------------------------------
> Where csum0 is the same as the old one, crc32 of the whole tree block
> data.
> 
> But csum1~csum7 will restore crc32 of each eighth part.
> Take example of 16K leafsize, then:
> csum1: crc32 of BTRFS_CSUM_SIZE~4K
> csum2: crc32 of 4K~6K
> ...
> csum7: crc32 of 14K~16K
> 
> This provides the ability for btrfs not only to detect corruption but
> also to know where corruption is.
> Further improve the robustness of btrfs.

What of the use to know where corruption locates?  It still turns to
find a good copy of this block (if there is) and recovery from it,
doesn't it?

Can you eleborate more on how to improve the robustness?

> 
> Although the best practise is to introduce new csum type and put every
> eighth crc32 into corresponding place, but the benefit is not worthy to
> break the backward compatibility.
> So keep csum0 and modify csum1 range to keep backward compatibility.
> 
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
> TODO:
>   Add scrub support to use partial csum to rebuild metadata.
>   This is a little hard as current scrub don't treat tree block as a whole.
>   Needs a lot of other improvement for scrub.
> ---
>  fs/btrfs/disk-io.c | 75 ++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 50 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 7f83778..54052d3 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -271,47 +271,76 @@ void btrfs_csum_final(u32 crc, char *result)
>  }
>  
>  /*
> - * compute the csum for a btree block, and either verify it or write it
> - * into the csum field of the block.
> + * Calculate partial crc32 for each part.
> + *
> + * Part should be in [0, 7].
> + * Part 0 is the old crc32 of the whole leaf/node.
> + * Part 1 is the crc32 of 32~ 2/8 of leaf/node.
> + * Part 2 is the crc32 of 3/8 of leaf/node.
> + * Part 3 is the crc32 of 4/8 of lean/node and so on.

typo here.

Thanks,

-liubo
>   */
> -static int csum_tree_block(struct btrfs_fs_info *fs_info,
> -			   struct extent_buffer *buf,
> -			   int verify)
> +static int csum_tree_block_part(struct extent_buffer *buf,
> +				char *result, int part)
>  {
> -	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
> -	char *result = NULL;
> +	int offset;
> +	int err;
>  	unsigned long len;
>  	unsigned long cur_len;
> -	unsigned long offset = BTRFS_CSUM_SIZE;
> -	char *kaddr;
>  	unsigned long map_start;
>  	unsigned long map_len;
> -	int err;
> +	char *kaddr;
>  	u32 crc = ~(u32)0;
> -	unsigned long inline_result;
>  
> -	len = buf->len - offset;
> +	BUG_ON(part >= 8 || part < 0);
> +	BUG_ON(ALIGN(buf->len, 8) != buf->len);
> +
> +	if (part == 0) {
> +		offset = BTRFS_CSUM_SIZE;
> +		len = buf->len - offset;
> +	} else if (part == 1) {
> +		offset = BTRFS_CSUM_SIZE;
> +		len = buf->len * 2 / 8 - offset;
> +	} else {
> +		offset = part * buf->len / 8;
> +		len = buf->len / 8;
> +	}
> +
>  	while (len > 0) {
>  		err = map_private_extent_buffer(buf, offset, 32,
>  					&kaddr, &map_start, &map_len);
>  		if (err)
> -			return 1;
> +			return err;
>  		cur_len = min(len, map_len - (offset - map_start));
>  		crc = btrfs_csum_data(kaddr + offset - map_start,
>  				      crc, cur_len);
>  		len -= cur_len;
>  		offset += cur_len;
>  	}
> -	if (csum_size > sizeof(inline_result)) {
> -		result = kzalloc(csum_size, GFP_NOFS);
> -		if (!result)
> +	btrfs_csum_final(crc, result + BTRFS_CSUM_SIZE * part / 8);
> +	return 0;
> +}
> +
> +/*
> + * compute the csum for a btree block, and either verify it or write it
> + * into the csum field of the block.
> + */
> +static int csum_tree_block(struct btrfs_fs_info *fs_info,
> +			   struct extent_buffer *buf,
> +			   int verify)
> +{
> +	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
> +	char result[BTRFS_CSUM_SIZE] = {0};
> +	int err;
> +	u32 crc = ~(u32)0;
> +	int index = 0;
> +
> +	/* get every part csum */
> +	for (index = 0; index < 8; index++) {
> +		err = csum_tree_block_part(buf, result, index);
> +		if (err)
>  			return 1;
> -	} else {
> -		result = (char *)&inline_result;
>  	}
>  
> -	btrfs_csum_final(crc, result);
> -
>  	if (verify) {
>  		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
>  			u32 val;
> @@ -324,15 +353,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>  				"level %d\n",
>  				fs_info->sb->s_id, buf->start,
>  				val, found, btrfs_header_level(buf));
> -			if (result != (char *)&inline_result)
> -				kfree(result);
>  			return 1;
>  		}
>  	} else {
> -		write_extent_buffer(buf, result, 0, csum_size);
> +		write_extent_buffer(buf, result, 0, BTRFS_CSUM_SIZE);
>  	}
> -	if (result != (char *)&inline_result)
> -		kfree(result);
>  	return 0;
>  }
>  
> -- 
> 2.4.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Mason June 12, 2015, 4:23 p.m. UTC | #2
On 06/11/2015 11:00 PM, Qu Wenruo wrote:
> Introduce the new partial csum mechanism for tree block.
> 
> [Old tree block csum]
> 0     4     8    12    16    20    24    28    32
> -------------------------------------------------
> |csum |   unused, all 0				|
> -------------------------------------------------
> Csum is the crc32 of the whole tree block data.
> 
> [New tree block csum]
> -------------------------------------------------
> |csum0|csum1|csum2|csum3|csum4|csum5|csum6|csum7|
> -------------------------------------------------
> Where csum0 is the same as the old one, crc32 of the whole tree block
> data.
> 
> But csum1~csum7 will restore crc32 of each eighth part.
> Take example of 16K leafsize, then:
> csum1: crc32 of BTRFS_CSUM_SIZE~4K
> csum2: crc32 of 4K~6K
> ...
> csum7: crc32 of 14K~16K
> 
> This provides the ability for btrfs not only to detect corruption but
> also to know where corruption is.
> Further improve the robustness of btrfs.
> 
> Although the best practise is to introduce new csum type and put every
> eighth crc32 into corresponding place, but the benefit is not worthy to
> break the backward compatibility.
> So keep csum0 and modify csum1 range to keep backward compatibility.

I do like how you're maintaining compatibility here, but I'm curious if
you have data about situations this is likely to help?  Is there a
particular kind of corruption you're targeting?

Or is the goal to prevent tossing the whole block, and try to limit it
to a smaller set of items in a node?

-chris

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 15, 2015, 8:02 a.m. UTC | #3
> On 06/11/2015 11:00 PM, Qu Wenruo wrote:
>> Introduce the new partial csum mechanism for tree block.
>>
>> [Old tree block csum]
>> 0     4     8    12    16    20    24    28    32
>> -------------------------------------------------
>> |csum |   unused, all 0				|
>> -------------------------------------------------
>> Csum is the crc32 of the whole tree block data.
>>
>> [New tree block csum]
>> -------------------------------------------------
>> |csum0|csum1|csum2|csum3|csum4|csum5|csum6|csum7|
>> -------------------------------------------------
>> Where csum0 is the same as the old one, crc32 of the whole tree block
>> data.
>>
>> But csum1~csum7 will restore crc32 of each eighth part.
>> Take example of 16K leafsize, then:
>> csum1: crc32 of BTRFS_CSUM_SIZE~4K
>> csum2: crc32 of 4K~6K
>> ...
>> csum7: crc32 of 14K~16K
>>
>> This provides the ability for btrfs not only to detect corruption but
>> also to know where corruption is.
>> Further improve the robustness of btrfs.
>>
>> Although the best practise is to introduce new csum type and put every
>> eighth crc32 into corresponding place, but the benefit is not worthy to
>> break the backward compatibility.
>> So keep csum0 and modify csum1 range to keep backward compatibility.
>
> I do like how you're maintaining compatibility here, but I'm curious if
> you have data about situations this is likely to help?  Is there a
> particular kind of corruption you're targeting?
>
> Or is the goal to prevent tossing the whole block, and try to limit it
> to a smaller set of items in a node?
>
> -chris
>
To both Chris and Liu,

In the following case of corruption, RAID1 or DUP will fail to recover
it(Use 16K as leafsize)
0		4K		8K		12K		16K
Mirror 0:
|<-OK---------->|<----ERROR---->|<-----------------OK------------->|

Mirror 1:
|<----------------------------OK--------------->|<------Error----->|

Since the CRC32 stored in header is calculated for the whole leaf,
so both will fail the CRC32 check.

But the corruptions are in different positions; in fact, if we know where
the corruption is (it does not need to be very accurate), we can recover
the tree block by using the correct parts.

In above example, we can just use the correct 0~12K from mirror 1
and then 12K~16K from mirror 0.

And in my patch, since csum1~7 is the csum for each 1/8 parts
(except csum1), so csum1~5 in mirror 1 should pass the CRC32 check,
and csum6~7 in mirror 0 should pass too.

And scrub (or read_tree_block?) should be able to repair the tree block
using the correct parts.
The repair patches are still under coding as it's much harder to
implement with current scrub codes.

Yes, this corruption case may be minor enough, since even corruption in
one mirror is rare enough.
So I didn't introduce a new CRC32 checksum, but use the extra 32-4 bytes
to store the partial CRC32 to keep the backward compatibility.

Thanks,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 15, 2015, 1:15 p.m. UTC | #4
On Mon, Jun 15, 2015 at 04:02:49PM +0800, Qu Wenruo wrote:
> In the following case of corruption, RAID1 or DUP will fail to recover
> it(Use 16K as leafsize)
> 0		4K		8K		12K		16K
> Mirror 0:
> |<-OK---------->|<----ERROR---->|<-----------------OK------------->|
> 
> Mirror 1:
> |<----------------------------OK--------------->|<------Error----->|
> 
> Since the CRC32 stored in header is calculated for the whole leaf,
> so both will fail the CRC32 check.
> 
> But the corruption are in different position, in fact, if we know where
> the corruption is (no need to be so accurate), we can recover the tree
> block by using the current part.
> 
> In above example, we can just use the correct 0~12K from mirror 1
> and then 12K~16K from mirror 0.

If the mirror 0 copy is intact, you can use it entirely. Your
improvement could help if each mirror is partially broken but we can
find good copies of all 4k blocks among all mirrors.

The natural question is how often this happens and if it's worth adding
the code complexity and what's the estimated speed drop.

I think the conditions are very rare and that we could add minimal code
to attempt to build the metadata block from the available copies without
the separate block checksums. This is an immediate idea so I could have
missed something:

* if a metadata-block checksum mismatches, do a direct comparison of the
  metadata-blocks in all available mirrors
  * if they match and checksums match, no help, it's damaged
  * if there's a good copy (ie the original checksum or data were
    corrupted), use it
  * otherwise attempt to rebuild the metadata block from what's available

* by direct comparisons of the 4k blocks, find the first where the
  metadataA and mirror1 blocks mismatch, offset N
* try to compute the checksum from metadataA[0..N-1] + mirror1 block N +
  rest of metadataA
  * if it's ok, use it
  * if not: the block N is corrupted in mirror1 (we've skipped it in
    metadataA)
    then repeat with metadataA[0..N] + mirror1[N+1..end]

That's a rough idea that I hope will cover most of the cases when it
happens. With some more exhaustive attempts to rebuild the metadata
block we can try to repair 2 damaged blocks.

As this is completely independent, we can test it separately, and also
add it as a rescue feature to the userspace tools.

> Yes, this corruption case may be minor enough, since even corruption in
> one mirror is rare enough.
> So I didn't introduce a new CRC32 checksum, but use the extra 32-4 bytes
> to store the partial CRC32 to keep the backward compatibility.

The above would work with any checksums, without the need to store the
per-block checksums which become impossible with strongher algorithms.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 16, 2015, 1:22 a.m. UTC | #5
David Sterba wrote on 2015/06/15 15:15 +0200:
> On Mon, Jun 15, 2015 at 04:02:49PM +0800, Qu Wenruo wrote:
>> In the following case of corruption, RAID1 or DUP will fail to recover
>> it(Use 16K as leafsize)
>> 0		4K		8K		12K		16K
>> Mirror 0:
>> |<-OK---------->|<----ERROR---->|<-----------------OK------------->|
>>
>> Mirror 1:
>> |<----------------------------OK--------------->|<------Error----->|
>>
>> Since the CRC32 stored in header is calculated for the whole leaf,
>> so both will fail the CRC32 check.
>>
>> But the corruption are in different position, in fact, if we know where
>> the corruption is (no need to be so accurate), we can recover the tree
>> block by using the current part.
>>
>> In above example, we can just use the correct 0~12K from mirror 1
>> and then 12K~16K from mirror 0.
>
> If the mirror 0 copy is intact, you can use it entirely. Your
> improvement could help if each mirror is partially broken but we can
> find good copies of all 4k blocks among all mirrors.
>
> The natural question is how often this happens and if it's worth adding
> the code complexity and what's the estimated speed drop.
>
> I think the conditions are very rare and that we could add minimal code
> to attempt to build the metadata block from the available copies without
> the separate block checksums. This is an immediate idea so I could have
> missed something:
>
> * if a metadata-block checksum mismatches, do a direct comparison of the
>    metadata-blocks in all available mirrors
>    * if they match and checksums match, no help, it's damaged
>    * if there's a good copy (ie the original checksum or data were
>      corrupted), use it
>    * otherwise attempt to rebuild the metadata block from what's available
>
> * by direct comparisons of the 4k blocks, find the first where the
>    metadataA and mirror1 blocks mismatch, offset N
> * try to compute the checksum from metadataA[0..N-1] + mirror1 block N +
>    rest of metadataA
>    * if it's ok, use it
>    * if not: the block N is corrupted in mirror1 (we've skipped it in
>      metadataA)
>      then repeat with metadataA[0..N] + mirror1[N+1..end]
The method sounds good, but the codes will be even more complex than
mine.
Also, due to the nature of CRC32 and RAID1/Dup case, things will be
quite complex like the following case using your method.

0	4K	8K	12K	16K
Mirror 0
|	|   X	|	|	|
Mirror 1
|	|	|   X	|	|

If using your method:
[0~4K]
CRC32 of 0~4K are the same. No problem.

[4~8K]
CRC32 of 0~8K differs.
Now we know there is something wrong with 4~8K, but we still don't know
which is the good copy.

We can continue, keep 2 different CRC32 seed for later checksum.
One seed using the 4~8 from mirror 0 and one from mirror1.

Note, the crc32 is for the whole tree block, so until we calculate all 
the tree block, we can know which one is correct.

[8~12K]
Now crc32 mismatches again.
We still don't know which part is correct.

We can still continue by recording different CRC32 seed for them.
But don't forget the previous 2 seeds we recorded.
So we records 4 crc32 seeds for 0~12K.(Mi0, Mi0) (Mi0, Mi1) (Mi1, Mi0) 
(Mi1, Mi1).

[12~16K]
Continue calculate the crc32 with above 4 seeds.
Finally, we found the crc32 matches with the tree block is using the 
combination of (Mi1, Mi0). And we can restore the tree block.

Yes, with this behavior, we can restore the tree block even 3 parts are 
corrupted, but in that case, we need to try 8 times.

So, I don't consider this is more easy to implement than my idea.

[ROOT CAUSE]
The root cause of the complex is:
1) Checksum algorithm depends on previous input(seed)
Almost all checksum algorithm depends on previous input.
And you won't know the data is correct or not until all data is calculated.

2) Only one extra duplication for RAID1/DUP
We don't have extra duplication to judge which block is correct as there 
are only two mirror.

So my partial checksum solves the 2 root cause at the same time.
With partial checksum, we don't depend on previous data to calculate 
checksum.
And we have extra reference if mirror differs with each other, we use 
checksum to judge which copy is correct.

>
> That's a rough idea that I hope will cover most of the cases when it
> happens. With some more exhaustive attempts to rebuild the metadata
> block we can try to repair 2 damaged blocks.
>
> As this is completely independent, we can test it separately, and also
> add it as a rescue feature to the userspace tools.
>
>> Yes, this corruption case may be minor enough, since even corruption in
>> one mirror is rare enough.
>> So I didn't introduce a new CRC32 checksum, but use the extra 32-4 bytes
>> to store the partial CRC32 to keep the backward compatibility.
>
> The above would work with any checksums, without the need to store the
> per-block checksums which become impossible with strongher algorithms.
>
[FURTHER CSUM DESIGN]
As you mentioned in later part, yes, stronger checksum like SHA256 won't
be able to do such partial checksum as it uses all the space.

But that's already a trade-off, right?

If btrfs was using partial csum from the beginning, pros and cons of 
strong checksum should be quite obvious:

Stronger checksum:
(Pros)
1. Less confliction
Which means more security.

(Cons)
1. More space usage for data.
4Bytes/4K vs 32Bytes/4K. Obvious.

2. No/less partial checksum for metadata.
Harder to detect/repair partial error.

3. (generally) More CPU time
Normally, stronger checksums are cryptographic checksum, which uses
more CPU time.

So with this patch, I also want to inspire other developers about the 
trade off between different checksums.
Especially the fact, we can make better use of the spare metadata csum
space if the csum length is less than 32bytes.

Thanks,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 16, 2015, 2:39 a.m. UTC | #6
Qu Wenruo wrote on 2015/06/16 09:22 +0800:
>
>
> David Sterba wrote on 2015/06/15 15:15 +0200:
>> On Mon, Jun 15, 2015 at 04:02:49PM +0800, Qu Wenruo wrote:
>>> In the following case of corruption, RAID1 or DUP will fail to recover
>>> it(Use 16K as leafsize)
>>> 0        4K        8K        12K        16K
>>> Mirror 0:
>>> |<-OK---------->|<----ERROR---->|<-----------------OK------------->|
>>>
>>> Mirror 1:
>>> |<----------------------------OK--------------->|<------Error----->|
>>>
>>> Since the CRC32 stored in header is calculated for the whole leaf,
>>> so both will fail the CRC32 check.
>>>
>>> But the corruption are in different position, in fact, if we know where
>>> the corruption is (no need to be so accurate), we can recover the tree
>>> block by using the current part.
>>>
>>> In above example, we can just use the correct 0~12K from mirror 1
>>> and then 12K~16K from mirror 0.
>>
>> If the mirror 0 copy is intact, you can use it entirely. Your
>> improvement could help if each mirror is partially broken but we can
>> find good copies of all 4k blocks among all mirrors.
>>
>> The natural question is how often this happens and if it's worth adding
>> the code complexity and what's the estimated speed drop.
>>
>> I think the conditions are very rare and that we could add minimal code
>> to attempt to build the metadata block from the available copies without
>> the separate block checksums. This is an immediate idea so I could have
>> missed something:
>>
>> * if a metadata-block checksum mismatches, do a direct comparison of the
>>    metadata-blocks in all available mirrors
>>    * if they match and checksums match, no help, it's damaged
>>    * if there's a good copy (ie the original checksum or data were
>>      corrupted), use it
>>    * otherwise attempt to rebuild the metadata block from what's
>> available
>>
>> * by direct comparisons of the 4k blocks, find the first where the
>>    metadataA and mirror1 blocks mismatch, offset N
>> * try to compute the checksum from metadataA[0..N-1] + mirror1 block N +
>>    rest of metadataA
>>    * if it's ok, use it
>>    * if not: the block N is corrupted in mirror1 (we've skipped it in
>>      metadataA)
>>      then repeat with metadataA[0..N] + mirror1[N+1..end]
> The method sounds good, but the codes will be even more complex than
> mine.
> Also, due to the nature of CRC32 and RAID1/Dup case, things will be
> quite complex like the following case using your method.
>
> 0    4K    8K    12K    16K
> Mirror 0
> |    |   X    |    |    |
> Mirror 1
> |    |    |   X    |    |
>
> If using your method:
> [0~4K]
> CRC32 of 0~4K are the same. No problem.
>
> [4~8K]
> CRC32 of 0~8K differs.
> Now we know there is something wrong with 4~8K, but we still don't know
> which is the good copy.
>
> We can continue, keep 2 different CRC32 seed for later checksum.
> One seed using the 4~8 from mirror 0 and one from mirror1.
>
> Note, the crc32 is for the whole tree block, so until we calculate all
> the tree block, we can know which one is correct.
Sorry here, "can" -> "can't"

Thanks,
Qu
>
> [8~12K]
> Now crc32 mismatches again.
> We still don't know which part is correct.
>
> We can still continue by recording different CRC32 seed for them.
> But don't forget the previous 2 seeds we recorded.
> So we records 4 crc32 seeds for 0~12K.(Mi0, Mi0) (Mi0, Mi1) (Mi1, Mi0)
> (Mi1, Mi1).
>
> [12~16K]
> Continue calculate the crc32 with above 4 seeds.
> Finally, we found the crc32 matches with the tree block is using the
> combination of (Mi1, Mi0). And we can restore the tree block.
>
> Yes, with this behavior, we can restore the tree block even 3 parts are
> corrupted, but in that case, we need to try 8 times.
>
> So, I don't consider this is more easy to implement than my idea.
>
> [ROOT CAUSE]
> The root cause of the complex is:
> 1) Checksum algorithm depends on previous input(seed)
> Almost all checksum algorithm depends on previous input.
> And you won't know the data is correct or not until all data is calculated.
>
> 2) Only one extra duplication for RAID1/DUP
> We don't have extra duplication to judge which block is correct as there
> are only two mirror.
>
> So my partial checksum solves the 2 root cause at the same time.
> With partial checksum, we don't depend on previous data to calculate
> checksum.
> And we have extra reference if mirror differs with each other, we use
> checksum to judge which copy is correct.
>
>>
>> That's a rough idea that I hope will cover most of the cases when it
>> happens. With some more exhaustive attempts to rebuild the metadata
>> block we can try to repair 2 damaged blocks.
>>
>> As this is completely independent, we can test it separately, and also
>> add it as a rescue feature to the userspace tools.
>>
>>> Yes, this corruption case may be minor enough, since even corruption in
>>> one mirror is rare enough.
>>> So I didn't introduce a new CRC32 checksum, but use the extra 32-4 bytes
>>> to store the partial CRC32 to keep the backward compatibility.
>>
>> The above would work with any checksums, without the need to store the
>> per-block checksums which become impossible with strongher algorithms.
>>
> [FURTHER CSUM DESIGN]
> As you mentioned in later part, yes, stronger checksum like SHA256 won't
> be able to do such partial checksum as it uses all the space.
>
> But that's already a trade-off, right?
>
> If btrfs was using partial csum from the beginning, pros and cons of
> strong checksum should be quite obvious:
>
> Stronger checksum:
> (Pros)
> 1. Less confliction
> Which means more security.
>
> (Cons)
> 1. More space usage for data.
> 4Bytes/4K vs 32Bytes/4K. Obvious.
>
> 2. No/less partial checksum for metadata.
> Harder to detect/repair partial error.
>
> 3. (generally) More CPU time
> Normally, stronger checksums are cryptographic checksum, which uses
> more CPU time.
>
> So with this patch, I also want to inspire other developers about the
> trade off between different checksums.
> Especially the fact, we can make better use of the spare metadata csum
> space if the csum length is less than 32bytes.
>
> Thanks,
> Qu
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 18, 2015, 1:34 a.m. UTC | #7
Ping?

New new comments?

Thanks,
Qu

Qu Wenruo wrote on 2015/06/16 10:39 +0800:
>
>
> Qu Wenruo wrote on 2015/06/16 09:22 +0800:
>>
>>
>> David Sterba wrote on 2015/06/15 15:15 +0200:
>>> On Mon, Jun 15, 2015 at 04:02:49PM +0800, Qu Wenruo wrote:
>>>> In the following case of corruption, RAID1 or DUP will fail to recover
>>>> it(Use 16K as leafsize)
>>>> 0        4K        8K        12K        16K
>>>> Mirror 0:
>>>> |<-OK---------->|<----ERROR---->|<-----------------OK------------->|
>>>>
>>>> Mirror 1:
>>>> |<----------------------------OK--------------->|<------Error----->|
>>>>
>>>> Since the CRC32 stored in header is calculated for the whole leaf,
>>>> so both will fail the CRC32 check.
>>>>
>>>> But the corruption are in different position, in fact, if we know where
>>>> the corruption is (no need to be so accurate), we can recover the tree
>>>> block by using the current part.
>>>>
>>>> In above example, we can just use the correct 0~12K from mirror 1
>>>> and then 12K~16K from mirror 0.
>>>
>>> If the mirror 0 copy is intact, you can use it entirely. Your
>>> improvement could help if each mirror is partially broken but we can
>>> find good copies of all 4k blocks among all mirrors.
>>>
>>> The natural question is how often this happens and if it's worth adding
>>> the code complexity and what's the estimated speed drop.
>>>
>>> I think the conditions are very rare and that we could add minimal code
>>> to attempt to build the metadata block from the available copies without
>>> the separate block checksums. This is an immediate idea so I could have
>>> missed something:
>>>
>>> * if a metadata-block checksum mismatches, do a direct comparison of the
>>>    metadata-blocks in all available mirrors
>>>    * if they match and checksums match, no help, it's damaged
>>>    * if there's a good copy (ie the original checksum or data were
>>>      corrupted), use it
>>>    * otherwise attempt to rebuild the metadata block from what's
>>> available
>>>
>>> * by direct comparisons of the 4k blocks, find the first where the
>>>    metadataA and mirror1 blocks mismatch, offset N
>>> * try to compute the checksum from metadataA[0..N-1] + mirror1 block N +
>>>    rest of metadataA
>>>    * if it's ok, use it
>>>    * if not: the block N is corrupted in mirror1 (we've skipped it in
>>>      metadataA)
>>>      then repeat with metadataA[0..N] + mirror1[N+1..end]
>> The method sounds good, but the codes will be even more complex than
>> mine.
>> Also, due to the nature of CRC32 and RAID1/Dup case, things will be
>> quite complex like the following case using your method.
>>
>> 0    4K    8K    12K    16K
>> Mirror 0
>> |    |   X    |    |    |
>> Mirror 1
>> |    |    |   X    |    |
>>
>> If using your method:
>> [0~4K]
>> CRC32 of 0~4K are the same. No problem.
>>
>> [4~8K]
>> CRC32 of 0~8K differs.
>> Now we know there is something wrong with 4~8K, but we still don't know
>> which is the good copy.
>>
>> We can continue, keep 2 different CRC32 seed for later checksum.
>> One seed using the 4~8 from mirror 0 and one from mirror1.
>>
>> Note, the crc32 is for the whole tree block, so until we calculate all
>> the tree block, we can know which one is correct.
> Sorry here, "can" -> "can't"
>
> Thanks,
> Qu
>>
>> [8~12K]
>> Now crc32 mismatches again.
>> We still don't know which part is correct.
>>
>> We can still continue by recording different CRC32 seed for them.
>> But don't forget the previous 2 seeds we recorded.
>> So we records 4 crc32 seeds for 0~12K.(Mi0, Mi0) (Mi0, Mi1) (Mi1, Mi0)
>> (Mi1, Mi1).
>>
>> [12~16K]
>> Continue calculate the crc32 with above 4 seeds.
>> Finally, we found the crc32 matches with the tree block is using the
>> combination of (Mi1, Mi0). And we can restore the tree block.
>>
>> Yes, with this behavior, we can restore the tree block even 3 parts are
>> corrupted, but in that case, we need to try 8 times.
>>
>> So, I don't consider this is more easy to implement than my idea.
>>
>> [ROOT CAUSE]
>> The root cause of the complex is:
>> 1) Checksum algorithm depends on previous input(seed)
>> Almost all checksum algorithm depends on previous input.
>> And you won't know the data is correct or not until all data is
>> calculated.
>>
>> 2) Only one extra duplication for RAID1/DUP
>> We don't have extra duplication to judge which block is correct as there
>> are only two mirror.
>>
>> So my partial checksum solves the 2 root cause at the same time.
>> With partial checksum, we don't depend on previous data to calculate
>> checksum.
>> And we have extra reference if mirror differs with each other, we use
>> checksum to judge which copy is correct.
>>
>>>
>>> That's a rough idea that I hope will cover most of the cases when it
>>> happens. With some more exhaustive attempts to rebuild the metadata
>>> block we can try to repair 2 damaged blocks.
>>>
>>> As this is completely independent, we can test it separately, and also
>>> add it as a rescue feature to the userspace tools.
>>>
>>>> Yes, this corruption case may be minor enough, since even corruption in
>>>> one mirror is rare enough.
>>>> So I didn't introduce a new CRC32 checksum, but use the extra 32-4
>>>> bytes
>>>> to store the partial CRC32 to keep the backward compatibility.
>>>
>>> The above would work with any checksums, without the need to store the
>>> per-block checksums which become impossible with strongher algorithms.
>>>
>> [FURTHER CSUM DESIGN]
>> As you mentioned in later part, yes, stronger checksum like SHA256 won't
>> be able to do such partial checksum as it uses all the space.
>>
>> But that's already a trade-off, right?
>>
>> If btrfs was using partial csum from the beginning, pros and cons of
>> strong checksum should be quite obvious:
>>
>> Stronger checksum:
>> (Pros)
>> 1. Less confliction
>> Which means more security.
>>
>> (Cons)
>> 1. More space usage for data.
>> 4Bytes/4K vs 32Bytes/4K. Obvious.
>>
>> 2. No/less partial checksum for metadata.
>> Harder to detect/repair partial error.
>>
>> 3. (generally) More CPU time
>> Normally, stronger checksums are cryptographic checksum, which uses
>> more CPU time.
>>
>> So with this patch, I also want to inspire other developers about the
>> trade off between different checksums.
>> Especially the fact, we can make better use of the spare metadata csum
>> space if the csum length is less than 32bytes.
>>
>> Thanks,
>> Qu
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Mason June 18, 2015, 3:57 p.m. UTC | #8
On Wed, Jun 17, 2015 at 9:34 PM, Qu Wenruo <quwenruo@cn.fujitsu.com> 
wrote:
> Ping?
> 
> New new comments?

As our block sizes get bigger, it makes sense to think about more fine 
grained checksums.  We're using crcs for:

1) memory corruption on the way down to the storage.  We could be very 
small (bitflips) or smaller chunks (dma corrupting the whole bio).  The 
places I've seen this in production, the partial crcs might help save a 
percentage of the blocks, but overall the corruptions were just too 
pervasive to get back the data.

2) incomplete writes.  We're sending down up to 64K btree blocks, the 
storage might only write some of them.

3) IO errors from the drive.  These are likely to fail in much bigger 
chunks and the partial csums probably won't help at all.

I think the best way to repair all of these is with replication, either 
RAID5/6 or some number of mirrored copies.  It's more reliable than 
trying to stitch together streams from multiple copies, and the code 
complexity is much lower.

But, where I do find the partial crcs interesting is the ability to 
more accurately detect those three failure modes with our larger block 
sizes.  That's pure statistics based on the crc we've chosen and the 
size of the block.  The right answer might just be a different crc, but 
I'm more than open to data here.

-chris


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 18, 2015, 5:06 p.m. UTC | #9
On Thu, Jun 18, 2015 at 11:57:46AM -0400, Facebook wrote:
> > New new comments?
> 
> As our block sizes get bigger, it makes sense to think about more fine 
> grained checksums.  We're using crcs for:
> 
> 1) memory corruption on the way down to the storage.  We could be very 
> small (bitflips) or smaller chunks (dma corrupting the whole bio).  The 
> places I've seen this in production, the partial crcs might help save a 
> percentage of the blocks, but overall the corruptions were just too 
> pervasive to get back the data.
> 
> 2) incomplete writes.  We're sending down up to 64K btree blocks, the 
> storage might only write some of them.
> 
> 3) IO errors from the drive.  These are likely to fail in much bigger 
> chunks and the partial csums probably won't help at all.
> 
> I think the best way to repair all of these is with replication, either 
> RAID5/6 or some number of mirrored copies.  It's more reliable than 
> trying to stitch together streams from multiple copies, and the code 
> complexity is much lower.

I agree with that. I'm still not convinced that adding all the kernel
code to repair the data is justified, compared to the block-level
redundancy alternatives.

> But, where I do find the partial crcs interesting is the ability to 
> more accurately detect those three failure modes with our larger block 
> sizes.  That's pure statistics based on the crc we've chosen and the 
> size of the block.  The right answer might just be a different crc, but 
> I'm more than open to data here.

Good point, the detection aspect costs only the increased checksumming
and reporting. My assumption is that this will happen infrequently and
can point out serious hardware problems. In that case taking the
filesytem offline is a workable option and improving the userspace tools
to actually attempt the targeted block repair seems easier. Note that
this would come after redundant raid would not be able to fix it.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo June 19, 2015, 1:26 a.m. UTC | #10
David Sterba wrote on 2015/06/18 19:06 +0200:
> On Thu, Jun 18, 2015 at 11:57:46AM -0400, Facebook wrote:
>>> New new comments?
>>
>> As our block sizes get bigger, it makes sense to think about more fine
>> grained checksums.  We're using crcs for:
>>
>> 1) memory corruption on the way down to the storage.  We could be very
>> small (bitflips) or smaller chunks (dma corrupting the whole bio).  The
>> places I've seen this in production, the partial crcs might help save a
>> percentage of the blocks, but overall the corruptions were just too
>> pervasive to get back the data.
>>
>> 2) incomplete writes.  We're sending down up to 64K btree blocks, the
>> storage might only write some of them.
>>
>> 3) IO errors from the drive.  These are likely to fail in much bigger
>> chunks and the partial csums probably won't help at all.
>>
>> I think the best way to repair all of these is with replication, either
>> RAID5/6 or some number of mirrored copies.  It's more reliable than
>> trying to stitch together streams from multiple copies, and the code
>> complexity is much lower.
>
> I agree with that. I'm still not convinced that adding all the kernel
> code to repair the data is justified, compared to the block-level
> redundancy alternatives.

Totally agree with this.
That's why we have support for RAID1/5/6/10.

I also hate to add complexity to kernel codes, especially when the scrub 
codes are already quite complex.

But in fact, my teammate Zhao Lei is already doing some work to make 
scrub codes clean and neat.
During his work, one of the thing needs to clean is the function to use 
the bios without IO error to rebuild a tree block from different mirrors.

I found it quite similar with the concept of partial csum, and may 
extract some quite generic codes for both of them, and hope to reduce 
the code amount overall.

But anyway, the main part, scrub support for partial csum, is still just
a basic idea(although some coding is already done), so I hopes to see 
more ideas even it's against partial csum.

>
>> But, where I do find the partial crcs interesting is the ability to
>> more accurately detect those three failure modes with our larger block
>> sizes.  That's pure statistics based on the crc we've chosen and the
>> size of the block.  The right answer might just be a different crc, but
>> I'm more than open to data here.
>
> Good point, the detection aspect costs only the increased checksumming
> and reporting. My assumption is that this will happen infrequently and
> can point out serious hardware problems. In that case taking the
> filesytem offline is a workable option and improving the userspace tools
> to actually attempt the targeted block repair seems easier. Note that
> this would come after redundant raid would not be able to fix it.
>

My original cause for partial csum is to improve btrfsck btree repair codes.
Current btree repair codes will drop all child nodes/leaves, which is 
quite a big loss, and deadly if the error happens at tree root.
If using partial csum, we can reduce the damage to as little as 1/8 of the 
node/leaf.

So I'm completely OK to implement it in btrfsck, as it's much much 
easier to code and debug in user-space.


But the level of repair is not as high as btrfsck, which overall do 
repair in a higher level, like inode/file/extent repair.
With the nature of partial csum, it leans towards block level more.
So the idea comes to me to do it in kernel scrub codes.

For my personal opinion, if Zhao Lei and I can make the scrub codes much 
clearer and neater, I still consider the kernel scrub implement worthy a 
try.

Thanks,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 25, 2015, 3:31 p.m. UTC | #11
On Fri, Jun 19, 2015 at 09:26:11AM +0800, Qu Wenruo wrote:
> > I agree with that. I'm still not convinced that adding all the kernel
> > code to repair the data is justified, compared to the block-level
> > redundancy alternatives.
> 
> Totally agree with this.
> That's why we have support for RAID1/5/6/10.
> 
> I also hate to add complexity to kernel codes, especially when the scrub 
> codes are already quite complex.
> 
> But in fact, my teammate Zhao Lei is already doing some work to make 
> scrub codes clean and neat.

Doing cleanups is a good thing regardless of new features, please don't
hesitate to post them even if we do not agree to implement the partial
csum/repair.

I'm not against adding the partial csums & repair, but at the moment I'm
not convinced.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f83778..54052d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -271,47 +271,76 @@  void btrfs_csum_final(u32 crc, char *result)
 }
 
 /*
- * compute the csum for a btree block, and either verify it or write it
- * into the csum field of the block.
+ * Calculate partial crc32 for each part.
+ *
+ * Part should be in [0, 7].
+ * Part 0 is the old crc32 of the whole leaf/node.
+ * Part 1 is the crc32 of the range [BTRFS_CSUM_SIZE, 2/8) of the leaf/node.
+ * Part 2 is the crc32 of the range [2/8, 3/8) of the leaf/node.
+ * Part 3 is the crc32 of [3/8, 4/8) of the leaf/node, and so on.
  */
-static int csum_tree_block(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *buf,
-			   int verify)
+static int csum_tree_block_part(struct extent_buffer *buf,
+				char *result, int part)
 {
-	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	char *result = NULL;
+	int offset;
+	int err;
 	unsigned long len;
 	unsigned long cur_len;
-	unsigned long offset = BTRFS_CSUM_SIZE;
-	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
-	int err;
+	char *kaddr;
 	u32 crc = ~(u32)0;
-	unsigned long inline_result;
 
-	len = buf->len - offset;
+	BUG_ON(part >= 8 || part < 0);
+	BUG_ON(ALIGN(buf->len, 8) != buf->len);
+
+	if (part == 0) {
+		offset = BTRFS_CSUM_SIZE;
+		len = buf->len - offset;
+	} else if (part == 1) {
+		offset = BTRFS_CSUM_SIZE;
+		len = buf->len * 2 / 8 - offset;
+	} else {
+		offset = part * buf->len / 8;
+		len = buf->len / 8;
+	}
+
 	while (len > 0) {
 		err = map_private_extent_buffer(buf, offset, 32,
 					&kaddr, &map_start, &map_len);
 		if (err)
-			return 1;
+			return err;
 		cur_len = min(len, map_len - (offset - map_start));
 		crc = btrfs_csum_data(kaddr + offset - map_start,
 				      crc, cur_len);
 		len -= cur_len;
 		offset += cur_len;
 	}
-	if (csum_size > sizeof(inline_result)) {
-		result = kzalloc(csum_size, GFP_NOFS);
-		if (!result)
+	btrfs_csum_final(crc, result + BTRFS_CSUM_SIZE * part / 8);
+	return 0;
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *buf,
+			   int verify)
+{
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	char result[BTRFS_CSUM_SIZE] = {0};
+	int err;
+	u32 crc = ~(u32)0;
+	int index = 0;
+
+	/* get every part csum */
+	for (index = 0; index < 8; index++) {
+		err = csum_tree_block_part(buf, result, index);
+		if (err)
 			return 1;
-	} else {
-		result = (char *)&inline_result;
 	}
 
-	btrfs_csum_final(crc, result);
-
 	if (verify) {
 		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
@@ -324,15 +353,11 @@  static int csum_tree_block(struct btrfs_fs_info *fs_info,
 				"level %d\n",
 				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
-			if (result != (char *)&inline_result)
-				kfree(result);
 			return 1;
 		}
 	} else {
-		write_extent_buffer(buf, result, 0, csum_size);
+		write_extent_buffer(buf, result, 0, BTRFS_CSUM_SIZE);
 	}
-	if (result != (char *)&inline_result)
-		kfree(result);
 	return 0;
 }