
[v8,10/27] btrfs: dedupe: Add basic tree structure for on-disk dedupe method

Message ID 1458610552-9845-11-git-send-email-quwenruo@cn.fujitsu.com (mailing list archive)
State New, archived

Commit Message

Qu Wenruo March 22, 2016, 1:35 a.m. UTC
Introduce a new tree, the dedupe tree, to record on-disk dedupe hashes,
as persistent hash storage instead of the in-memory-only implementation.

Unlike Liu Bo's implementation, in this version we don't hack the
bytenr -> hash search, but add a new type, DEDUPE_BYTENR_ITEM, for that
search case, just like the in-memory backend.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/ctree.h             | 63 +++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/dedupe.h            |  5 ++++
 fs/btrfs/disk-io.c           |  1 +
 include/trace/events/btrfs.h |  3 ++-
 4 files changed, 70 insertions(+), 2 deletions(-)

Comments

Chris Mason March 24, 2016, 8:58 p.m. UTC | #1
On Tue, Mar 22, 2016 at 09:35:35AM +0800, Qu Wenruo wrote:
> Introduce a new tree, the dedupe tree, to record on-disk dedupe hashes,
> as persistent hash storage instead of the in-memory-only implementation.
> 
> Unlike Liu Bo's implementation, in this version we don't hack the
> bytenr -> hash search, but add a new type, DEDUPE_BYTENR_ITEM, for that
> search case, just like the in-memory backend.

Thanks for refreshing this again, I'm starting to go through the disk
format in more detail.

> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h             | 63 +++++++++++++++++++++++++++++++++++++++++++-
>  fs/btrfs/dedupe.h            |  5 ++++
>  fs/btrfs/disk-io.c           |  1 +
>  include/trace/events/btrfs.h |  3 ++-
>  4 files changed, 70 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 022ab61..bed9273 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
>  /* tracks free space in block groups. */
>  #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
>  
> +/* on-disk dedupe tree (EXPERIMENTAL) */
> +#define BTRFS_DEDUPE_TREE_OBJECTID 11ULL
> +
>  /* device stats in the device tree */
>  #define BTRFS_DEV_STATS_OBJECTID 0ULL
>  
> @@ -508,6 +511,7 @@ struct btrfs_super_block {
>   * ones specified below then we will fail to mount
>   */
>  #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE	(1ULL << 0)
> +#define BTRFS_FEATURE_COMPAT_RO_DEDUPE		(1ULL << 1)
>  
>  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
>  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
> @@ -537,7 +541,8 @@ struct btrfs_super_block {
>  #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL
>  
>  #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
> -	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
> +	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
> +	 BTRFS_FEATURE_COMPAT_RO_DEDUPE)
>  
>  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
>  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
> @@ -959,6 +964,42 @@ struct btrfs_csum_item {
>  	u8 csum;
>  } __attribute__ ((__packed__));
>  
> +/*
> + * Objectid: 0
> + * Type: BTRFS_DEDUPE_STATUS_ITEM_KEY
> + * Offset: 0
> + */
> +struct btrfs_dedupe_status_item {
> +	__le64 blocksize;
> +	__le64 limit_nr;
> +	__le16 hash_type;
> +	__le16 backend;
> +} __attribute__ ((__packed__));
> +
> +/*
> + * Objectid: Last 64 bit of the hash
> + * Type: BTRFS_DEDUPE_HASH_ITEM_KEY
> + * Offset: Bytenr of the hash
> + *
> + * Used for hash <-> bytenr search
> + */
> +struct btrfs_dedupe_hash_item {
> +	/* length of dedupe range */
> +	__le32 len;
> +
> +	/* Hash follows */
> +} __attribute__ ((__packed__));

Are you storing the entire hash, or just the parts not represented in
the key?  I'd like to keep the on-disk part as compact as possible for
this part.

> +
> +/*
> + * Objectid: bytenr
> + * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
> + * offset: Last 64 bit of the hash
> + *
> + * Used for bytenr <-> hash search (for free_extent)
> + * all its content is hash.
> + * So no special item struct is needed.
> + */
> +

Can we do this instead with a backref from the extent?  It'll save us a
huge amount of IO as we delete things.

-chris
Qu Wenruo March 25, 2016, 1:59 a.m. UTC | #2
Chris Mason wrote on 2016/03/24 16:58 -0400:
> On Tue, Mar 22, 2016 at 09:35:35AM +0800, Qu Wenruo wrote:
>> Introduce a new tree, the dedupe tree, to record on-disk dedupe hashes,
>> as persistent hash storage instead of the in-memory-only implementation.
>>
>> Unlike Liu Bo's implementation, in this version we don't hack the
>> bytenr -> hash search, but add a new type, DEDUPE_BYTENR_ITEM, for that
>> search case, just like the in-memory backend.
>
> Thanks for refreshing this again, I'm starting to go through the disk
> format in more detail.
>
>>
>> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
>> Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>   fs/btrfs/ctree.h             | 63 +++++++++++++++++++++++++++++++++++++++++++-
>>   fs/btrfs/dedupe.h            |  5 ++++
>>   fs/btrfs/disk-io.c           |  1 +
>>   include/trace/events/btrfs.h |  3 ++-
>>   4 files changed, 70 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 022ab61..bed9273 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
>>   /* tracks free space in block groups. */
>>   #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
>>
>> +/* on-disk dedupe tree (EXPERIMENTAL) */
>> +#define BTRFS_DEDUPE_TREE_OBJECTID 11ULL
>> +
>>   /* device stats in the device tree */
>>   #define BTRFS_DEV_STATS_OBJECTID 0ULL
>>
>> @@ -508,6 +511,7 @@ struct btrfs_super_block {
>>    * ones specified below then we will fail to mount
>>    */
>>   #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE	(1ULL << 0)
>> +#define BTRFS_FEATURE_COMPAT_RO_DEDUPE		(1ULL << 1)
>>
>>   #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
>>   #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
>> @@ -537,7 +541,8 @@ struct btrfs_super_block {
>>   #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL
>>
>>   #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
>> -	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
>> +	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
>> +	 BTRFS_FEATURE_COMPAT_RO_DEDUPE)
>>
>>   #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
>>   #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
>> @@ -959,6 +964,42 @@ struct btrfs_csum_item {
>>   	u8 csum;
>>   } __attribute__ ((__packed__));
>>
>> +/*
>> + * Objectid: 0
>> + * Type: BTRFS_DEDUPE_STATUS_ITEM_KEY
>> + * Offset: 0
>> + */
>> +struct btrfs_dedupe_status_item {
>> +	__le64 blocksize;
>> +	__le64 limit_nr;
>> +	__le16 hash_type;
>> +	__le16 backend;
>> +} __attribute__ ((__packed__));
>> +
>> +/*
>> + * Objectid: Last 64 bit of the hash
>> + * Type: BTRFS_DEDUPE_HASH_ITEM_KEY
>> + * Offset: Bytenr of the hash
>> + *
>> + * Used for hash <-> bytenr search
>> + */
>> +struct btrfs_dedupe_hash_item {
>> +	/* length of dedupe range */
>> +	__le32 len;
>> +
>> +	/* Hash follows */
>> +} __attribute__ ((__packed__));
>
> Are you storing the entire hash, or just the parts not represented in
> the key?  I'd like to keep the on-disk part as compact as possible for
> this part.

Currently, it's the entire hash.

More details can be found in another mail:
http://article.gmane.org/gmane.comp.file-systems.btrfs/54432

Although it's OK with me to truncate the duplicated last 8 bytes (64 bits),
I still quite like the current implementation, as a single memcpy() is simpler.
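To make the duplication concrete, here is a minimal sketch (not code from
the patchset; the example_* names are made up) of how the tail of the hash
already lives in the btree key, which is why the item body would only need
the first 24 bytes of a 32-byte SHA-256 hash:

#include <stdint.h>
#include <string.h>

#define EXAMPLE_SHA256_SIZE		32
#define EXAMPLE_DEDUPE_HASH_ITEM_KEY	231	/* value from the patch */

struct example_disk_key {
	uint64_t objectid;	/* last 64 bits of the hash */
	uint8_t type;
	uint64_t offset;	/* bytenr the hash refers to */
};

static void example_build_hash_key(struct example_disk_key *key,
				   const uint8_t *hash, uint64_t bytenr)
{
	/* the tail of the hash becomes the key objectid ... */
	memcpy(&key->objectid, hash + EXAMPLE_SHA256_SIZE - 8, 8);
	key->type = EXAMPLE_DEDUPE_HASH_ITEM_KEY;
	key->offset = bytenr;
	/*
	 * ... so an item body that stores all 32 hash bytes repeats
	 * these 8 bytes; storing only the first 24 would be enough.
	 */
}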

>
>> +
>> +/*
>> + * Objectid: bytenr
>> + * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
>> + * offset: Last 64 bit of the hash
>> + *
>> + * Used for bytenr <-> hash search (for free_extent)
>> + * all its content is hash.
>> + * So no special item struct is needed.
>> + */
>> +
>
> Can we do this instead with a backref from the extent?  It'll save us a
> huge amount of IO as we delete things.

That's the original implementation from Liu Bo.

The problem is, it changes the data backref rules (originally, only an
EXTENT_DATA item can cause a data backref), and will make dedupe INCOMPAT
instead of the current RO_COMPAT.
So I really don't want to change the data backref rule.


If we only want to reduce on-disk space, just dropping the hash and making 
DEDUPE_BYTENR_ITEM carry no data would be good enough.

As (bytenr, DEDUPE_BYTENR_ITEM) can locate the hash uniquely.

In fact, no code really checks the hash in the dedupe bytenr item; callers 
just swap objectid and offset, reset the type, and search for the 
DEDUPE_HASH_ITEM.

So it's OK to omit the hash.
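A hedged sketch of that reverse lookup, reusing the made-up example_*
names from the sketch above (again, not the patchset's real API):

static void example_bytenr_to_hash_key(const struct example_disk_key *by,
				       struct example_disk_key *hash)
{
	/* (bytenr, BYTENR_ITEM, hash_tail) -> (hash_tail, HASH_ITEM, bytenr) */
	hash->objectid = by->offset;
	hash->type = EXAMPLE_DEDUPE_HASH_ITEM_KEY;
	hash->offset = by->objectid;
	/* a btree search with this key then returns the full hash item */
}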

Thanks,
Qu

>
> -chris
>
>


Chris Mason March 25, 2016, 3:11 p.m. UTC | #3
On Fri, Mar 25, 2016 at 09:59:39AM +0800, Qu Wenruo wrote:
> 
> 
> Chris Mason wrote on 2016/03/24 16:58 -0400:
> >Are you storing the entire hash, or just the parts not represented in
> >the key?  I'd like to keep the on-disk part as compact as possible for
> >this part.
> 
> Currently, it's the entire hash.
> 
> More details can be found in another mail.
> 
> Although it's OK with me to truncate the duplicated last 8 bytes (64 bits),
> I still quite like the current implementation, as a single memcpy() is simpler.

[ sorry FB makes urls look ugly, so I delete them from replies ;) ]

Right, I saw that but wanted to reply to the specific patch.  One of the
lessons learned from the extent allocation tree and file extent items is
that they are just too big.  Let's save those bytes, it'll add up.

> 
> >
> >>+
> >>+/*
> >>+ * Objectid: bytenr
> >>+ * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
> >>+ * offset: Last 64 bit of the hash
> >>+ *
> >>+ * Used for bytenr <-> hash search (for free_extent)
> >>+ * all its content is hash.
> >>+ * So no special item struct is needed.
> >>+ */
> >>+
> >
> >Can we do this instead with a backref from the extent?  It'll save us a
> >huge amount of IO as we delete things.
> 
> That's the original implementation from Liu Bo.
> 
> The problem is, it changes the data backref rules (originally, only an
> EXTENT_DATA item can cause a data backref), and will make dedupe INCOMPAT
> instead of the current RO_COMPAT.
> So I really don't want to change the data backref rule.

Let me reread this part, the cost of maintaining the second index is
dramatically higher than adding a backref.  I do agree that it's nice
to be able to delete the dedup trees without impacting the rest, but
over the long term I think we'll regret the added balances.

> 
> If we only want to reduce on-disk space, just dropping the hash and making
> DEDUPE_BYTENR_ITEM carry no data would be good enough.
> 
> As (bytenr, DEDUPE_BYTENR_ITEM) can locate the hash uniquely.

For the second index, the big problem is the cost of the btree
operations.  We're already pretty expensive in terms of the cost of
deleting an extent; with dedup it's 2x higher, and with dedup + extra
index, it's 3x higher.

> 
> In fact, no code really checks the hash in the dedupe bytenr item; callers
> just swap objectid and offset, reset the type, and search for the
> DEDUPE_HASH_ITEM.
> 
> So it's OK to omit the hash.

If we have to go with the second index, I do agree here.

-chris
Qu Wenruo March 26, 2016, 1:11 p.m. UTC | #4
On 03/25/2016 11:11 PM, Chris Mason wrote:
> On Fri, Mar 25, 2016 at 09:59:39AM +0800, Qu Wenruo wrote:
>>
>>
>> Chris Mason wrote on 2016/03/24 16:58 -0400:
>>> Are you storing the entire hash, or just the parts not represented in
>>> the key?  I'd like to keep the on-disk part as compact as possible for
>>> this part.
>>
>> Currently, it's the entire hash.
>>
>> More details can be found in another mail.
>>
>> Although it's OK with me to truncate the duplicated last 8 bytes (64 bits),
>> I still quite like the current implementation, as a single memcpy() is simpler.
>
> [ sorry FB makes urls look ugly, so I delete them from replies ;) ]
>
> Right, I saw that but wanted to reply to the specific patch.  One of the
> lessons learned from the extent allocation tree and file extent items is
> that they are just too big.  Let's save those bytes, it'll add up.

OK, I'll reduce the duplicated last 8 bytes.

And I'll also remove the "length" member, as it can always be fetched 
from dedupe_info->block_size.

The length itself was used to verify whether we were in the transaction 
that switched to a new dedupe size, but since we now use a full sync_fs(), 
such behavior is not needed any more.
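In code terms, a hedged before/after sketch (btrfs_dedupe_hash_len is the
accessor the patch below generates; leaf, hash_item and dedupe_info stand
for the usual lookup variables, and the struct member is named blocksize):

	/* before: each hash item carried its own length */
	len = btrfs_dedupe_hash_len(leaf, hash_item);

	/* after: every hash covers exactly one dedupe block */
	len = dedupe_info->blocksize;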


>
>>
>>>
>>>> +
>>>> +/*
>>>> + * Objectid: bytenr
>>>> + * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
>>>> + * offset: Last 64 bit of the hash
>>>> + *
>>>> + * Used for bytenr <-> hash search (for free_extent)
>>>> + * all its content is hash.
>>>> + * So no special item struct is needed.
>>>> + */
>>>> +
>>>
>>> Can we do this instead with a backref from the extent?  It'll save us a
>>> huge amount of IO as we delete things.
>>
>> That's the original implementation from Liu Bo.
>>
>> The problem is, it changes the data backref rules (originally, only an
>> EXTENT_DATA item can cause a data backref), and will make dedupe INCOMPAT
>> instead of the current RO_COMPAT.
>> So I really don't want to change the data backref rule.
>
> Let me reread this part, the cost of maintaining the second index is
> dramatically higher than adding a backref.  I do agree that it's nice
> to be able to delete the dedup trees without impacting the rest, but
> over the long term I think we'll regret the added balances.

Thanks for pointing out the problem. Yes, I didn't even consider this fact.

But on the other hand, such removal only happens when we remove the 
*last* reference to the extent.
So for the medium-to-high dedupe rate case, this routine is not that 
frequent, which will reduce the impact.
(Which is quite different from the non-dedupe case.)

And for the low dedupe rate case, why use dedupe anyway? In that case, 
compression would be much more appropriate if the user just wants to 
reduce disk usage, IMO.


Another reason I don't want to touch the delayed-ref code is that it has 
already caused us quite a lot of pain.
We have been fighting with delayed refs from the beginning.
Delayed refs, especially the ability to run them asynchronously, are the 
biggest problem we have met.

And that's why we added the ability to increase a data ref while holding 
delayed_refs->lock in patch 5, and then use a long lock-and-try-inc 
method to search for a hash in patch 6.

Any modification to delayed refs can easily lead to new bugs (yes, I have 
proven it several times myself).
So I chose the current method.

>
>>
>> If we only want to reduce on-disk space, just dropping the hash and making
>> DEDUPE_BYTENR_ITEM carry no data would be good enough.
>>
>> As (bytenr, DEDUPE_BYTENR_ITEM) can locate the hash uniquely.
>
> For the second index, the big problem is the cost of the btree
> operations.  We're already pretty expensive in terms of the cost of
> deleting an extent; with dedup it's 2x higher, and with dedup + extra
> index, it's 3x higher.

The good news is, we only delete the hash and bytenr items and their refs 
at the last de-reference.
And in the normal (medium-to-high dedupe rate) case, that's not a 
frequent operation, IMHO.

Thanks,
Qu

>
>>
>> In fact, no code really checks the hash in the dedupe bytenr item; callers
>> just swap objectid and offset, reset the type, and search for the
>> DEDUPE_HASH_ITEM.
>>
>> So it's OK to omit the hash.
>
> If we have to go with the second index, I do agree here.
>
> -chris
>
Chris Mason March 28, 2016, 2:09 p.m. UTC | #5
On Sat, Mar 26, 2016 at 09:11:53PM +0800, Qu Wenruo wrote:
> 
> 
> On 03/25/2016 11:11 PM, Chris Mason wrote:
> >On Fri, Mar 25, 2016 at 09:59:39AM +0800, Qu Wenruo wrote:
> >>
> >>
> >>Chris Mason wrote on 2016/03/24 16:58 -0400:
> >>>Are you storing the entire hash, or just the parts not represented in
> >>>the key?  I'd like to keep the on-disk part as compact as possible for
> >>>this part.
> >>
> >>Currently, it's the entire hash.
> >>
> >>More details can be found in another mail.
> >>
> >>Although it's OK with me to truncate the duplicated last 8 bytes (64 bits),
> >>I still quite like the current implementation, as a single memcpy() is simpler.
> >
> >[ sorry FB makes urls look ugly, so I delete them from replies ;) ]
> >
> >Right, I saw that but wanted to reply to the specific patch.  One of the
> >lessons learned from the extent allocation tree and file extent items is
> >that they are just too big.  Let's save those bytes, it'll add up.
> 
> OK, I'll reduce the duplicated last 8 bytes.
> 
> And I'll also remove the "length" member, as it can always be fetched
> from dedupe_info->block_size.

This would mean dedupe_info->block_size is a write-once field.  I'm OK
with that (just like the metadata blocksize) but we should make sure the
ioctls etc. don't allow changing it.

> 
> The length itself was used to verify whether we were in the transaction
> that switched to a new dedupe size, but since we now use a full sync_fs(),
> such behavior is not needed any more.
> 
> 
> >
> >>
> >>>
> >>>>+
> >>>>+/*
> >>>>+ * Objectid: bytenr
> >>>>+ * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
> >>>>+ * offset: Last 64 bit of the hash
> >>>>+ *
> >>>>+ * Used for bytenr <-> hash search (for free_extent)
> >>>>+ * all its content is hash.
> >>>>+ * So no special item struct is needed.
> >>>>+ */
> >>>>+
> >>>
> >>>Can we do this instead with a backref from the extent?  It'll save us a
> >>>huge amount of IO as we delete things.
> >>
> >>That's the original implementation from Liu Bo.
> >>
> >>The problem is, it changes the data backref rules (originally, only an
> >>EXTENT_DATA item can cause a data backref), and will make dedupe INCOMPAT
> >>instead of the current RO_COMPAT.
> >>So I really don't want to change the data backref rule.
> >
> >Let me reread this part, the cost of maintaining the second index is
> >dramatically higher than adding a backref.  I do agree that it's nice
> >to be able to delete the dedup trees without impacting the rest, but
> >over the long term I think we'll regret the added balances.
> 
> Thanks for pointing out the problem. Yes, I didn't even consider this fact.
> 
> But on the other hand, such removal only happens when we remove the *last*
> reference to the extent.
> So for the medium-to-high dedupe rate case, this routine is not that frequent,
> which will reduce the impact.
> (Which is quite different from the non-dedupe case.)

It's both addition and removal, and the efficiency hit does depend on
what level of sharing you're able to achieve.  But what we don't want is
for metadata usage to explode as people make small non-duplicate changes
to their FS.  If that happens, we'll only end up using dedup in backup
farms and other highly limited use cases.

I do agree that delayed refs are error prone, but that's a good reason
to fix delayed refs, not to recreate the backrefs of the extent
allocation tree in a new dedicated tree.

-chris

Qu Wenruo March 29, 2016, 1:47 a.m. UTC | #6
Chris Mason wrote on 2016/03/28 10:09 -0400:
> On Sat, Mar 26, 2016 at 09:11:53PM +0800, Qu Wenruo wrote:
>>
>>
>> On 03/25/2016 11:11 PM, Chris Mason wrote:
>>> On Fri, Mar 25, 2016 at 09:59:39AM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> Chris Mason wrote on 2016/03/24 16:58 -0400:
>>>>> Are you storing the entire hash, or just the parts not represented in
>>>>> the key?  I'd like to keep the on-disk part as compact as possible for
>>>>> this part.
>>>>
>>>> Currently, it's the entire hash.
>>>>
>>>> More details can be found in another mail.
>>>>
>>>> Although it's OK with me to truncate the duplicated last 8 bytes (64 bits),
>>>> I still quite like the current implementation, as a single memcpy() is simpler.
>>>
>>> [ sorry FB makes urls look ugly, so I delete them from replies ;) ]
>>>
>>> Right, I saw that but wanted to reply to the specific patch.  One of the
>>> lessons learned from the extent allocation tree and file extent items is
>>> that they are just too big.  Let's save those bytes, it'll add up.
>>
>> OK, I'll reduce the duplicated last 8 bytes.
>>
>> And I'll also remove the "length" member, as it can always be fetched
>> from dedupe_info->block_size.
>
> This would mean dedupe_info->block_size is a write-once field.  I'm OK
> with that (just like the metadata blocksize) but we should make sure the
> ioctls etc. don't allow changing it.

Not a problem; the current block_size change is done by completely 
disabling dedupe (which implies a sync_fs), then re-enabling with the 
new block_size.

So it would be OK.
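A hedged sketch of that flow; the example_* names are illustrative, not
the patchset's actual API, and only the disable-then-re-enable sequence
comes from the discussion:

/* illustrative prototypes; not the patchset's real functions */
int example_dedupe_disable(struct btrfs_fs_info *fs_info);
int example_dedupe_enable(struct btrfs_fs_info *fs_info, u64 blocksize);

static int example_change_dedupe_blocksize(struct btrfs_fs_info *fs_info,
					   u64 new_blocksize)
{
	int ret;

	/* full disable tears down the old state and implies a sync_fs */
	ret = example_dedupe_disable(fs_info);
	if (ret)
		return ret;

	/* re-enable writes a fresh status item with the new blocksize */
	return example_dedupe_enable(fs_info, new_blocksize);
}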

>
>>
>> The length itself was used to verify whether we were in the transaction
>> that switched to a new dedupe size, but since we now use a full sync_fs(),
>> such behavior is not needed any more.
>>
>>
>>>
>>>>
>>>>>
>>>>>> +
>>>>>> +/*
>>>>>> + * Objectid: bytenr
>>>>>> + * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
>>>>>> + * offset: Last 64 bit of the hash
>>>>>> + *
>>>>>> + * Used for bytenr <-> hash search (for free_extent)
>>>>>> + * all its content is hash.
>>>>>> + * So no special item struct is needed.
>>>>>> + */
>>>>>> +
>>>>>
>>>>> Can we do this instead with a backref from the extent?  It'll save us a
>>>>> huge amount of IO as we delete things.
>>>>
>>>> That's the original implementation from Liu Bo.
>>>>
>>>> The problem is, it changes the data backref rules (originally, only an
>>>> EXTENT_DATA item can cause a data backref), and will make dedupe INCOMPAT
>>>> instead of the current RO_COMPAT.
>>>> So I really don't want to change the data backref rule.
>>>
>>> Let me reread this part, the cost of maintaining the second index is
>>> dramatically higher than adding a backref.  I do agree that it's nice
>>> to be able to delete the dedup trees without impacting the rest, but
>>> over the long term I think we'll regret the added balances.
>>
>> Thanks for pointing out the problem. Yes, I didn't even consider this fact.
>>
>> But on the other hand, such removal only happens when we remove the *last*
>> reference to the extent.
>> So for the medium-to-high dedupe rate case, this routine is not that frequent,
>> which will reduce the impact.
>> (Which is quite different from the non-dedupe case.)
>
> It's both addition and removal, and the efficiency hit does depend on
> what level of sharing you're able to achieve.  But what we don't want is
> for metadata usage to explode as people make small non-duplicate changes
> to their FS.  If that happens, we'll only end up using dedup in backup
> farms and other highly limited use cases.

Right, the current dedupe-specific backref does bring unavoidable 
metadata overhead.

[[People accept trade-offs when using a non-default feature]]
Although IMHO dedupe is not a generic feature; just like compression 
and possibly encryption, people choose such features with the trade-off 
in mind.

For example, compression can achieve quite high performance for easily 
compressible data, but can also get quite low performance for 
not-so-compressible data, like ISO files or videos.
(In my test with a 2-core VM and virtio-blk on HDD, dd'ing an ISO into a 
btrfs file gives about 90MB/s with the default mount options, while with 
compression it's only about 40~50MB/s.)

If we combine all the overhead together (not only metadata overhead), 
almost every current transparent data processing method only benefits 
specific use cases while reducing generic performance.

So the increased metadata overhead is acceptable to me, especially when 
the main overhead is CPU time spent on SHA-256.

And we have workarounds, from setting the dedupe-disable property to 
setting a larger dedupe block_size, to avoid small non-dedupe writes 
filling the dedupe tree.


>
> I do agree that delayed refs are error prone, but that's a good reason
> to fix delayed refs, not to recreate the backrefs of the extent
> allocation tree in a new dedicated tree.

[[We need an idea generic enough for both backends]]
Another thing I want to mention: dedupe now has 2 different backends, so 
we'd better choose an idea that won't split the backends across 
different incompat/ro_compat flags.

If we use the backref method, the on-disk backend will definitely make 
dedupe INCOMPAT, affecting the in-memory backend even though it's 
completely backward-compatible.

Or we split the dedupe flag into DEDUPE_ONDISK and DEDUPE_INMEMORY, 
where the former is INCOMPAT while the latter is at most RO_COMPAT (if 
using the dedupe tree).
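For example (hypothetical defines in the existing flag style; the bit
positions are placeholders, not proposed values):

#define BTRFS_FEATURE_INCOMPAT_DEDUPE_ONDISK		(1ULL << 63)
#define BTRFS_FEATURE_COMPAT_RO_DEDUPE_INMEMORY	(1ULL << 62)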


[[Cleaner layout is less bug-prone]]
The last point in favor of the dedupe-specific backref is to limit the 
impact of possible bugs, which for me is more important than performance.

The current implementation limits a dedupe backref bug to dedupe only, 
while a new data backref bug would impact almost all btrfs functionality.

Thanks,
Qu

>
> -chris
>
>
>



Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 022ab61..bed9273 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -100,6 +100,9 @@  struct btrfs_ordered_sum;
 /* tracks free space in block groups. */
 #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
 
+/* on-disk dedupe tree (EXPERIMENTAL) */
+#define BTRFS_DEDUPE_TREE_OBJECTID 11ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -508,6 +511,7 @@  struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE	(1ULL << 0)
+#define BTRFS_FEATURE_COMPAT_RO_DEDUPE		(1ULL << 1)
 
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
@@ -537,7 +541,8 @@  struct btrfs_super_block {
 #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL
 
 #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
-	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
+	 BTRFS_FEATURE_COMPAT_RO_DEDUPE)
 
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
@@ -959,6 +964,42 @@  struct btrfs_csum_item {
 	u8 csum;
 } __attribute__ ((__packed__));
 
+/*
+ * Objectid: 0
+ * Type: BTRFS_DEDUPE_STATUS_ITEM_KEY
+ * Offset: 0
+ */
+struct btrfs_dedupe_status_item {
+	__le64 blocksize;
+	__le64 limit_nr;
+	__le16 hash_type;
+	__le16 backend;
+} __attribute__ ((__packed__));
+
+/*
+ * Objectid: Last 64 bit of the hash
+ * Type: BTRFS_DEDUPE_HASH_ITEM_KEY
+ * Offset: Bytenr of the hash
+ *
+ * Used for hash <-> bytenr search
+ */
+struct btrfs_dedupe_hash_item {
+	/* length of dedupe range */
+	__le32 len;
+
+	/* Hash follows */
+} __attribute__ ((__packed__));
+
+/*
+ * Objectid: bytenr
+ * Type: BTRFS_DEDUPE_BYTENR_ITEM_KEY
+ * offset: Last 64 bit of the hash
+ *
+ * Used for bytenr <-> hash search (for free_extent)
+ * all its content is hash.
+ * So no special item struct is needed.
+ */
+
 struct btrfs_dev_stats_item {
 	/*
 	 * grow this item struct at the end for future enhancements and keep
@@ -2167,6 +2208,13 @@  struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_CHUNK_ITEM_KEY	228
 
 /*
+ * Dedup item and status
+ */
+#define BTRFS_DEDUPE_STATUS_ITEM_KEY	230
+#define BTRFS_DEDUPE_HASH_ITEM_KEY	231
+#define BTRFS_DEDUPE_BYTENR_ITEM_KEY	232
+
+/*
  * Records the overall state of the qgroups.
  * There's only one instance of this key present,
  * (0, BTRFS_QGROUP_STATUS_KEY, 0)
@@ -3263,6 +3311,19 @@  static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 	return offsetof(struct btrfs_leaf, items);
 }
 
+/* btrfs_dedupe_status */
+BTRFS_SETGET_FUNCS(dedupe_status_blocksize, struct btrfs_dedupe_status_item,
+		   blocksize, 64);
+BTRFS_SETGET_FUNCS(dedupe_status_limit, struct btrfs_dedupe_status_item,
+		   limit_nr, 64);
+BTRFS_SETGET_FUNCS(dedupe_status_hash_type, struct btrfs_dedupe_status_item,
+		   hash_type, 16);
+BTRFS_SETGET_FUNCS(dedupe_status_backend, struct btrfs_dedupe_status_item,
+		   backend, 16);
+
+/* btrfs_dedupe_hash_item */
+BTRFS_SETGET_FUNCS(dedupe_hash_len, struct btrfs_dedupe_hash_item, len, 32);
+
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
 BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index ab1aef7..537f0b8 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -58,6 +58,8 @@  struct btrfs_dedupe_hash {
 	u8 hash[];
 };
 
+struct btrfs_root;
+
 struct btrfs_dedupe_info {
 	/* dedupe blocksize */
 	u64 blocksize;
@@ -73,6 +75,9 @@  struct btrfs_dedupe_info {
 	struct list_head lru_list;
 	u64 limit_nr;
 	u64 current_nr;
+
+	/* for persist data like dedup-hash and dedupe status */
+	struct btrfs_root *dedupe_root;
 };
 
 struct btrfs_trans_handle;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3cf4c11..57ae928 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -183,6 +183,7 @@  static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
 	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
 	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	.name_stem = "free-space" },
+	{ .id = BTRFS_DEDUPE_TREE_OBJECTID,	.name_stem = "dedupe"	},
 	{ .id = 0,				.name_stem = "tree"	},
 };
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d866f21..2c3d48a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -47,12 +47,13 @@  struct btrfs_qgroup_operation;
 		{ BTRFS_TREE_RELOC_OBJECTID,	"TREE_RELOC"	},	\
 		{ BTRFS_UUID_TREE_OBJECTID,	"UUID_TREE"	},	\
 		{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },	\
+		{ BTRFS_DEDUPE_TREE_OBJECTID,	"DEDUPE_TREE"	},	\
 		{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
 
 #define show_root_type(obj)						\
 	obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||		\
 	      (obj >= BTRFS_ROOT_TREE_OBJECTID &&			\
-	       obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
+	       obj <= BTRFS_DEDUPE_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
 
 #define BTRFS_GROUP_FLAGS	\
 	{ BTRFS_BLOCK_GROUP_DATA,	"DATA"},	\