diff mbox

[1/2] Btrfs: online data deduplication

Message ID 1365340369-23537-2-git-send-email-bo.li.liu@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Liu Bo April 7, 2013, 1:12 p.m. UTC
(NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)

This introduces the online data deduplication feature for btrfs.

(1) WHY do we need deduplication?
    To improve our storage efficiency.

(2) WHAT is deduplication?
    Two key ways for practical deduplication implementations,
    *  When the data is deduplicated
       (inband vs background)
    *  The granularity of the deduplication.
       (block level vs file level)

    For btrfs, we choose
    *  inband(synchronous)
    *  block level

    We choose them because of the same reason as how zfs does.
    a)  To get an immediate benefit.
    b)  To remove redundant parts within a file.

    So we have an inband, block level data deduplication here.

(3) HOW does deduplication work?
    This makes full use of file extent back reference, the same way as
    IOCTL_CLONE, which lets us easily store multiple copies of a set of
    data as a single copy along with an index of references to the copy.

    Here we have
    a)  a new dedicated tree(DEDUP tree) and
    b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
        (stop 64bits of hash, type, disk offset),
        *  stop 64bits of hash
           It comes from sha256, which is very helpful on avoiding collision.
           And we take the stop 64bits as the index.
        *  disk offset
           It helps to find where the data is stored.

    So the whole deduplication process works as,
    1) write something,
    2) calculate the hash of this "something",
    3) try to find the match of hash value by searching DEDUP keys in
       a dedicated tree, DEDUP tree.
    4) if found, skip real IO and link to the existing copy
       if not, do real IO and insert a DEDUP key to the DEDUP tree.

    For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
    going to increase this unit dynamically in the future.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.h        |   53 ++++++++
 fs/btrfs/disk-io.c      |   33 +++++-
 fs/btrfs/extent-tree.c  |   22 +++-
 fs/btrfs/extent_io.c    |    8 +-
 fs/btrfs/extent_io.h    |   11 ++
 fs/btrfs/file-item.c    |  186 ++++++++++++++++++++++++++
 fs/btrfs/file.c         |    6 +-
 fs/btrfs/inode.c        |  330 +++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ioctl.c        |   34 +++++-
 fs/btrfs/ordered-data.c |   25 +++-
 fs/btrfs/ordered-data.h |    9 ++
 fs/btrfs/print-tree.c   |    6 +-
 fs/btrfs/super.c        |    7 +-
 13 files changed, 685 insertions(+), 45 deletions(-)

Comments

Josef Bacik April 8, 2013, 12:54 p.m. UTC | #1
On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
> (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
> 
> This introduce the online data deduplication feature for btrfs.
> 
> (1) WHY do we need deduplication?
>     To improve our storage effiency.
> 
> (2) WHAT is deduplication?
>     Two key ways for practical deduplication implementations,
>     *  When the data is deduplicated
>        (inband vs background)
>     *  The granularity of the deduplication.
>        (block level vs file level)
> 
>     For btrfs, we choose
>     *  inband(synchronous)
>     *  block level
> 
>     We choose them because of the same reason as how zfs does.
>     a)  To get an immediate benefit.
>     b)  To remove redundant parts within a file.
> 
>     So we have an inband, block level data deduplication here.
> 
> (3) HOW does deduplication works?
>     This makes full use of file extent back reference, the same way as
>     IOCTL_CLONE, which lets us easily store multiple copies of a set of
>     data as a single copy along with an index of references to the copy.
> 
>     Here we have
>     a)  a new dedicated tree(DEDUP tree) and
>     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
>         (stop 64bits of hash, type, disk offset),
>         *  stop 64bits of hash
>            It comes from sha256, which is very helpful on avoiding collision.
>            And we take the stop 64bits as the index.
>         *  disk offset
>            It helps to find where the data is stored.
> 
>     So the whole deduplication process works as,
>     1) write something,
>     2) calculate the hash of this "something",
>     3) try to find the match of hash value by searching DEDUP keys in
>        a dedicated tree, DEDUP tree.
>     4) if found, skip real IO and link to the existing copy
>        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> 
>     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
>     going to increase this unit dynamically in the future.
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
>  fs/btrfs/ctree.h        |   53 ++++++++
>  fs/btrfs/disk-io.c      |   33 +++++-
>  fs/btrfs/extent-tree.c  |   22 +++-
>  fs/btrfs/extent_io.c    |    8 +-
>  fs/btrfs/extent_io.h    |   11 ++
>  fs/btrfs/file-item.c    |  186 ++++++++++++++++++++++++++
>  fs/btrfs/file.c         |    6 +-
>  fs/btrfs/inode.c        |  330 +++++++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/ioctl.c        |   34 +++++-
>  fs/btrfs/ordered-data.c |   25 +++-
>  fs/btrfs/ordered-data.h |    9 ++
>  fs/btrfs/print-tree.c   |    6 +-
>  fs/btrfs/super.c        |    7 +-
>  13 files changed, 685 insertions(+), 45 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0d82922..59339bc 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -32,6 +32,7 @@
>  #include <asm/kmap_types.h>
>  #include <linux/pagemap.h>
>  #include <linux/btrfs.h>
> +#include <crypto/hash.h>
>  #include "extent_io.h"
>  #include "extent_map.h"
>  #include "async-thread.h"
> @@ -94,6 +95,9 @@ struct btrfs_ordered_sum;
>  /* holds quota configuration and tracking */
>  #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
> 
> +/* dedup tree(experimental) */
> +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL
> +
>  /* orhpan objectid for tracking unlinked/truncated files */
>  #define BTRFS_ORPHAN_OBJECTID -5ULL
> 
> @@ -884,12 +888,31 @@ struct btrfs_file_extent_item {
>          */
>         __le64 num_bytes;
> 
> +       /*
> +        * the stop 64bits of sha256 hash value, this helps us find the
> +        * corresponding item in dedup tree.
> +        */
> +       __le64 dedup_hash;
> +
>  } __attribute__ ((__packed__));
> 

Please don't do this, do like what we do with the crc tree, just lookup based on
the disk bytenr when we free the extent and drop refs that way.  Don't further
bloat the file extent item, we want it smaller not larger.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba April 8, 2013, 1:47 p.m. UTC | #2
On Sun, Apr 07, 2013 at 09:12:48PM +0800, Liu Bo wrote:
> (2) WHAT is deduplication?
>     Two key ways for practical deduplication implementations,
>     *  When the data is deduplicated
>        (inband vs background)
>     *  The granularity of the deduplication.
>        (block level vs file level)
> 
>     For btrfs, we choose
>     *  inband(synchronous)
>     *  block level

Block level may be too fine grained leading to excessive fragmentation
and increased metadata usage given that there's a much higher chance to
find duplicate (4k) blocks here and there.

There's always a tradeoff, the practical values that are considered for
granularity range from 8k to 64k, see eg. this paper for graphs and analyses

http://static.usenix.org/event/fast11/tech/full_papers/Meyer.pdf .

This also depends on file data type and access patterns, fixing the dedup
basic chunk size to one block does not IMHO fit most usecases.

> (3) HOW does deduplication works?
...
>     Here we have
>     a)  a new dedicated tree(DEDUP tree) and
>     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
>         (stop 64bits of hash, type, disk offset),
>         *  stop 64bits of hash
>            It comes from sha256, which is very helpful on avoiding collision.
>            And we take the stop 64bits as the index.

Is it safe to use just 64 bits? I'd like to see better reasoning why
this is ok. The limitation of btrfs_key to store only 1-2 64bit items is
clear and must be handled, but it's IMO a critical design point.

>         *  disk offset
>            It helps to find where the data is stored.

Does the disk offset also help to resolving block hash collisions?

>     So the whole deduplication process works as,
>     1) write something,
>     2) calculate the hash of this "something",
>     3) try to find the match of hash value by searching DEDUP keys in
>        a dedicated tree, DEDUP tree.
>     4) if found, skip real IO and link to the existing copy
>        if not, do real IO and insert a DEDUP key to the DEDUP tree.

... how are the hash collisions handled? Using part of a secure hash
cannot be considered equally strong (given that there are no other
safety checks like comparing the whole blocks).

Last but not least, there was another dedup proposal (author CCed)

http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722


david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 8, 2013, 2:08 p.m. UTC | #3
On Mon, Apr 08, 2013 at 03:47:27PM +0200, David Sterba wrote:
> On Sun, Apr 07, 2013 at 09:12:48PM +0800, Liu Bo wrote:
> > (2) WHAT is deduplication?
> >     Two key ways for practical deduplication implementations,
> >     *  When the data is deduplicated
> >        (inband vs background)
> >     *  The granularity of the deduplication.
> >        (block level vs file level)
> > 
> >     For btrfs, we choose
> >     *  inband(synchronous)
> >     *  block level
> 
> Block level may be too fine grained leading to excessive fragmentation
> and increased metadata usage given that there's a much higher chance to
> find duplicate (4k) blocks here and there.
> 
> There's always a tradeoff, the practical values that are considered for
> granularity range from 8k to 64, see eg. this paper for graphs and analyses
> 
> http://static.usenix.org/event/fast11/tech/full_papers/Meyer.pdf .
> 
> This also depends on file data type and access patterns, fixing the dedup
> basic chunk size to one block does not IMHO fit most usecases.

Agreed, that'd definitely be a TODO.

I'd like to make this easier to control so that everyone can do their own
tradeoff depending on local cases.

> 
> > (3) HOW does deduplication works?
> ...
> >     Here we have
> >     a)  a new dedicated tree(DEDUP tree) and
> >     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
> >         (stop 64bits of hash, type, disk offset),
> >         *  stop 64bits of hash
> >            It comes from sha256, which is very helpful on avoiding collision.
> >            And we take the stop 64bits as the index.
> 
> Is it safe to use just 64 bits? I'd like to see better reasoning why
> this is ok. The limitation of btrfs_key to store only 1-2 64bit items is
> clear and must be handled, but it's IMO a critical design point.

Actually I use the whole 256bit hash value(stored in its item) to avoid hash
collision, not just the stop 64bit.

Sorry for the bad commit log, seems I lost my mind somewhere at the time...


> 
> >         *  disk offset
> >            It helps to find where the data is stored.
> 
> Does the disk offset also help to resolving block hash collisions?

Ditto.

> 
> >     So the whole deduplication process works as,
> >     1) write something,
> >     2) calculate the hash of this "something",
> >     3) try to find the match of hash value by searching DEDUP keys in
> >        a dedicated tree, DEDUP tree.
> >     4) if found, skip real IO and link to the existing copy
> >        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> 
> ... how are the hash collisions handled? Using part of a secure has
> cannot be considered equally strong (given that there is not other
> safety checks like comparing the whole blocks).
> 
> Last but not least, there was another dedup proposal (author CCed)
> 
> http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722
> 
> 
> david

I waited for 2 months and wanted to see the actual code from the proposal, but I
failed, so I decided to do it myself.

Anyway I'd like to see this feature in btrfs no matter who writes it.

thanks,
liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 8, 2013, 2:16 p.m. UTC | #4
On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
> On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
> > (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
> > 
> > This introduce the online data deduplication feature for btrfs.
> > 
> > (1) WHY do we need deduplication?
> >     To improve our storage effiency.
> > 
> > (2) WHAT is deduplication?
> >     Two key ways for practical deduplication implementations,
> >     *  When the data is deduplicated
> >        (inband vs background)
> >     *  The granularity of the deduplication.
> >        (block level vs file level)
> > 
> >     For btrfs, we choose
> >     *  inband(synchronous)
> >     *  block level
> > 
> >     We choose them because of the same reason as how zfs does.
> >     a)  To get an immediate benefit.
> >     b)  To remove redundant parts within a file.
> > 
> >     So we have an inband, block level data deduplication here.
> > 
> > (3) HOW does deduplication works?
> >     This makes full use of file extent back reference, the same way as
> >     IOCTL_CLONE, which lets us easily store multiple copies of a set of
> >     data as a single copy along with an index of references to the copy.
> > 
> >     Here we have
> >     a)  a new dedicated tree(DEDUP tree) and
> >     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
> >         (stop 64bits of hash, type, disk offset),
> >         *  stop 64bits of hash
> >            It comes from sha256, which is very helpful on avoiding collision.
> >            And we take the stop 64bits as the index.
> >         *  disk offset
> >            It helps to find where the data is stored.
> > 
> >     So the whole deduplication process works as,
> >     1) write something,
> >     2) calculate the hash of this "something",
> >     3) try to find the match of hash value by searching DEDUP keys in
> >        a dedicated tree, DEDUP tree.
> >     4) if found, skip real IO and link to the existing copy
> >        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> > 
> >     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
> >     going to increase this unit dynamically in the future.
> > 
> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> > ---
> >  fs/btrfs/ctree.h        |   53 ++++++++
> >  fs/btrfs/disk-io.c      |   33 +++++-
> >  fs/btrfs/extent-tree.c  |   22 +++-
> >  fs/btrfs/extent_io.c    |    8 +-
> >  fs/btrfs/extent_io.h    |   11 ++
> >  fs/btrfs/file-item.c    |  186 ++++++++++++++++++++++++++
> >  fs/btrfs/file.c         |    6 +-
> >  fs/btrfs/inode.c        |  330 +++++++++++++++++++++++++++++++++++++++++++----
> >  fs/btrfs/ioctl.c        |   34 +++++-
> >  fs/btrfs/ordered-data.c |   25 +++-
> >  fs/btrfs/ordered-data.h |    9 ++
> >  fs/btrfs/print-tree.c   |    6 +-
> >  fs/btrfs/super.c        |    7 +-
> >  13 files changed, 685 insertions(+), 45 deletions(-)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index 0d82922..59339bc 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -32,6 +32,7 @@
> >  #include <asm/kmap_types.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/btrfs.h>
> > +#include <crypto/hash.h>
> >  #include "extent_io.h"
> >  #include "extent_map.h"
> >  #include "async-thread.h"
> > @@ -94,6 +95,9 @@ struct btrfs_ordered_sum;
> >  /* holds quota configuration and tracking */
> >  #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
> > 
> > +/* dedup tree(experimental) */
> > +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL
> > +
> >  /* orhpan objectid for tracking unlinked/truncated files */
> >  #define BTRFS_ORPHAN_OBJECTID -5ULL
> > 
> > @@ -884,12 +888,31 @@ struct btrfs_file_extent_item {
> >          */
> >         __le64 num_bytes;
> > 
> > +       /*
> > +        * the stop 64bits of sha256 hash value, this helps us find the
> > +        * corresponding item in dedup tree.
> > +        */
> > +       __le64 dedup_hash;
> > +
> >  } __attribute__ ((__packed__));
> > 
> 
> Please don't do this, do like what we do with the crc tree, just lookup based on
> the disk bytenr when we free the extent and drop refs that way.  Don't further
> bloat the file extent item, we want it smaller not larger.  Thanks,
> 
> Josef

So the real trouble is that I take this hash value as the first field of
btrfs_key, and binary searching without the precise first field is not easy.

Otherwise we may have to add another key type which replaces hash value with
disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
search the tree twice to free this one or drop refs.

Either case is tradeoff, but as this is an initial version, we can try all of
these knobs and choose the better one :)

thanks,
liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik April 8, 2013, 8:37 p.m. UTC | #5
On Mon, Apr 08, 2013 at 08:16:26AM -0600, Liu Bo wrote:
> On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
> > On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
> > > (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
> > > 
> > > This introduce the online data deduplication feature for btrfs.
> > > 
> > > (1) WHY do we need deduplication?
> > >     To improve our storage effiency.
> > > 
> > > (2) WHAT is deduplication?
> > >     Two key ways for practical deduplication implementations,
> > >     *  When the data is deduplicated
> > >        (inband vs background)
> > >     *  The granularity of the deduplication.
> > >        (block level vs file level)
> > > 
> > >     For btrfs, we choose
> > >     *  inband(synchronous)
> > >     *  block level
> > > 
> > >     We choose them because of the same reason as how zfs does.
> > >     a)  To get an immediate benefit.
> > >     b)  To remove redundant parts within a file.
> > > 
> > >     So we have an inband, block level data deduplication here.
> > > 
> > > (3) HOW does deduplication works?
> > >     This makes full use of file extent back reference, the same way as
> > >     IOCTL_CLONE, which lets us easily store multiple copies of a set of
> > >     data as a single copy along with an index of references to the copy.
> > > 
> > >     Here we have
> > >     a)  a new dedicated tree(DEDUP tree) and
> > >     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
> > >         (stop 64bits of hash, type, disk offset),
> > >         *  stop 64bits of hash
> > >            It comes from sha256, which is very helpful on avoiding collision.
> > >            And we take the stop 64bits as the index.
> > >         *  disk offset
> > >            It helps to find where the data is stored.
> > > 
> > >     So the whole deduplication process works as,
> > >     1) write something,
> > >     2) calculate the hash of this "something",
> > >     3) try to find the match of hash value by searching DEDUP keys in
> > >        a dedicated tree, DEDUP tree.
> > >     4) if found, skip real IO and link to the existing copy
> > >        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> > > 
> > >     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
> > >     going to increase this unit dynamically in the future.
> > > 
> > > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> > > ---
> > >  fs/btrfs/ctree.h        |   53 ++++++++
> > >  fs/btrfs/disk-io.c      |   33 +++++-
> > >  fs/btrfs/extent-tree.c  |   22 +++-
> > >  fs/btrfs/extent_io.c    |    8 +-
> > >  fs/btrfs/extent_io.h    |   11 ++
> > >  fs/btrfs/file-item.c    |  186 ++++++++++++++++++++++++++
> > >  fs/btrfs/file.c         |    6 +-
> > >  fs/btrfs/inode.c        |  330 +++++++++++++++++++++++++++++++++++++++++++----
> > >  fs/btrfs/ioctl.c        |   34 +++++-
> > >  fs/btrfs/ordered-data.c |   25 +++-
> > >  fs/btrfs/ordered-data.h |    9 ++
> > >  fs/btrfs/print-tree.c   |    6 +-
> > >  fs/btrfs/super.c        |    7 +-
> > >  13 files changed, 685 insertions(+), 45 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > > index 0d82922..59339bc 100644
> > > --- a/fs/btrfs/ctree.h
> > > +++ b/fs/btrfs/ctree.h
> > > @@ -32,6 +32,7 @@
> > >  #include <asm/kmap_types.h>
> > >  #include <linux/pagemap.h>
> > >  #include <linux/btrfs.h>
> > > +#include <crypto/hash.h>
> > >  #include "extent_io.h"
> > >  #include "extent_map.h"
> > >  #include "async-thread.h"
> > > @@ -94,6 +95,9 @@ struct btrfs_ordered_sum;
> > >  /* holds quota configuration and tracking */
> > >  #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
> > > 
> > > +/* dedup tree(experimental) */
> > > +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL
> > > +
> > >  /* orhpan objectid for tracking unlinked/truncated files */
> > >  #define BTRFS_ORPHAN_OBJECTID -5ULL
> > > 
> > > @@ -884,12 +888,31 @@ struct btrfs_file_extent_item {
> > >          */
> > >         __le64 num_bytes;
> > > 
> > > +       /*
> > > +        * the stop 64bits of sha256 hash value, this helps us find the
> > > +        * corresponding item in dedup tree.
> > > +        */
> > > +       __le64 dedup_hash;
> > > +
> > >  } __attribute__ ((__packed__));
> > > 
> > 
> > Please don't do this, do like what we do with the crc tree, just lookup based on
> > the disk bytenr when we free the extent and drop refs that way.  Don't further
> > bloat the file extent item, we want it smaller not larger.  Thanks,
> > 
> > Josef
> 
> So the real trouble is that I take this hash value as the first field of
> btrfs_key, and binary searching without the precise first field is not easy.
> 
> Otherwise we may have to add another key type which replaces hash value with
> disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
> search the tree twice to free this one or drop refs.
> 
> Either case is tradeoff, but as this is an initial version, we can try all of
> these knobs and choose the better one :)
> 

Why would you need to search twice?  You do something like

key.objectid = bytenr;
key.type = whatever your type is
key.offset = first 64bits of your hash

and then you search based on bytenr and you get your hash.  All extents that
share hashes will have the same bytenr so you can just search with

key.objectid = bytenr
key.type = whatever
key.offset = (u64)-1

and then walk backwards, or do 0 and walk forwards, whatever your preference is.
Also make it so the item stores the entire sha.  With Sha256 you are wasting
most of the hash result by just storing the first 64bits.  So just use the first
64bits as the index and then store the full hash in the item so you can do a
proper compare, and then you can have different hash functions later with
different length hash values.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 9, 2013, 1:34 a.m. UTC | #6
On Mon, Apr 08, 2013 at 04:37:20PM -0400, Josef Bacik wrote:
> On Mon, Apr 08, 2013 at 08:16:26AM -0600, Liu Bo wrote:
> > On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
> > > On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
[...]
> > > +       __le64 dedup_hash;
> > > > +
> > > >  } __attribute__ ((__packed__));
> > > > 
> > > 
> > > Please don't do this, do like what we do with the crc tree, just lookup based on
> > > the disk bytenr when we free the extent and drop refs that way.  Don't further
> > > bloat the file extent item, we want it smaller not larger.  Thanks,
> > > 
> > > Josef
> > 
> > So the real trouble is that I take this hash value as the first field of
> > btrfs_key, and binary searching without the precise first field is not easy.
> > 
> > Otherwise we may have to add another key type which replaces hash value with
> > disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
> > search the tree twice to free this one or drop refs.
> > 
> > Either case is tradeoff, but as this is an initial version, we can try all of
> > these knobs and choose the better one :)
> > 
> 
> Why would you need to search twice?  You do something like
> 
> key.objectid = bytenr;
> key.type = whatever your type is
> key.offset = first 64bits of your hash
> 
> and then you search based on bytenr and you get your hash.  All extents that
> share hashes will have the same bytenr so you can just search with
> 
> key.objectid = bytenr
> key.type = whatever
> key.offset = (u64)-1
> 
> and then walk backwards, or do 0 and walk forwards, whatever your preference is.
> Also make it so the item stores the entire sha.  With Sha256 you are wasting
> most of the hash result by just storing the first 64bits.  So just use the first
> 64bits as the index and then store the full hash in the item so you can do a
> proper compare, and then you can have different hash functions later with
> different length hash values.  Thanks,

This is on freeing extent, what about finding dedup when we don't yet have a
disk bytenr but only a hash value from sha256 on file data?

That's why (hash, type, disk bytenr) is necessary.

Searching twice means that if we do what you've suggested, we'd not only update
or free (disk bytenr, type, hash), but also (hash, type, disk bytenr).

Or am I missing something?

thanks,
liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miao Xie April 9, 2013, 1:40 a.m. UTC | #7
On 	mon, 8 Apr 2013 22:16:26 +0800, Liu Bo wrote:
> On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
>> On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
>>> (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
>>>
>>> This introduce the online data deduplication feature for btrfs.
>>>
>>> (1) WHY do we need deduplication?
>>>     To improve our storage effiency.
>>>
>>> (2) WHAT is deduplication?
>>>     Two key ways for practical deduplication implementations,
>>>     *  When the data is deduplicated
>>>        (inband vs background)
>>>     *  The granularity of the deduplication.
>>>        (block level vs file level)
>>>
>>>     For btrfs, we choose
>>>     *  inband(synchronous)
>>>     *  block level
>>>
>>>     We choose them because of the same reason as how zfs does.
>>>     a)  To get an immediate benefit.
>>>     b)  To remove redundant parts within a file.
>>>
>>>     So we have an inband, block level data deduplication here.
>>>
>>> (3) HOW does deduplication works?
>>>     This makes full use of file extent back reference, the same way as
>>>     IOCTL_CLONE, which lets us easily store multiple copies of a set of
>>>     data as a single copy along with an index of references to the copy.
>>>
>>>     Here we have
>>>     a)  a new dedicated tree(DEDUP tree) and
>>>     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
>>>         (stop 64bits of hash, type, disk offset),
>>>         *  stop 64bits of hash
>>>            It comes from sha256, which is very helpful on avoiding collision.
>>>            And we take the stop 64bits as the index.
>>>         *  disk offset
>>>            It helps to find where the data is stored.
>>>
>>>     So the whole deduplication process works as,
>>>     1) write something,
>>>     2) calculate the hash of this "something",
>>>     3) try to find the match of hash value by searching DEDUP keys in
>>>        a dedicated tree, DEDUP tree.
>>>     4) if found, skip real IO and link to the existing copy
>>>        if not, do real IO and insert a DEDUP key to the DEDUP tree.
>>>
>>>     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
>>>     going to increase this unit dynamically in the future.
>>>
>>> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
>>> ---
>>>  fs/btrfs/ctree.h        |   53 ++++++++
>>>  fs/btrfs/disk-io.c      |   33 +++++-
>>>  fs/btrfs/extent-tree.c  |   22 +++-
>>>  fs/btrfs/extent_io.c    |    8 +-
>>>  fs/btrfs/extent_io.h    |   11 ++
>>>  fs/btrfs/file-item.c    |  186 ++++++++++++++++++++++++++
>>>  fs/btrfs/file.c         |    6 +-
>>>  fs/btrfs/inode.c        |  330 +++++++++++++++++++++++++++++++++++++++++++----
>>>  fs/btrfs/ioctl.c        |   34 +++++-
>>>  fs/btrfs/ordered-data.c |   25 +++-
>>>  fs/btrfs/ordered-data.h |    9 ++
>>>  fs/btrfs/print-tree.c   |    6 +-
>>>  fs/btrfs/super.c        |    7 +-
>>>  13 files changed, 685 insertions(+), 45 deletions(-)
>>>
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index 0d82922..59339bc 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -32,6 +32,7 @@
>>>  #include <asm/kmap_types.h>
>>>  #include <linux/pagemap.h>
>>>  #include <linux/btrfs.h>
>>> +#include <crypto/hash.h>
>>>  #include "extent_io.h"
>>>  #include "extent_map.h"
>>>  #include "async-thread.h"
>>> @@ -94,6 +95,9 @@ struct btrfs_ordered_sum;
>>>  /* holds quota configuration and tracking */
>>>  #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
>>>
>>> +/* dedup tree(experimental) */
>>> +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL
>>> +
>>>  /* orhpan objectid for tracking unlinked/truncated files */
>>>  #define BTRFS_ORPHAN_OBJECTID -5ULL
>>>
>>> @@ -884,12 +888,31 @@ struct btrfs_file_extent_item {
>>>          */
>>>         __le64 num_bytes;
>>>
>>> +       /*
>>> +        * the stop 64bits of sha256 hash value, this helps us find the
>>> +        * corresponding item in dedup tree.
>>> +        */
>>> +       __le64 dedup_hash;
>>> +
>>>  } __attribute__ ((__packed__));
>>>
>>
>> Please don't do this, do like what we do with the crc tree, just lookup based on
>> the disk bytenr when we free the extent and drop refs that way.  Don't further
>> bloat the file extent item, we want it smaller not larger.  Thanks,
>>
>> Josef
> 
> So the real trouble is that I take this hash value as the first field of
> btrfs_key, and binary searching without the precise first field is not easy.
> 
> Otherwise we may have to add another key type which replaces hash value with
> disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
> search the tree twice to free this one or drop refs.

Why need we store refs in btrfs_dedup_item structure?

I think the following one is better:
key.objectid = the stop 64bits of sha256 hash value
key.type = whatever
key.offset = bytenr	/* the bytenr of block */

struct btrfs_dedup_item {
	__le64 bytenr,		/* the start bytenr of the extent */
	__le64 len,
}

In this way, we use the refs in btrfs_extent_item to make sure the block is not freed.
And when we truncate the file, all thing we need do is delete the dedup item when we
free the extent just like checksum tree.

Thanks
Miao

> 
> Either case is tradeoff, but as this is an initial version, we can try all of
> these knobs and choose the better one :)
> 
> thanks,
> liubo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik April 9, 2013, 1:48 a.m. UTC | #8
On Mon, Apr 8, 2013 at 9:34 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
> On Mon, Apr 08, 2013 at 04:37:20PM -0400, Josef Bacik wrote:
>> On Mon, Apr 08, 2013 at 08:16:26AM -0600, Liu Bo wrote:
>> > On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
>> > > On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
> [...]
>> > > +       __le64 dedup_hash;
>> > > > +
>> > > >  } __attribute__ ((__packed__));
>> > > >
>> > >
>> > > Please don't do this, do like what we do with the crc tree, just lookup based on
>> > > the disk bytenr when we free the extent and drop refs that way.  Don't further
>> > > bloat the file extent item, we want it smaller not larger.  Thanks,
>> > >
>> > > Josef
>> >
>> > So the real trouble is that I take this hash value as the first field of
>> > btrfs_key, and binary searching without the precise first field is not easy.
>> >
>> > Otherwise we may have to add another key type which replaces hash value with
>> > disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
>> > search the tree twice to free this one or drop refs.
>> >
>> > Either case is tradeoff, but as this is an initial version, we can try all of
>> > these knobs and choose the better one :)
>> >
>>
>> Why would you need to search twice?  You do something like
>>
>> key.objectid = bytenr;
>> key.type = whatever your type is
>> key.offset = first 64bits of your hash
>>
>> and then you search based on bytenr and you get your hash.  All extents that
>> share hashes will have the same bytenr so you can just search with
>>
>> key.objectid = bytenr
>> key.type = whatever
>> key.offset = (u64)-1
>>
>> and then walk backwards, or do 0 and walk forwards, whatever your preference is.
>> Also make it so the item stores the entire sha.  With Sha256 you are wasting
>> most of the hash result by just storing the first 64bits.  So just use the first
>> 64bits as the index and then store the full hash in the item so you can do a
>> proper compare, and then you can have different hash functions later with
>> different length hash values.  Thanks,
>
> This is on freeing extent, what about finding dedup when we don't yet have a
> disk bytenr but only a hash value from sha256 on file data?
>
> That's why (hash, type, disk bytenr) is necessary.
>
> Searching twice means that if we do what you've suggested, we'd not only update
> or free (disk bytenr, type, hash), but also (hash, type, disk bytenr).
>
> Or am I missing something?
>

No sorry I was putting my son to sleep thinking about this and I
realized the problem you were having.  So I say do like we do with
DIR_ITEMs, you have one type indexed by bytenr and one indexed by the
stop 64bits of the hash value.  You just have an empty item for the
index by bytenr and for the index by the hash value you have the
offset of the key be the bytenr and then the item stores the length of
the full hash value and the hash value.  You don't need ref counting
since that's taken care of by the extent tree, you just free the
corresponding dedup items when you free the extent.  It would be nice
to do everything with just one key but I don't think you are going to
be able to do it, unless you can figure out some way to stash the hash
in the extent ref or something.  But growing the file extent item or
the extent ref isn't an option since it affects anybody who doesn't
use dedup, so I think the 2 key option is probably the best bet.
Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miao Xie April 9, 2013, 1:52 a.m. UTC | #9
On 	mon, 8 Apr 2013 15:47:27 +0200, David Sterba wrote:
> On Sun, Apr 07, 2013 at 09:12:48PM +0800, Liu Bo wrote:
>> (2) WHAT is deduplication?
>>     Two key ways for practical deduplication implementations,
>>     *  When the data is deduplicated
>>        (inband vs background)
>>     *  The granularity of the deduplication.
>>        (block level vs file level)
>>
>>     For btrfs, we choose
>>     *  inband(synchronous)
>>     *  block level
> 
> Block level may be too fine grained leading to excessive fragmentation
> and increased metadata usage given that there's a much higher chance to
> find duplicate (4k) blocks here and there.
> 
> There's always a tradeoff, the practical values that are considered for
> granularity ranges from 8k to 64k; see e.g. this paper for graphs and analyses
> 
> http://static.usenix.org/event/fast11/tech/full_papers/Meyer.pdf .
> 
> This also depends on file data type and access patterns, fixing the dedup
> basic chunk size to one block does not IMHO fit most usecases.

Maybe we can make btrfs(including dedup) support the bigalloc just like ext4.

Thanks
Miao

> 
>> (3) HOW does deduplication works?
> ...
>>     Here we have
>>     a)  a new dedicated tree(DEDUP tree) and
>>     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
>>         (stop 64bits of hash, type, disk offset),
>>         *  stop 64bits of hash
>>            It comes from sha256, which is very helpful on avoiding collision.
>>            And we take the stop 64bits as the index.
> 
> Is it safe to use just 64 bits? I'd like to see better reasoning why
> this is ok. The limitation of btrfs_key to store only 1-2 64bit items is
> clear and must be handled, but it's IMO a critical design point.
> 
>>         *  disk offset
>>            It helps to find where the data is stored.
> 
> Does the disk offset also help to resolving block hash collisions?
> 
>>     So the whole deduplication process works as,
>>     1) write something,
>>     2) calculate the hash of this "something",
>>     3) try to find the match of hash value by searching DEDUP keys in
>>        a dedicated tree, DEDUP tree.
>>     4) if found, skip real IO and link to the existing copy
>>        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> 
> ... how are the hash collisions handled? Using part of a secure has
> cannot be considered equally strong (given that there is not other
> safety checks like comparing the whole blocks).
> 
> Last but not least, there was another dedup proposal (author CCed)
> 
> http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722
> 
> 
> david
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marek Otahal April 10, 2013, 12:05 p.m. UTC | #10
Hello, 
this is awesome news! thank you for working on dedup. 

I have some questions about the dedup approach in regards to other layers/features. 

1/ How will the snapshots be handled? 

Whether data would be dedup-ed between snapshots (potentially big saved-space ratio), or would snapshots be considered isolated? Best, if this could be set by the user. My concern is about being error-prone, where with deduping snapshots, actually only 1 copy of the data would exist and a corruption would damage it as well as all snapshots. Or is this not a problem and we say "safety" is handled by RAID? 

2/ Order of dedup/compression? 

What would be done first, compress a file and then compare blocks for duplicates, or the other way around? 

Dedup 1st would save some compression work:
file's block 0000000000 -> hash -> isDup? (if no)-> compress (10x0) -> write
but the problem is that the written data size is unknown (it's not the 1 block at start)

The other way, compressing first, would waste compression cpu-operations on duplicate blocks, but would yield reduced dedup-related metadata usage, as a million zeros would be compressed to a single block and only that one is compared/written. Usefulness here depends on the compression ratio of the file. 

I'm not sure which approach here would be better? 




Thank you for your time and explanation. 
Best wishes, Mark
   
On Sunday 07 April 2013 21:12:48 Liu Bo wrote:
> (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
> 
> This introduce the online data deduplication feature for btrfs.
> 
> (1) WHY do we need deduplication?
>     To improve our storage effiency.
> 
> (2) WHAT is deduplication?
>     Two key ways for practical deduplication implementations,
>     *  When the data is deduplicated
>        (inband vs background)
>     *  The granularity of the deduplication.
>        (block level vs file level)
> 
>     For btrfs, we choose
>     *  inband(synchronous)
>     *  block level
> 
>     We choose them because of the same reason as how zfs does.
>     a)  To get an immediate benefit.
>     b)  To remove redundant parts within a file.
> 
>     So we have an inband, block level data deduplication here.
> 
> (3) HOW does deduplication works?
>     This makes full use of file extent back reference, the same way as
>     IOCTL_CLONE, which lets us easily store multiple copies of a set of
>     data as a single copy along with an index of references to the copy.
> 
>     Here we have
>     a)  a new dedicated tree(DEDUP tree) and
>     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
>         (stop 64bits of hash, type, disk offset),
>         *  stop 64bits of hash
>            It comes from sha256, which is very helpful on avoiding collision.
>            And we take the stop 64bits as the index.
>         *  disk offset
>            It helps to find where the data is stored.
> 
>     So the whole deduplication process works as,
>     1) write something,
>     2) calculate the hash of this "something",
>     3) try to find the match of hash value by searching DEDUP keys in
>        a dedicated tree, DEDUP tree.
>     4) if found, skip real IO and link to the existing copy
>        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> 
>     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
>     going to increase this unit dynamically in the future.
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Liu Bo April 10, 2013, 2:14 p.m. UTC | #11
On Wed, Apr 10, 2013 at 02:05:32PM +0200, Marek Otahal wrote:
> Hello, 
> this is awesome news! thank you for working on dedup. 

Hi,

Your previous thread floating on the LIST did give me some light, thanks :)

> 
> I have some questions about the dedup approach in regards to other layers/features. 
> 
> 1/ How will the snapshots be handled? 
> 
> Whether data would be dedup-ed between snapshots (potentially big saved-space ratio), or would snapshots be considered isolated? Best, if this could be set by the user. My concern is about being error-prone, where with deduping snapshots, actually only 1 copy of the data would exist and a corruption would damage it as well as all snapshots. Or is this not a problem and we say "safety" is handled by RAID? 

I don't think that snapshots should be responsible for data security; even without
dedup, snapshot's files can still share the same disk space/extent with the
source.

So I won't treat dedup on snapshot as a special case.

> 
> 2/ Order of dedup/compression? 
> 
> What would be done first, compress a file and then compare blocks for duplicates, or the other way around? 
> 
> Dedup 1st would save some compression work:
> file's block 0000000000 -> hash -> isDup? (if no)-> compress (10x0) -> write
> but proble is written data size is unknown (it's not the 1 block at start)
> 
> Other way, compress first, would waste compression cpu-operations on duplicate blocks, but would yield reduced dedup-related metadata usage, as 1 million of zeros would be compressed to a single block and that one only is compared/written. Usefullness here depends on the compression ratio of the file. 
> 
> I'm not sure which approach here would be better? 
> 

In my opinion, dedup is a special kind of compression, so it's not worth doing
dedup on compressed data, as I want to keep this feature simple.

I prefer to dedup first rather than compression(but I could be wrong).

thanks,
liubo

> 
> 
> 
> Thank you for your time and explanation. 
> Best wishes, Mark
>    
> On Sunday 07 April 2013 21:12:48 Liu Bo wrote:
> > (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.)
> > 
> > This introduce the online data deduplication feature for btrfs.
> > 
> > (1) WHY do we need deduplication?
> >     To improve our storage effiency.
> > 
> > (2) WHAT is deduplication?
> >     Two key ways for practical deduplication implementations,
> >     *  When the data is deduplicated
> >        (inband vs background)
> >     *  The granularity of the deduplication.
> >        (block level vs file level)
> > 
> >     For btrfs, we choose
> >     *  inband(synchronous)
> >     *  block level
> > 
> >     We choose them because of the same reason as how zfs does.
> >     a)  To get an immediate benefit.
> >     b)  To remove redundant parts within a file.
> > 
> >     So we have an inband, block level data deduplication here.
> > 
> > (3) HOW does deduplication works?
> >     This makes full use of file extent back reference, the same way as
> >     IOCTL_CLONE, which lets us easily store multiple copies of a set of
> >     data as a single copy along with an index of references to the copy.
> > 
> >     Here we have
> >     a)  a new dedicated tree(DEDUP tree) and
> >     b)  a new key(BTRFS_DEDUP_ITEM_KEY), which consists of
> >         (stop 64bits of hash, type, disk offset),
> >         *  stop 64bits of hash
> >            It comes from sha256, which is very helpful on avoiding collision.
> >            And we take the stop 64bits as the index.
> >         *  disk offset
> >            It helps to find where the data is stored.
> > 
> >     So the whole deduplication process works as,
> >     1) write something,
> >     2) calculate the hash of this "something",
> >     3) try to find the match of hash value by searching DEDUP keys in
> >        a dedicated tree, DEDUP tree.
> >     4) if found, skip real IO and link to the existing copy
> >        if not, do real IO and insert a DEDUP key to the DEDUP tree.
> > 
> >     For now, we limit the deduplication unit to PAGESIZE, 4096, and we're
> >     going to increase this unit dynamically in the future.
> > 
> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> 
> -- 
> 
> Marek Otahal :o)
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo April 10, 2013, 2:21 p.m. UTC | #12
On Mon, Apr 08, 2013 at 09:48:16PM -0400, Josef Bacik wrote:
> On Mon, Apr 8, 2013 at 9:34 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
> > On Mon, Apr 08, 2013 at 04:37:20PM -0400, Josef Bacik wrote:
> >> On Mon, Apr 08, 2013 at 08:16:26AM -0600, Liu Bo wrote:
> >> > On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote:
> >> > > On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote:
> > [...]
> >> > > +       __le64 dedup_hash;
> >> > > > +
> >> > > >  } __attribute__ ((__packed__));
> >> > > >
> >> > >
> >> > > Please don't do this, do like what we do with the crc tree, just lookup based on
> >> > > the disk bytenr when we free the extent and drop refs that way.  Don't further
> >> > > bloat the file extent item, we want it smaller not larger.  Thanks,
> >> > >
> >> > > Josef
> >> >
> >> > So the real trouble is that I take this hash value as the first field of
> >> > btrfs_key, and binary searching without the precise first field is not easy.
> >> >
> >> > Otherwise we may have to add another key type which replaces hash value with
> >> > disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to
> >> > search the tree twice to free this one or drop refs.
> >> >
> >> > Either case is tradeoff, but as this is an initial version, we can try all of
> >> > these knobs and choose the better one :)
> >> >
> >>
> >> Why would you need to search twice?  You do something like
> >>
> >> key.objectid = bytenr;
> >> key.type = whatever your type is
> >> key.offset = first 64bits of your hash
> >>
> >> and then you search based on bytenr and you get your hash.  All extents that
> >> share hashes will have the same bytenr so you can just search with
> >>
> >> key.objectid = bytenr
> >> key.type = whatever
> >> key.offset = (u64)-1
> >>
> >> and then walk backwards, or do 0 and walk forwards, whatever your preference is.
> >> Also make it so the item stores the entire sha.  With Sha256 you are wasting
> >> most of the hash result by just storing the first 64bits.  So just use the first
> >> 64bits as the index and then store the full hash in the item so you can do a
> >> proper compare, and then you can have different hash functions later with
> >> different length hash values.  Thanks,
> >
> > This is on freeing extent, what about finding dedup when we don't yet have a
> > disk bytenr but only a hash value from sha256 on file data?
> >
> > That's why (hash, type, disk bytenr) is necessary.
> >
> > Searching twice means that if we do what you've suggested, we'd not only update
> > or free (disk bytenr, type, hash), but also (hash, type, disk bytenr).
> >
> > Or am I missing something?
> >
> 
> No sorry I was putting my son to sleep thinking about this and I
> realized the problem you were having.  So I say do like we do with
> DIR_ITEMs, you have one type indexed by bytenr and one indexed by the
> stop 64bits of the hash value.  You just have an empty item for the
> index by bytenr and for the index by the hash value you have the
> offset of the key be the bytenr and then the item stores the length of
> the full hash value and the hash value.  You don't need ref counting
> since that's taken care of by the extent tree, you just free the
> corresponding dedup items when you free the extent.  It would be nice
> to do everything with just one key but I don't think you are going to
> be able to do it, unless you can figure out some way to stash the hash
> in the extent ref or something.  But growing the file extent item or
> the extent ref isn't an option since it affects anybody who doesn't
> use dedup, so I think the 2 key option is probably the best bet.
> Thanks,
> 
> Josef

Yeah, this makes sense.

All right, I've finished shipping it to two key style(from Josef) and getting
rid of the extra ref(from Josef and Miao).

But I still need to build a framework to enlarge the dedup blocksize rather than
a page size(from Dave).

thanks,
liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba April 10, 2013, 3:42 p.m. UTC | #13
On Mon, Apr 08, 2013 at 10:08:54PM +0800, Liu Bo wrote:
> > Is it safe to use just 64 bits? I'd like to see better reasoning why
> > this is ok. The limitation of btrfs_key to store only 1-2 64bit items is
> > clear and must be handled, but it's IMO a critical design point.
> 
> Actually I use the whole 256bit hash value(stored in its item) to avoid hash
> collision, not just the stop 64bit.

Ok. It's good to point out such things in the changelog/design docs; such
things may not be immediately clear from the code.

> > Last but not least, there was another dedup proposal (author CCed)
> > 
> > http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722
> 
> I waited for 2 months and wanted to see the actual code from the proposal, but I
> failed, so I decided to do it myself.

Quoting from the mail:

"My goal is to study Btrfs design, the offline deduplication patch [1]
and to come up with a design for the online dedup, this semester. I will
be implementing the feature next semester (spring, that is)."

So the implementation starts about now ...

> Anyway I'd like to see this feature in btrfs no matter who write it down.

We'll end up with 2 implementations that are likely doing different
tradeoffs and once one is merged the other one can be thrown away
wasting time and efforts.

Not my problem, but I don't understand why there's another implementation
when the dedup project has been informally claimed (via mailinglist).


david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba April 10, 2013, 3:52 p.m. UTC | #14
On Tue, Apr 09, 2013 at 09:52:42AM +0800, Miao Xie wrote:
> On 	mon, 8 Apr 2013 15:47:27 +0200, David Sterba wrote:
> > This also depends on file data type and access patterns, fixing the dedup
> > basic chunk size to one block does not IMHO fit most usecases.
> 
> Maybe we can make btrfs(including dedup) support the bigalloc just like ext4.

dedup does not strictly need this, I agree it could make a few things
easier to implement. But I think it removes some flexibility once the
bigalloc cluster size is set, what if I want to change the dedup chunk
size because the structure of my data changed?

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d82922..59339bc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -32,6 +32,7 @@ 
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>
+#include <crypto/hash.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -94,6 +95,9 @@  struct btrfs_ordered_sum;
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* dedup tree(experimental) */
+#define BTRFS_DEDUP_TREE_OBJECTID 9ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -884,12 +888,31 @@  struct btrfs_file_extent_item {
 	 */
 	__le64 num_bytes;
 
+	/*
+	 * the stop 64bits of sha256 hash value, this helps us find the
+	 * corresponding item in dedup tree.
+	 */
+	__le64 dedup_hash;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
 	u8 csum;
 } __attribute__ ((__packed__));
 
+/* dedup */
+struct btrfs_dedup_item {
+	__le32 refs;
+	__le64 len;
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEDUP_HASH_SIZE 32
+#define BTRFS_DEDUP_HASH_LEN 4
+
+struct btrfs_dedup_hash_item {
+	__le64 hash[BTRFS_DEDUP_HASH_LEN];
+} __attribute__ ((__packed__));
+
 struct btrfs_dev_stats_item {
 	/*
 	 * grow this item struct at the end for future enhancements and keep
@@ -1287,6 +1310,7 @@  struct btrfs_fs_info {
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
 	struct btrfs_root *csum_root;
+	struct btrfs_root *dedup_root;
 	struct btrfs_root *quota_root;
 
 	/* the log root tree is a directory of all the other log roots */
@@ -1605,6 +1629,9 @@  struct btrfs_fs_info {
 	struct btrfs_dev_replace dev_replace;
 
 	atomic_t mutually_exclusive_operation_running;
+
+	/* reference to the deduplication hash algorithm driver via the crypto API */
+	struct crypto_shash *dedup_driver;
 };
 
 /*
@@ -1866,6 +1893,8 @@  struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_DEV_REPLACE_KEY	250
 
+#define BTRFS_DEDUP_ITEM_KEY	252
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1900,6 +1929,7 @@  struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
+#define BTRFS_MOUNT_DEDUP		(1 << 23)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -2810,6 +2840,8 @@  BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
 		   encryption, 8);
 BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
 		   other_encoding, 16);
+BTRFS_SETGET_FUNCS(file_extent_dedup_hash, struct btrfs_file_extent_item,
+		   dedup_hash, 64);
 
 /* this returns the number of file bytes represented by the inline item.
  * If an item is compressed, this is the uncompressed size
@@ -2833,6 +2865,10 @@  static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
 	return btrfs_item_size(eb, e) - offset;
 }
 
+/* btrfs_dedup_item */
+BTRFS_SETGET_FUNCS(dedup_refs, struct btrfs_dedup_item, refs, 32);
+BTRFS_SETGET_FUNCS(dedup_len, struct btrfs_dedup_item, len, 64);
+
 /* btrfs_dev_stats_item */
 static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
 					struct btrfs_dev_stats_item *ptr,
@@ -3057,6 +3093,10 @@  int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 		      u64 owner, u64 offset, int for_cow);
+int noinline_for_stack btrfs_free_extent_hash(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow, u64 *hash);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -3300,6 +3340,8 @@  static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	if (fs_info->dedup_driver)
+		crypto_free_shash(fs_info->dedup_driver);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
@@ -3475,6 +3517,17 @@  int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			u64 isize);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
+
+int noinline_for_stack
+btrfs_find_dedup_extent(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, u64 *hash, u64 *bytenr_ret);
+int noinline_for_stack
+btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 *hash, u64 bytenr);
+int noinline_for_stack
+btrfs_update_dedup_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 hash, u64 bytenr,
+			  int add);
 /* inode.c */
 struct btrfs_delalloc_work {
 	struct inode *inode;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6d19a0a..0573d6f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1946,6 +1946,10 @@  static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	free_extent_buffer(info->extent_root->commit_root);
 	free_extent_buffer(info->csum_root->node);
 	free_extent_buffer(info->csum_root->commit_root);
+	if (info->dedup_root) {
+		free_extent_buffer(info->dedup_root->node);
+		free_extent_buffer(info->dedup_root->commit_root);
+	}
 	if (info->quota_root) {
 		free_extent_buffer(info->quota_root->node);
 		free_extent_buffer(info->quota_root->commit_root);
@@ -1959,6 +1963,10 @@  static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	info->extent_root->commit_root = NULL;
 	info->csum_root->node = NULL;
 	info->csum_root->commit_root = NULL;
+	if (info->dedup_root) {
+		info->dedup_root->node = NULL;
+		info->dedup_root->commit_root = NULL;
+	}
 	if (info->quota_root) {
 		info->quota_root->node = NULL;
 		info->quota_root->commit_root = NULL;
@@ -1994,6 +2002,7 @@  int open_ctree(struct super_block *sb,
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *quota_root;
+	struct btrfs_root *dedup_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
@@ -2006,9 +2015,10 @@  int open_ctree(struct super_block *sb,
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
 	dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
 	quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
+	dedup_root = fs_info->dedup_root = btrfs_alloc_root(fs_info);
 
 	if (!tree_root || !extent_root || !csum_root ||
-	    !chunk_root || !dev_root || !quota_root) {
+	    !chunk_root || !dev_root || !quota_root || !dedup_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -2331,6 +2341,14 @@  int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
+	fs_info->dedup_driver = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(fs_info->dedup_driver)) {
+		printk(KERN_INFO "Cannot load sha256 driver\n");
+		err = PTR_ERR(fs_info->dedup_driver);
+		fs_info->dedup_driver = NULL;
+		goto fail_alloc;
+	}
+
 	btrfs_init_workers(&fs_info->generic_worker,
 			   "genwork", 1, NULL);
 
@@ -2545,6 +2563,15 @@  retry_root_backup:
 	csum_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_DEDUP_TREE_OBJECTID, dedup_root);
+	if (ret) {
+		kfree(dedup_root);
+		dedup_root = fs_info->dedup_root = NULL;
+	} else {
+		dedup_root->track_dirty = 1;
+	}
+
+	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_QUOTA_TREE_OBJECTID, quota_root);
 	if (ret) {
 		kfree(quota_root);
@@ -3436,6 +3463,10 @@  int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(fs_info->dev_root->commit_root);
 	free_extent_buffer(fs_info->csum_root->node);
 	free_extent_buffer(fs_info->csum_root->commit_root);
+	if (fs_info->dedup_root) {
+		free_extent_buffer(fs_info->dedup_root->node);
+		free_extent_buffer(fs_info->dedup_root->commit_root);
+	}
 	if (fs_info->quota_root) {
 		free_extent_buffer(fs_info->quota_root->node);
 		free_extent_buffer(fs_info->quota_root->commit_root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d84787..e169796 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5651,9 +5651,11 @@  out:
 }
 
 /* Can return -ENOMEM */
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int for_cow)
+int noinline_for_stack
+btrfs_free_extent_hash(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+		       u64 parent, u64 root_objectid, u64 owner, u64 offset,
+		       int for_cow, u64 *hash)
 {
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5673,6 +5675,10 @@  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					parent, root_objectid, (int)owner,
 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
 	} else {
+		if (hash) {
+			ret = btrfs_update_dedup_extent(trans, root, (*hash),
+							bytenr, 0);
+		}
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 						num_bytes,
 						parent, root_objectid, owner,
@@ -5682,6 +5688,16 @@  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * Drop a reference on an extent without touching the dedup tree: thin
+ * wrapper around btrfs_free_extent_hash() passing hash=NULL.
+ *
+ * Can return -ENOMEM.
+ */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow)
+{
+	return btrfs_free_extent_hash(trans, root, bytenr, num_bytes, parent,
+				      root_objectid, owner, offset, for_cow,
+				      NULL);
+}
+
 static u64 stripe_align(struct btrfs_root *root,
 			struct btrfs_block_group_cache *cache,
 			u64 val, u64 num_bytes)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cdee391..a4fd048 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2317,7 +2317,7 @@  int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_writepage(struct bio *bio, int err)
+void end_bio_extent_writepage(struct bio *bio, int err)
 {
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct extent_io_tree *tree;
@@ -2496,8 +2496,8 @@  btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int __must_check submit_one_bio(int rw, struct bio *bio,
-				       int mirror_num, unsigned long bio_flags)
+int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num,
+				unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2536,7 +2536,7 @@  static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 
 }
 
-static int submit_extent_page(int rw, struct extent_io_tree *tree,
+int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
 			      struct block_device *bdev,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 258c921..4176112 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@ 
 #define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_NEED_WAIT (1 << 13)
 #define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_DEDUP (1 << 15)	/* reserved for dedup */
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -343,4 +344,14 @@  int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page,
+		       sector_t sector, size_t size, unsigned long offset,
+		       struct block_device *bdev, struct bio **bio_ret,
+		       unsigned long max_pages, bio_end_io_t end_io_func,
+		       int mirror_num, unsigned long prev_bio_flags,
+		       unsigned long bio_flags);
+void end_bio_extent_writepage(struct bio *bio, int err);
+int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num,
+				unsigned long bio_flags);
+
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c4628a2..781b4a4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -906,3 +906,189 @@  out:
 fail_unlock:
 	goto out;
 }
+
+/*
+ * Look up a dedup item whose full hash matches @hash.
+ *
+ * The dedup tree is keyed by the stop 64 bits of the hash, so after the
+ * key lookup we compare the complete hash stored in the item body to
+ * rule out collisions, walking backwards over items with the same key.
+ *
+ * Returns 1 and stores the existing copy's disk bytenr in @bytenr_ret on
+ * a match, 0 otherwise (also when dedup is not enabled or the path
+ * cannot be allocated).
+ */
+int noinline_for_stack
+btrfs_find_dedup_extent(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, u64 *hash, u64 *bytenr_ret)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_root *dedup_root = root->fs_info->dedup_root;
+	struct btrfs_dedup_item *item;
+	struct btrfs_dedup_hash_item *hash_item;
+	u64 hash_in_item[4];	/* full hash copied out of the item body */
+	u64 hash_value;
+	int found = 0;
+	int ret;
+
+	if (!dedup_root) {
+		printk(KERN_INFO "%s: dedup not enabled\n", __func__);
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return found;
+
+	/* the stop 64 bits of the hash index the dedup tree */
+	hash_value = hash[BTRFS_DEDUP_HASH_LEN - 1];
+
+	key.objectid = hash_value;	/* hash value */
+	key.offset = -1;		/* logical offset */
+	btrfs_set_key_type(&key, BTRFS_DEDUP_ITEM_KEY);
+
+	ret = btrfs_search_slot(trans, dedup_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	/* offset (u64)-1 can never match an existing item exactly */
+	BUG_ON(ret == 0);
+
+prev_slot:
+	/* this will do match checks. */
+	ret = btrfs_previous_item(dedup_root, path, hash_value,
+				  BTRFS_DEDUP_ITEM_KEY);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dedup_item);
+	hash_item = (struct btrfs_dedup_hash_item *)(item + 1);
+	BUG_ON(sizeof(*hash_item) != BTRFS_DEDUP_HASH_SIZE);
+
+	read_extent_buffer(leaf, hash_in_item, ((unsigned long)hash_item),
+			   BTRFS_DEDUP_HASH_SIZE);
+	/* same stop 64 bits but a different full hash: a collision */
+	if (memcmp(hash_in_item, hash, sizeof(*hash_item)))
+		goto prev_slot;
+
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	BUG_ON(!bytenr_ret);
+	*bytenr_ret = key.offset;
+	found = 1;
+out:
+	btrfs_free_path(path);
+	return found;
+}
+
+/*
+ * Insert a new dedup item recording that data hashed to @hash lives at
+ * disk offset @bytenr.  The item key is
+ * (stop 64 bits of hash, BTRFS_DEDUP_ITEM_KEY, bytenr) and the full hash
+ * is stored in the item body for collision checks on lookup.
+ *
+ * Returns 0 on success (or when dedup is not enabled), negative errno
+ * otherwise.
+ */
+int noinline_for_stack
+btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 *hash, u64 bytenr)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_root *dedup_root = root->fs_info->dedup_root;
+	struct btrfs_dedup_item *dedup_item;
+	struct btrfs_dedup_hash_item *hash_item;
+	u64 ins_size;
+	int ret;
+
+	if (!dedup_root) {
+		printk(KERN_INFO "%s: dedup not enabled\n", __func__);
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ins_size = sizeof(*dedup_item) + sizeof(*hash_item);
+
+	/* index by the stop 64 bits, same as btrfs_find_dedup_extent() */
+	key.objectid = hash[BTRFS_DEDUP_HASH_LEN - 1];	/* hash value */
+	key.offset = bytenr;	/* logical offset */
+	btrfs_set_key_type(&key, BTRFS_DEDUP_ITEM_KEY);
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, dedup_root, path, &key,
+				      ins_size);
+	if (ret < 0)
+		goto out;
+	leaf = path->nodes[0];
+
+	dedup_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_dedup_item);
+	btrfs_set_dedup_refs(leaf, dedup_item, 1);
+	/* length is for dedup unit */
+	btrfs_set_dedup_len(leaf, dedup_item, dedup_root->sectorsize);
+
+	/* the full hash lives right after the fixed-size dedup item */
+	hash_item = (struct btrfs_dedup_hash_item *)(dedup_item + 1);
+	BUG_ON(sizeof(*hash_item) != BTRFS_DEDUP_HASH_SIZE);
+
+	write_extent_buffer(leaf, hash, (unsigned long)hash_item,
+			    BTRFS_DEDUP_HASH_SIZE);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+out:
+	/* TODO: EEXIST means that we need to mark it as a dup one */
+	WARN_ON(ret == -EEXIST);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Adjust the refcount of the dedup item keyed by (@hash, @bytenr).
+ *
+ * @add: non-zero takes a reference, zero drops one; the item is deleted
+ * once the refcount reaches zero.
+ *
+ * Returns 0 on success, -ENOENT when no matching item exists, -ENOMEM
+ * on path allocation failure, or a negative error from the tree
+ * search/deletion.  Dedup not being enabled is treated as success.
+ */
+int noinline_for_stack
+btrfs_update_dedup_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 hash, u64 bytenr,
+			  int add)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_root *dedup_root = root->fs_info->dedup_root;
+	struct btrfs_dedup_item *dedup_item = NULL;
+	u32 refs = 0;
+	int ret = 0;
+
+	if (!dedup_root) {
+		printk(KERN_INFO "%s: dedup not enabled\n", __func__);
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = hash;	/* hash value */
+	key.offset = bytenr;	/* logical offset */
+	btrfs_set_key_type(&key, BTRFS_DEDUP_ITEM_KEY);
+
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(trans, dedup_root, &key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+
+	/* paranoia: an exact match from the search is expected here */
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (btrfs_key_type(&key) != BTRFS_DEDUP_ITEM_KEY ||
+	    key.objectid != hash || key.offset != bytenr) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	dedup_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_dedup_item);
+	refs = btrfs_dedup_refs(leaf, dedup_item);
+	BUG_ON(refs < 1);
+	if (add)
+		refs += 1;
+	else
+		refs -= 1;
+
+	if (refs > 0) {
+		btrfs_set_dedup_refs(leaf, dedup_item, refs);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
+		/* last reference gone, remove the item altogether */
+		ret = btrfs_del_item(trans, dedup_root, path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1be25b9..88ff459 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -687,6 +687,7 @@  int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 extent_end = 0;
+	u64 dedup_hash = 0;
 	int del_nr = 0;
 	int del_slot = 0;
 	int extent_type;
@@ -747,6 +748,7 @@  next_slot:
 			extent_offset = btrfs_file_extent_offset(leaf, fi);
 			extent_end = key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
+			dedup_hash = btrfs_file_extent_dedup_hash(leaf, fi);
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = key.offset +
 				btrfs_file_extent_inline_len(leaf, fi);
@@ -874,11 +876,11 @@  next_slot:
 				extent_end = ALIGN(extent_end,
 						   root->sectorsize);
 			} else if (update_refs && disk_bytenr > 0) {
-				ret = btrfs_free_extent(trans, root,
+				ret = btrfs_free_extent_hash(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						key.objectid, key.offset -
-						extent_offset, 0);
+						extent_offset, 0, &dedup_hash);
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b883815..137f7fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -802,6 +802,227 @@  out_free:
 	goto again;
 }
 
+/*
+ * Compute the dedup hash (driver chosen at mount, sha256 per the design)
+ * of @length bytes at @data into @hash.  @hash must hold
+ * BTRFS_DEDUP_HASH_SIZE bytes.
+ *
+ * Returns 0; a digest failure is currently fatal (BUG_ON).
+ */
+static int btrfs_calc_dedup_hash(struct btrfs_root *root, const char *data,
+				 unsigned int length, u64 *hash)
+{
+	struct crypto_shash *tfm = root->fs_info->dedup_driver;
+	/* on-stack shash descriptor sized for the configured driver */
+	struct {
+		struct shash_desc desc;
+		char ctx[crypto_shash_descsize(tfm)];
+	} sdesc;
+	int err;
+
+	sdesc.desc.tfm = tfm;
+	sdesc.desc.flags = 0;
+
+	err = crypto_shash_digest(&sdesc.desc, data, length, (char *)hash);
+	BUG_ON(err);
+
+	return err;
+}
+
+/*
+ * Inband-dedup variant of the delalloc writeback path.
+ *
+ * For each dedup unit (currently one page) in [start, end]:
+ *   1) hash the page contents,
+ *   2) look the hash up in the dedup tree,
+ *   3) on a miss, reserve a new extent and submit real IO,
+ *   4) on a hit, reuse the existing extent and skip the IO entirely.
+ *
+ * Pages in the range are expected to be locked by the caller (see the
+ * find_get_page() below).  Returns 0 on success or a negative errno.
+ */
+static noinline int
+run_delalloc_dedup(struct inode *inode, struct page *locked_page, u64 start,
+		   u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans = NULL;
+	struct bio *bio = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	struct page *page = NULL;
+	struct block_device *bdev;
+	struct btrfs_key ins;
+	u64 blocksize = root->sectorsize;
+	u64 num_bytes;
+	u64 cur_alloc_size;
+	u64 alloc_hint = 0;
+	u64 iosize;
+	u64 ram_size;
+	sector_t sector;
+	int ret = 0;
+
+	BUG_ON(btrfs_is_free_space_inode(inode));
+
+	num_bytes = ALIGN(end - start + 1, blocksize);
+	num_bytes = max(blocksize,  num_bytes);
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+	while (num_bytes > 0) {
+		unsigned long op;
+		int found = 0;
+		u64 hash[4];
+		u64 dedup_bytenr;
+		char *data;
+
+		/* for now we limit the dedup unit to page size, ie, 4k */
+		cur_alloc_size = min_t(u64, num_bytes, blocksize);
+
+		/* page has been locked by caller */
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		BUG_ON(!page);
+		data = kmap_atomic(page);
+		/*
+		 * NOTE(review): the hash always covers a full page even when
+		 * cur_alloc_size < PAGE_CACHE_SIZE at the tail of the range
+		 * -- confirm how partial-page tails are meant to dedup.
+		 */
+		ret = btrfs_calc_dedup_hash(root, data, PAGE_CACHE_SIZE, hash);
+		kunmap_atomic(data);
+
+		found = btrfs_find_dedup_extent(trans, root, hash,
+						&dedup_bytenr);
+		if (!found) {
+			/* miss: allocate a fresh extent for this unit */
+			ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+					   root->sectorsize, 0, alloc_hint,
+					   &ins, 1);
+			if (ret < 0) {
+				SetPageError(page);
+				page_cache_release(page);
+				btrfs_abort_transaction(trans, root, ret);
+				goto out_unlock;
+			}
+		} else { /* found same hash */
+			ins.objectid = dedup_bytenr;
+			ins.offset = cur_alloc_size;
+		}
+
+		/* map [start, start+ins.offset) to the (new or shared) extent */
+		em = alloc_extent_map();
+		BUG_ON(!em); /* -ENOMEM */
+		em->start = start;
+		em->orig_start = em->start;
+		ram_size = ins.offset;
+		em->len = ins.offset;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
+		sector = em->block_start >> 9;
+		/*
+		 * NOTE(review): this aligns the whole remaining range though
+		 * only this one page is submitted below -- presumably should
+		 * be cur_alloc_size; confirm intended IO size.
+		 */
+		iosize = ALIGN(num_bytes, blocksize);
+
+		/* retry until any stale overlapping mapping is dropped */
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ram_size - 1, 0);
+		}
+
+		cur_alloc_size = ins.offset;
+		/* record dedup status + hash for btrfs_finish_ordered_io() */
+		ret = btrfs_add_ordered_extent_dedup(inode, start, ins.objectid,
+						     ram_size, cur_alloc_size,
+						     0, found, hash);
+		BUG_ON(ret); /* -ENOMEM */
+
+		/*
+		 * Do set the Private2 bit so we know this page was properly
+		 * setup for writepage
+		 */
+		op = EXTENT_CLEAR_UNLOCK;
+		op |= EXTENT_CLEAR_DELALLOC | EXTENT_SET_PRIVATE2 |
+		      EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK;
+
+		extent_clear_unlock_delalloc(inode, tree,
+					     start, start + ram_size - 1,
+					     locked_page, op);
+
+		if (page == locked_page)
+			set_page_writeback(page);
+
+		if (!found) {
+			/* TODO: rw can be WRTIE_SYNC */
+			ret = submit_extent_page(WRITE, tree, page, sector,
+						 iosize, 0, bdev, &bio,
+						 0, /* max_nr is no used */
+						 end_bio_extent_writepage,
+						 0, 0, 0);
+			if (ret)
+				break;
+		} else {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				ret = tree->ops->writepage_end_io_hook(page,
+							start,
+							start + ram_size - 1,
+							NULL, 1);
+			/* we need to do this ourselves because we skip IO */
+			end_page_writeback(page);
+		}
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+
+		num_bytes -= cur_alloc_size;
+		alloc_hint = ins.objectid + ins.offset;
+		start += cur_alloc_size;
+		cond_resched();
+	}
+
+	/* reached on a break out of the loop with page still held */
+	if (ret && page)
+		SetPageError(page);
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	/* flush any bio batched up by submit_extent_page() */
+	if (bio) {
+		ret = submit_one_bio(WRITE, bio, 0, 0);
+		BUG_ON(ret < 0);
+		bio = NULL;
+	}
+
+	btrfs_end_transaction(trans, root);
+
+	return ret;
+
+out_unlock:
+	if (ret && page)
+		SetPageError(page);
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	btrfs_end_transaction(trans, root);
+out:
+	/* error path: unlock and clean up the whole untouched range */
+	extent_clear_unlock_delalloc(inode, tree,
+			     start, start + num_bytes - 1,
+			     NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+			     EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+			     EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK |
+			     EXTENT_END_WRITEBACK);
+
+	return ret;
+}
+
 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
 				      u64 num_bytes)
 {
@@ -1044,15 +1265,20 @@  static noinline int cow_file_range(struct inode *inode,
 static noinline void async_cow_start(struct btrfs_work *work)
 {
 	struct async_cow *async_cow;
-	int num_added = 0;
 	async_cow = container_of(work, struct async_cow, work);
 
-	compress_file_range(async_cow->inode, async_cow->locked_page,
-			    async_cow->start, async_cow->end, async_cow,
-			    &num_added);
-	if (num_added == 0) {
-		btrfs_add_delayed_iput(async_cow->inode);
-		async_cow->inode = NULL;
+	if (btrfs_test_opt(async_cow->root, DEDUP)) {
+		run_delalloc_dedup(async_cow->inode, async_cow->locked_page,
+				   async_cow->start, async_cow->end);
+	} else {
+		int num_added = 0;
+		compress_file_range(async_cow->inode, async_cow->locked_page,
+				    async_cow->start, async_cow->end, async_cow,
+				    &num_added);
+		if (num_added == 0) {
+			btrfs_add_delayed_iput(async_cow->inode);
+			async_cow->inode = NULL;
+		}
 	}
 }
 
@@ -1076,8 +1302,10 @@  static noinline void async_cow_submit(struct btrfs_work *work)
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
-	if (async_cow->inode)
-		submit_compressed_extents(async_cow->inode, async_cow);
+	if (async_cow->inode) {
+		if (!btrfs_test_opt(root, DEDUP))
+			submit_compressed_extents(async_cow->inode, async_cow);
+	}
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -1467,21 +1695,23 @@  static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 {
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_inode *bi = BTRFS_I(inode);
 
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+	if (bi->flags & BTRFS_INODE_NODATACOW) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 1, nr_written);
-	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+	} else if (bi->flags & BTRFS_INODE_PREALLOC) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
 	} else if (!btrfs_test_opt(root, COMPRESS) &&
-		   !(BTRFS_I(inode)->force_compress) &&
-		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+		   !(bi->force_compress) &&
+		   !(bi->flags & BTRFS_INODE_COMPRESS) &&
+		   !btrfs_test_opt(root, DEDUP)) {
 		ret = cow_file_range(inode, locked_page, start, end,
 				      page_started, nr_written, 1);
 	} else {
 		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			&BTRFS_I(inode)->runtime_flags);
+			&bi->runtime_flags);
 		ret = cow_file_range_async(inode, locked_page, start, end,
 					   page_started, nr_written);
 	}
@@ -1876,12 +2106,13 @@  static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	return -EBUSY;
 }
 
-static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
-				       struct inode *inode, u64 file_pos,
-				       u64 disk_bytenr, u64 disk_num_bytes,
-				       u64 num_bytes, u64 ram_bytes,
-				       u8 compression, u8 encryption,
-				       u16 other_encoding, int extent_type)
+static int __insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+					 struct inode *inode, u64 file_pos,
+					 u64 disk_bytenr, u64 disk_num_bytes,
+					 u64 num_bytes, u64 ram_bytes,
+					 u8 compression, u8 encryption,
+					 u16 other_encoding, int extent_type,
+					 int dedup, u64 *hash)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *fi;
@@ -1929,6 +2160,11 @@  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_compression(leaf, fi, compression);
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+	if (hash)
+		btrfs_set_file_extent_dedup_hash(leaf, fi,
+					hash[BTRFS_DEDUP_HASH_LEN - 1]);
+	else
+		btrfs_set_file_extent_dedup_hash(leaf, fi, 0);
 
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
@@ -1938,15 +2174,49 @@  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_file_extent(trans, root,
-					root->root_key.objectid,
-					btrfs_ino(inode), file_pos, &ins);
+
+	if (!dedup) {
+		if (hash)
+			ret = btrfs_insert_dedup_extent(trans, root, hash,
+							disk_bytenr);
+
+		ret = btrfs_alloc_reserved_file_extent(trans, root,
+						       root->root_key.objectid,
+						       btrfs_ino(inode),
+						       file_pos, &ins);
+	} else {
+		if (hash)
+			ret = btrfs_update_dedup_extent(trans, root,
+						hash[BTRFS_DEDUP_HASH_LEN - 1],
+						disk_bytenr, 1);
+
+		ret = btrfs_inc_extent_ref(trans, root, ins.objectid,
+					   ins.offset, 0,
+					   root->root_key.objectid,
+					   btrfs_ino(inode),
+					   file_pos, /* file_pos - 0 */
+					   0);
+	}
 out:
 	btrfs_free_path(path);
 
 	return ret;
 }
 
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+				       struct inode *inode, u64 file_pos,
+				       u64 disk_bytenr, u64 disk_num_bytes,
+				       u64 num_bytes, u64 ram_bytes,
+				       u8 compression, u8 encryption,
+				       u16 other_encoding, int extent_type)
+{
+	return __insert_reserved_file_extent(trans, inode, file_pos,
+					     disk_bytenr, disk_num_bytes,
+					     num_bytes, ram_bytes, compression,
+					     encryption, other_encoding,
+					     extent_type, 0, NULL);
+}
+
 /* snapshot-aware defrag */
 struct sa_defrag_extent_backref {
 	struct rb_node node;
@@ -2671,14 +2941,16 @@  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->len);
 	} else {
 		BUG_ON(root == root->fs_info->tree_root);
-		ret = insert_reserved_file_extent(trans, inode,
+		ret = __insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
 						ordered_extent->disk_len,
 						ordered_extent->len,
 						ordered_extent->len,
 						compress_type, 0, 0,
-						BTRFS_FILE_EXTENT_REG);
+						BTRFS_FILE_EXTENT_REG,
+						ordered_extent->dedup,
+						ordered_extent->hash);
 	}
 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
 			   ordered_extent->file_offset, ordered_extent->len,
@@ -4031,6 +4303,7 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_offset = 0;
 	u64 item_end = 0;
 	u32 found_type = (u8)-1;
+	u64 dedup_hash = 0;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
@@ -4149,6 +4422,8 @@  search_again:
 									 fi);
 				extent_offset = found_key.offset -
 					btrfs_file_extent_offset(leaf, fi);
+				dedup_hash = btrfs_file_extent_dedup_hash(leaf,
+									  fi);
 
 				/* FIXME blocksize != 4096 */
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
@@ -4202,10 +4477,11 @@  delete:
 		if (found_extent && (root->ref_cows ||
 				     root == root->fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
-			ret = btrfs_free_extent(trans, root, extent_start,
+			ret = btrfs_free_extent_hash(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
-						ino, extent_offset, 0);
+						ino, extent_offset, 0,
+						&dedup_hash);
 			BUG_ON(ret);
 		}
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 898c572..89efad4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4017,6 +4017,37 @@  out_unlock:
 	return ret;
 }
 
+/*
+ * Enable inband dedup: create the dedicated dedup tree in its own
+ * transaction and hook it into fs_info on success.
+ *
+ * Returns 0 on success, negative errno on failure.
+ */
+static long btrfs_ioctl_enable_dedup(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_root *dedup_root;
+	int ret = 0;
+
+	trans = btrfs_start_transaction(root, 2);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		printk(KERN_INFO "trans error %d\n", ret);
+		return ret;
+	}
+
+	dedup_root = btrfs_create_tree(trans, fs_info,
+				       BTRFS_DEDUP_TREE_OBJECTID);
+	if (IS_ERR(dedup_root))
+		ret = PTR_ERR(dedup_root);
+
+	if (ret) {
+		/* don't let end_transaction() clobber the creation error */
+		btrfs_end_transaction(trans, root);
+	} else {
+		ret = btrfs_commit_transaction(trans, root);
+	}
+
+	if (!ret) {
+		printk(KERN_INFO "dedup enabled\n");
+		fs_info->dedup_root = dedup_root;
+	}
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -4083,7 +4114,8 @@  long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SPACE_INFO:
 		return btrfs_ioctl_space_info(root, argp);
 	case BTRFS_IOC_SYNC:
-		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+//		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		btrfs_ioctl_enable_dedup(root);
 		return 0;
 	case BTRFS_IOC_START_SYNC:
 		return btrfs_ioctl_start_sync(root, argp);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 005c45d..306a7ef 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -182,7 +182,8 @@  static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 				      u64 start, u64 len, u64 disk_len,
-				      int type, int dio, int compress_type)
+				      int type, int dio, int compress_type,
+				      int dedup, u64 *hash)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -201,6 +202,13 @@  static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 		entry->csum_bytes_left = disk_len;
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
+	entry->dedup = dedup;
+
+	if (hash) {
+		BUG_ON(sizeof(entry->hash) != BTRFS_DEDUP_HASH_SIZE);
+		memcpy(entry->hash, hash, sizeof(entry->hash));
+	}
+
 	entry->inode = igrab(inode);
 	entry->compress_type = compress_type;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
@@ -240,7 +248,16 @@  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 {
 	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 					  disk_len, type, 0,
-					  BTRFS_COMPRESS_NONE);
+					  BTRFS_COMPRESS_NONE, 0, NULL);
+}
+
+/*
+ * Add an ordered extent for the dedup write path.  @dedup flags whether
+ * the data was deduplicated (real IO skipped); @hash carries the dedup
+ * hash to be recorded at finish-ordered time.  Non-dio, uncompressed.
+ */
+int btrfs_add_ordered_extent_dedup(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len, u64 disk_len, int type,
+			     int dedup, u64 *hash)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 0,
+					  BTRFS_COMPRESS_NONE, dedup, hash);
+}
 
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
@@ -248,7 +265,7 @@  int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 {
 	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 					  disk_len, type, 1,
-					  BTRFS_COMPRESS_NONE);
+					  BTRFS_COMPRESS_NONE, 0, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -257,7 +274,7 @@  int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 {
 	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 					  disk_len, type, 0,
-					  compress_type);
+					  compress_type, 0, NULL);
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8eadfe4..d953295 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -114,6 +114,9 @@  struct btrfs_ordered_extent {
 	/* compression algorithm */
 	int compress_type;
 
+	/* if this is for dedup or not */
+	int dedup;
+
 	/* reference count */
 	atomic_t refs;
 
@@ -140,6 +143,9 @@  struct btrfs_ordered_extent {
 	struct completion completion;
 	struct btrfs_work flush_work;
 	struct list_head work_list;
+
+	/* dedup hash of sha256 type */
+	u64 hash[BTRFS_DEDUP_HASH_LEN];
 };
 
 /*
@@ -176,6 +182,9 @@  int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 				   int uptodate);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dedup(struct inode *inode, u64 file_offset,
+				   u64 start, u64 len, u64 disk_len, int type,
+				   int dedup, u64 *hash);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 				 u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 920957e..50ac9db 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -250,13 +250,15 @@  void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       (unsigned long long)
 			       btrfs_file_extent_disk_num_bytes(l, fi));
 			printk(KERN_INFO "\t\textent data offset %llu "
-			       "nr %llu ram %llu\n",
+			       "nr %llu ram %llu dedup_hash %llu\n",
 			       (unsigned long long)
 			       btrfs_file_extent_offset(l, fi),
 			       (unsigned long long)
 			       btrfs_file_extent_num_bytes(l, fi),
 			       (unsigned long long)
-			       btrfs_file_extent_ram_bytes(l, fi));
+			       btrfs_file_extent_ram_bytes(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_dedup_hash(l, fi));
 			break;
 		case BTRFS_EXTENT_REF_V0_KEY:
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68a29a1..37ca62a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -320,7 +320,7 @@  enum {
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
 	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
 	Opt_check_integrity, Opt_check_integrity_including_extent_data,
-	Opt_check_integrity_print_mask, Opt_fatal_errors,
+	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_dedup,
 	Opt_err,
 };
 
@@ -361,6 +361,7 @@  static match_table_t tokens = {
 	{Opt_check_integrity_including_extent_data, "check_int_data"},
 	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
 	{Opt_fatal_errors, "fatal_errors=%s"},
+	{Opt_dedup, "dedup"},
 	{Opt_err, NULL},
 };
 
@@ -626,6 +627,10 @@  int btrfs_parse_options(struct btrfs_root *root, char *options)
 				goto out;
 			}
 			break;
+		case Opt_dedup:
+			printk(KERN_INFO "btrfs: use deduplication\n");
+			btrfs_set_opt(info->mount_opt, DEDUP);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);