[1/4] btrfs: skip superblocks during discard
diff mbox

Message ID 1434036062-21597-2-git-send-email-jeffm@suse.com
State Superseded
Headers show

Commit Message

Jeff Mahoney June 11, 2015, 3:20 p.m. UTC
From: Jeff Mahoney <jeffm@suse.com>

Btrfs doesn't track superblocks with extent records so there is nothing
persistent on-disk to indicate that those blocks are in use.  We track
the superblocks in memory to ensure they don't get used by removing them
from the free space cache when we load a block group from disk.  Prior
to 47ab2a6c6a (Btrfs: remove empty block groups automatically), that
was fine since the block group would never be reclaimed so the superblock
was always safe.  Once we started removing the empty block groups, we
were protected by the fact that discards weren't being properly issued
for unused space either via FITRIM or -odiscard.  The block groups were
still being released, but the blocks remained on disk.

In order to properly discard unused block groups, we need to filter out
the superblocks from the discard range.  Superblocks are located at fixed
locations on each device, so it makes sense to filter them out in
btrfs_issue_discard, which is used by both -odiscard and FITRIM.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/extent-tree.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

Comments

Jeff Mahoney June 11, 2015, 3:25 p.m. UTC | #1
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 6/11/15 11:20 AM, jeffm@suse.com wrote:
> From: Jeff Mahoney <jeffm@suse.com>
> 
> Btrfs doesn't track superblocks with extent records so there is
> nothing persistent on-disk to indicate that those blocks are in
> use.  We track the superblocks in memory to ensure they don't get
> used by removing them from the free space cache when we load a
> block group from disk.  Prior to 47ab2a6c6a (Btrfs: remove empty
> block groups automatically), that was fine since the block group
> would never be reclaimed so the superblock was always safe.  Once
> we started removing the empty block groups, we were protected by
> the fact that discards weren't being properly issued for unused
> space either via FITRIM or -odiscard.  The block groups were still
> being released, but the blocks remained on disk.
> 
> In order to properly discard unused block groups, we need to filter
> out the superblocks from the discard range.  Superblocks are
> located at fixed locations on each device, so it makes sense to
> filter them out in btrfs_issue_discard, which is used by both
> -odiscard and FITRIM.
> 
> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
> fs/btrfs/extent-tree.c | 50
> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed,
> 44 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index
> 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++
> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int
> remove_extent_backref(struct btrfs_trans_handle *trans, return
> ret; }
> 
> -static int btrfs_issue_discard(struct block_device *bdev, -				u64
> start, u64 len) +#define in_range(b, first, len)	((b) >= (first) &&
> (b) < (first) + (len)) +static int btrfs_issue_discard(struct
> block_device *bdev, u64 start, u64 len, +			       u64
> *discarded_bytes) { -	return blkdev_issue_discard(bdev, start >> 9,
> len >> 9, GFP_NOFS, 0); +	u64 skipped = 0; +	u64 bytes_left = len; 
> +	int ret = 0 , j; + +	*discarded_bytes = 0; + +	/* Skip any
> superblocks on this device. */ +	for (j = 0; j <
> BTRFS_SUPER_MIRROR_MAX; j++) { +		u64 sb_offset =
> btrfs_sb_offset(j); +		u64 size = sb_offset - start; + +		if
> (!in_range(sb_offset, start, bytes_left)) +			continue; + +		if
> (size) { +			ret = blkdev_issue_discard(bdev, start >> 9, size >>
> 9, +						   GFP_NOFS, 0); +			if (!ret) +				*discarded_bytes +=
> size; +			else if (ret != -EOPNOTSUPP) +				return ret; +		} + +
> start = sb_offset + BTRFS_SUPER_INFO_SIZE; +		if (start > len) +
> start = len; +		bytes_left = len - start; +		skipped +=
> BTRFS_SUPER_INFO_SIZE;

Whoops. This is an unused artifact from an earlier version.

- -Jeff

> +	} + +	if (bytes_left) { +		ret = blkdev_issue_discard(bdev, start
> >> 9, bytes_left >> 9, +					   GFP_NOFS, 0); +		if (!ret) +
> *discarded_bytes += bytes_left; +	} +	return ret; }
> 
> int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@
> -1906,16 +1943,17 @@ int btrfs_discard_extent(struct btrfs_root
> *root, u64 bytenr, struct btrfs_bio_stripe *stripe =
> bbio->stripes; int i;
> 
> - for (i = 0; i < bbio->num_stripes; i++, stripe++) { +			u64
> bytes; + if (!stripe->dev->can_discard) continue;
> 
> ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, -
> stripe->length); +						  stripe->length, &bytes); if (!ret) -
> discarded_bytes += stripe->length; +				discarded_bytes += bytes; 
> else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or
> -EIO but I don't know how that could happen JDM */
> 
> 


- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVeahbAAoJEB57S2MheeWyA3YP/00sK18TqJi6ohu0l18otsd2
vG2lAtPUDT6f4dFM1GvO4PYLssuwm0Vy98qFYO30lvcHbdbajhsGd1qk+IzEq3Dr
+Qxze9lIrX8267nRE3aqu1I8/y2Gn5Wcy5Xr6fwhN0g+m5J+fWZHy1mfTvqbNt+M
U7Cg4hgvTFDcuV5adrg3JoQ3W/w6IvgIid27oNZ0MJMaK4f2jcLfmhGVCTcCnydp
7fEUsMow2firGJew7161ORYkYjTn76JW2HXB8ETOgWUcMpZon1KWduVXOTD0r9sk
UrfMJ8dLw6334T265gmQrBXvb3yOBLbyRYuksUFhXFUo1/xDSQMLGMBrkEf6T/Vo
mwnXVw5hnPM41vZiJ4Pc56vg1Yy4JGMXs28XEyHrj7gQWo3b1jRw0YhosZE/TR5+
BsJb4KqBTPor/upPLmDwIOYU4Ia+GZ5V2k1jYsE9qs6khHXlX9is8WHeB/fMzfCe
zkZEl8YI3ek50wXcqBk+QHFj8LT89Eyq59O3iaCFceIhvjO87ldRDBtTtBR34ogf
mZphx9UzVTcY31wftDYKMMi2Xs+8BZgXq7H7IXc2muugFlK9AGsMv4iXPbVkmUmE
T4pS2vKqCwfkW9bGf7ufQaOpVgwzQjDkC/Ngtybl8sULroOjI3lm6p+2/jMuqyYd
twV2FaYAJAI/tiQkTken
=1g/b
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana June 11, 2015, 4:47 p.m. UTC | #2
On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> wrote:
> From: Jeff Mahoney <jeffm@suse.com>
>
> Btrfs doesn't track superblocks with extent records so there is nothing
> persistent on-disk to indicate that those blocks are in use.  We track
> the superblocks in memory to ensure they don't get used by removing them
> from the free space cache when we load a block group from disk.  Prior
> to 47ab2a6c6a (Btrfs: remove empty block groups automatically), that
> was fine since the block group would never be reclaimed so the superblock
> was always safe.  Once we started removing the empty block groups, we
> were protected by the fact that discards weren't being properly issued
> for unused space either via FITRIM or -odiscard.  The block groups were
> still being released, but the blocks remained on disk.
>
> In order to properly discard unused block groups, we need to filter out
> the superblocks from the discard range.  Superblocks are located at fixed
> locations on each device, so it makes sense to filter them out in
> btrfs_issue_discard, which is used by both -odiscard and FITRIM.
>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/extent-tree.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 44 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0ec3acd..75d0226 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -1884,10 +1884,47 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
>         return ret;
>  }
>
> -static int btrfs_issue_discard(struct block_device *bdev,
> -                               u64 start, u64 len)
> +#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))

Hi Jeff,

So this will work if every caller behaves well and passes a region
whose start and end offsets are a multiple of the sector size (4096)
which currently matches the superblock size.

However, I think it would be safer to check for the case where the
start offset of a superblock mirror is < (first) and (sb_offset +
sb_len) > (first).  Just to deal with cases where for example the 2nd
half of the sb starts at offset (first).

I guess this sectorsize becoming less than 4096 will happen sooner or
later with the subpage sectorsize patch set, so it wouldn't hurt to
make it more bullet proof already.

Otherwise it looks good to me.
I'll give a test on this patchset soon.

thanks

> +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
> +                              u64 *discarded_bytes)
>  {
> -       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
> +       u64 skipped = 0;
> +       u64 bytes_left = len;
> +       int ret = 0 , j;
> +
> +       *discarded_bytes = 0;
> +
> +       /* Skip any superblocks on this device. */
> +       for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
> +               u64 sb_offset = btrfs_sb_offset(j);
> +               u64 size = sb_offset - start;
> +
> +               if (!in_range(sb_offset, start, bytes_left))
> +                       continue;
> +
> +               if (size) {
> +                       ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
> +                                                  GFP_NOFS, 0);
> +                       if (!ret)
> +                               *discarded_bytes += size;
> +                       else if (ret != -EOPNOTSUPP)
> +                               return ret;
> +               }
> +
> +               start = sb_offset + BTRFS_SUPER_INFO_SIZE;
> +               if (start > len)
> +                       start = len;
> +               bytes_left = len - start;
> +               skipped += BTRFS_SUPER_INFO_SIZE;
> +       }
> +
> +       if (bytes_left) {
> +               ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
> +                                          GFP_NOFS, 0);
> +               if (!ret)
> +                       *discarded_bytes += bytes_left;
> +       }
> +       return ret;
>  }
>
>  int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
> @@ -1906,16 +1943,17 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
>                 struct btrfs_bio_stripe *stripe = bbio->stripes;
>                 int i;
>
> -
>                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
> +                       u64 bytes;
> +
>                         if (!stripe->dev->can_discard)
>                                 continue;
>
>                         ret = btrfs_issue_discard(stripe->dev->bdev,
>                                                   stripe->physical,
> -                                                 stripe->length);
> +                                                 stripe->length, &bytes);
>                         if (!ret)
> -                               discarded_bytes += stripe->length;
> +                               discarded_bytes += bytes;
>                         else if (ret != -EOPNOTSUPP)
>                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
>
> --
> 1.8.4.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney June 11, 2015, 6:17 p.m. UTC | #3
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 6/11/15 12:47 PM, Filipe David Manana wrote:
> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> wrote:
>> From: Jeff Mahoney <jeffm@suse.com>
>> 
>> Btrfs doesn't track superblocks with extent records so there is
>> nothing persistent on-disk to indicate that those blocks are in
>> use.  We track the superblocks in memory to ensure they don't get
>> used by removing them from the free space cache when we load a
>> block group from disk.  Prior to 47ab2a6c6a (Btrfs: remove empty
>> block groups automatically), that was fine since the block group
>> would never be reclaimed so the superblock was always safe.  Once
>> we started removing the empty block groups, we were protected by
>> the fact that discards weren't being properly issued for unused
>> space either via FITRIM or -odiscard.  The block groups were 
>> still being released, but the blocks remained on disk.
>> 
>> In order to properly discard unused block groups, we need to
>> filter out the superblocks from the discard range.  Superblocks
>> are located at fixed locations on each device, so it makes sense
>> to filter them out in btrfs_issue_discard, which is used by both
>> -odiscard and FITRIM.
>> 
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>> fs/btrfs/extent-tree.c | 50
>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file
>> changed, 44 insertions(+), 6 deletions(-)
>> 
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c 
>> index 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++
>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int
>> remove_extent_backref(struct btrfs_trans_handle *trans, return
>> ret; }
>> 
>> -static int btrfs_issue_discard(struct block_device *bdev, -
>> u64 start, u64 len) +#define in_range(b, first, len)        ((b)
>> >= (first) && (b) < (first) + (len))
> 
> Hi Jeff,
> 
> So this will work if every caller behaves well and passes a region 
> whose start and end offsets are a multiple of the sector size
> (4096) which currently matches the superblock size.
> 
> However, I think it would be safer to check for the case where the 
> start offset of a superblock mirror is < (first) and (sb_offset + 
> sb_len) > (first).  Just to deal with cases where for example the
> 2nd half of the sb starts at offset (first).
> 
> I guess this sectorsize becoming less than 4096 will happen sooner
> or later with the subpage sectorsize patch set, so it wouldn't hurt
> to make it more bullet proof already.

Is that something anyone intends to support?  While I suppose the
subpage sector patch /could/ be used to allow file systems with a node
size under 4k, the intention is the other way around -- systems that
have higher order page sizes currently don't work with btrfs file
system created on systems with smaller order page sizes like x86.
Btrfs already has high enough metadata overhead.  Pretty much all new
hardware has, at least, a native 4k sector size even if it's
abstracted behind a RMW layer.  The sectors are only going to get
larger.  With the metadata overhead that btrfs already incurs, I can't
imagine any production use case with smaller sector sizes.

Are we looking to support <4k nodes to test the subpage sector code on
x86?  If so, then I'll change this to handle the possibility of
superblocks crossing sector boundaries.  Otherwise, it's protecting
against a use case that just shouldn't happen.

> Otherwise it looks good to me. I'll give a test on this patchset
> soon.

Thanks,

- -Jeff


- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVedCiAAoJEB57S2MheeWymA4P/2Y+YGwAkcbYxrlMVH/wIGpN
j+qO/y1YHNJQNiRHt/1ahM8wy2DEEbqrD36xUuX5lDYRREo9jeqIGGajOOHg/KFw
7q6zIWVEEwom/RKd9CX48TC2pHly5Fw8ESyi+A857KJ1ZHcpKdyNcIwle/Jsoe0q
a+SX6hJPlHFVai/QZhBRO00ZgXlTwjXAGyfgmfuHERY6lS5PBmoA8tYxnigpnBOa
PUrgw+Cdn4glrZUTpt3WN4H5oE+pD6cGMQ+GnFXQYaytssyQNuPpCWdQ1Aferg7u
Af3E6iBj776bQIRTWZSwjXTMLWHVjnBmdU8D+wXE7Ar3oU1POL7NLvnGm4Yr9TWZ
n1NXcBhJ4QQQXBprK3bI+WNSzMzMLdvJHIb5t2Z9BO8wd5Ws0QlmT8Gl+u/ZofZU
8eouhLgu1hZzPeJgXPuDu0S8QaFtpI0zlupOwJByHp9QSzpUw98xqlFXIRtnLc48
n/ZMFi98EMroaQx0hzFLGMwp/57tUFWUUroDkfP1NngoE482ohPHAdREtr7+RZe6
h+pfF+//5CycYxk8BciBmCyLWWTW1WdDL/MzQsXdKE+797cNk79+W5EKYLmQiAbW
IGyRXMj+XPpWNC1VM12JJXdoOujSGocmJa28jKanZRzqw0HKJOlTUBqNoBV83Bph
ChcTZyhAi3PCsib+mNnW
=wb8l
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana June 11, 2015, 6:44 p.m. UTC | #4
On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> On 6/11/15 12:47 PM, Filipe David Manana wrote:
>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> wrote:
>>> From: Jeff Mahoney <jeffm@suse.com>
>>>
>>> Btrfs doesn't track superblocks with extent records so there is
>>> nothing persistent on-disk to indicate that those blocks are in
>>> use.  We track the superblocks in memory to ensure they don't get
>>> used by removing them from the free space cache when we load a
>>> block group from disk.  Prior to 47ab2a6c6a (Btrfs: remove empty
>>> block groups automatically), that was fine since the block group
>>> would never be reclaimed so the superblock was always safe.  Once
>>> we started removing the empty block groups, we were protected by
>>> the fact that discards weren't being properly issued for unused
>>> space either via FITRIM or -odiscard.  The block groups were
>>> still being released, but the blocks remained on disk.
>>>
>>> In order to properly discard unused block groups, we need to
>>> filter out the superblocks from the discard range.  Superblocks
>>> are located at fixed locations on each device, so it makes sense
>>> to filter them out in btrfs_issue_discard, which is used by both
>>> -odiscard and FITRIM.
>>>
>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> ---
>>> fs/btrfs/extent-tree.c | 50
>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file
>>> changed, 44 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>> index 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++
>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int
>>> remove_extent_backref(struct btrfs_trans_handle *trans, return
>>> ret; }
>>>
>>> -static int btrfs_issue_discard(struct block_device *bdev, -
>>> u64 start, u64 len) +#define in_range(b, first, len)        ((b)
>>> >= (first) && (b) < (first) + (len))
>>
>> Hi Jeff,
>>
>> So this will work if every caller behaves well and passes a region
>> whose start and end offsets are a multiple of the sector size
>> (4096) which currently matches the superblock size.
>>
>> However, I think it would be safer to check for the case where the
>> start offset of a superblock mirror is < (first) and (sb_offset +
>> sb_len) > (first).  Just to deal with cases where for example the
>> 2nd half of the sb starts at offset (first).
>>
>> I guess this sectorsize becoming less than 4096 will happen sooner
>> or later with the subpage sectorsize patch set, so it wouldn't hurt
>> to make it more bullet proof already.
>
> Is that something anyone intends to support?  While I suppose the
> subpage sector patch /could/ be used to allow file systems with a node
> size under 4k, the intention is the other way around -- systems that
> have higher order page sizes currently don't work with btrfs file
> system created on systems with smaller order page sizes like x86.
> Btrfs already has high enough metadata overhead.  Pretty much all new
> hardware has, at least, a native 4k sector size even if it's
> abstracted behind a RMW layer.  The sectors are only going to get
> larger.  With the metadata overhead that btrfs already incurs, I can't
> imagine any production use case with smaller sector sizes.
>
> Are we looking to support <4k nodes to test the subpage sector code on
> x86?  If so, then I'll change this to handle the possibility of
> superblocks crossing sector boundaries.  Otherwise, it's protecting
> against a use case that just shouldn't happen.

I understand your point.
I'm probably being too paranoid. But it's exactly because it's not
supposed to happen that at least an assertion or something should be
added imho. A lot of "not supposed not happen things" happen often,
and that's often how people lose data, and get into other bad issues.

And I think I've heard once of supporting <4k nodes (sectorsizes) for
testing at least on x86 for e.g, but I might have not understood it
correctly. Having such a check would help detect bugs during
development where some caller passes a wrong range to discard - better
to find it during development/RCs rather than in production.

But anyway, just a personal preference.

thanks

>
>> Otherwise it looks good to me. I'll give a test on this patchset
>> soon.
>
> Thanks,
>
> - -Jeff
>
>
> - --
> Jeff Mahoney
> SUSE Labs
> -----BEGIN PGP SIGNATURE-----
> Version: GnuPG/MacGPG2 v2.0.19 (Darwin)
>
> iQIcBAEBAgAGBQJVedCiAAoJEB57S2MheeWymA4P/2Y+YGwAkcbYxrlMVH/wIGpN
> j+qO/y1YHNJQNiRHt/1ahM8wy2DEEbqrD36xUuX5lDYRREo9jeqIGGajOOHg/KFw
> 7q6zIWVEEwom/RKd9CX48TC2pHly5Fw8ESyi+A857KJ1ZHcpKdyNcIwle/Jsoe0q
> a+SX6hJPlHFVai/QZhBRO00ZgXlTwjXAGyfgmfuHERY6lS5PBmoA8tYxnigpnBOa
> PUrgw+Cdn4glrZUTpt3WN4H5oE+pD6cGMQ+GnFXQYaytssyQNuPpCWdQ1Aferg7u
> Af3E6iBj776bQIRTWZSwjXTMLWHVjnBmdU8D+wXE7Ar3oU1POL7NLvnGm4Yr9TWZ
> n1NXcBhJ4QQQXBprK3bI+WNSzMzMLdvJHIb5t2Z9BO8wd5Ws0QlmT8Gl+u/ZofZU
> 8eouhLgu1hZzPeJgXPuDu0S8QaFtpI0zlupOwJByHp9QSzpUw98xqlFXIRtnLc48
> n/ZMFi98EMroaQx0hzFLGMwp/57tUFWUUroDkfP1NngoE482ohPHAdREtr7+RZe6
> h+pfF+//5CycYxk8BciBmCyLWWTW1WdDL/MzQsXdKE+797cNk79+W5EKYLmQiAbW
> IGyRXMj+XPpWNC1VM12JJXdoOujSGocmJa28jKanZRzqw0HKJOlTUBqNoBV83Bph
> ChcTZyhAi3PCsib+mNnW
> =wb8l
> -----END PGP SIGNATURE-----
Jeff Mahoney June 11, 2015, 7:15 p.m. UTC | #5
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 6/11/15 2:44 PM, Filipe David Manana wrote:
> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com>
> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote:
>>>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> wrote:
>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>> 
>>>>> Btrfs doesn't track superblocks with extent records so
>>>>> there is nothing persistent on-disk to indicate that those
>>>>> blocks are in use.  We track the superblocks in memory to
>>>>> ensure they don't get used by removing them from the free
>>>>> space cache when we load a block group from disk.  Prior to
>>>>> 47ab2a6c6a (Btrfs: remove empty block groups
>>>>> automatically), that was fine since the block group would
>>>>> never be reclaimed so the superblock was always safe.
>>>>> Once we started removing the empty block groups, we were
>>>>> protected by the fact that discards weren't being properly
>>>>> issued for unused space either via FITRIM or -odiscard.
>>>>> The block groups were still being released, but the blocks
>>>>> remained on disk.
>>>>> 
>>>>> In order to properly discard unused block groups, we need
>>>>> to filter out the superblocks from the discard range.
>>>>> Superblocks are located at fixed locations on each device,
>>>>> so it makes sense to filter them out in
>>>>> btrfs_issue_discard, which is used by both -odiscard and
>>>>> FITRIM.
>>>>> 
>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>>>>> fs/btrfs/extent-tree.c | 50 
>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file 
>>>>> changed, 44 insertions(+), 6 deletions(-)
>>>>> 
>>>>> diff --git a/fs/btrfs/extent-tree.c
>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 ---
>>>>> a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@
>>>>> -1884,10 +1884,47 @@ static int 
>>>>> remove_extent_backref(struct btrfs_trans_handle *trans,
>>>>> return ret; }
>>>>> 
>>>>> -static int btrfs_issue_discard(struct block_device *bdev,
>>>>> - u64 start, u64 len) +#define in_range(b, first, len)
>>>>> ((b)
>>>>>> = (first) && (b) < (first) + (len))
>>>> 
>>>> Hi Jeff,
>>>> 
>>>> So this will work if every caller behaves well and passes a
>>>> region whose start and end offsets are a multiple of the
>>>> sector size (4096) which currently matches the superblock
>>>> size.
>>>> 
>>>> However, I think it would be safer to check for the case
>>>> where the start offset of a superblock mirror is < (first)
>>>> and (sb_offset + sb_len) > (first).  Just to deal with cases
>>>> where for example the 2nd half of the sb starts at offset
>>>> (first).
>>>> 
>>>> I guess this sectorsize becoming less than 4096 will happen
>>>> sooner or later with the subpage sectorsize patch set, so it
>>>> wouldn't hurt to make it more bullet proof already.
> 
> Is that something anyone intends to support?  While I suppose the 
> subpage sector patch /could/ be used to allow file systems with a
> node size under 4k, the intention is the other way around --
> systems that have higher order page sizes currently don't work with
> btrfs file system created on systems with smaller order page sizes
> like x86. Btrfs already has high enough metadata overhead.  Pretty
> much all new hardware has, at least, a native 4k sector size even
> if it's abstracted behind a RMW layer.  The sectors are only going
> to get larger.  With the metadata overhead that btrfs already
> incurs, I can't imagine any production use case with smaller sector
> sizes.
> 
> Are we looking to support <4k nodes to test the subpage sector code
> on x86?  If so, then I'll change this to handle the possibility of 
> superblocks crossing sector boundaries.  Otherwise, it's
> protecting against a use case that just shouldn't happen.
> 
>> I understand your point. I'm probably being too paranoid. But
>> it's exactly because it's not supposed to happen that at least an
>> assertion or something should be added imho. A lot of "not
>> supposed not happen things" happen often, and that's often how
>> people lose data, and get into other bad issues.
> 
>> And I think I've heard once of supporting <4k nodes (sectorsizes)
>> for testing at least on x86 for e.g, but I might have not
>> understood it correctly. Having such a check would help detect
>> bugs during development where some caller passes a wrong range to
>> discard - better to find it during development/RCs rather than in
>> production.

Yeah, you're right. I'll make it more bulletproof. I'm just looking to
be done with this particular mess. :)

- -Jeff

- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVed5SAAoJEB57S2MheeWypa4QAJIrhdZNg3d1I972dkkBBFjk
723lHLXWdHkJifQKYa7b91hwtWplKxZlTuEoc7/AwacoemR7m6V3JMApykHDgHrV
mmHrUvcJVUFjhbn1vXkeMhbz8COLbOUIUF6/hrwzkg00SSDwtjmv5IJQEIXikKEo
j/iI2IyyNsA2JgCa5R/2JvWRssWUF8VhupLw/T6WOfjTwhEEDT/DqZIuD/avGrwk
cxo8NqUkUNrFA7JxhO9TS/Mz9IIxiLu75XQiFzHF3IkLGUGAT4R1JyRMSZhFNjgc
MPS9d9V1ZUZ5swKSRWR07LDtBntiRDYS7atChLsWrOuJs3lsyS3nC4LFkbcqkFdX
pLFOsftfe/FX3OGz5Jry9LyKjr0m7hjveZQ2hnEHB1D/wtpEhRw8WQc8cpL6/Fyr
11g67vsdMbb+KeWsOSLyNk47oRuqULSoqRWRBqeyjfXVRPUBAHME31mZ3QN8+wJz
7Ua6xYoKV5RT8A3w2ceVEhpwqY/DuSGKJ/frteFyLiy12myZz4ZutKLIxwvww4uT
sP3w3NnXFtf3UdA1D0hgm5PQuDk5vBtrFJ1f78nkMXHoKgbIfoSF/LE3Zxb7SFw7
1GscISYtTHumXQdiv6bPw0rvNt1NDj0vKwJG6aRf9IDukT+05a37e33SLBGHGRbv
tXxOaVLT67hJK7jQIWAM
=yNKP
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Mason June 11, 2015, 7:24 p.m. UTC | #6
On 06/11/2015 03:15 PM, Jeff Mahoney wrote:
> On 6/11/15 2:44 PM, Filipe David Manana wrote:
>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com>
>> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote:
>>>>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> wrote:
>>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>>>
>>>>>> Btrfs doesn't track superblocks with extent records so
>>>>>> there is nothing persistent on-disk to indicate that those
>>>>>> blocks are in use.  We track the superblocks in memory to
>>>>>> ensure they don't get used by removing them from the free
>>>>>> space cache when we load a block group from disk.  Prior to
>>>>>> 47ab2a6c6a (Btrfs: remove empty block groups
>>>>>> automatically), that was fine since the block group would
>>>>>> never be reclaimed so the superblock was always safe.
>>>>>> Once we started removing the empty block groups, we were
>>>>>> protected by the fact that discards weren't being properly
>>>>>> issued for unused space either via FITRIM or -odiscard.
>>>>>> The block groups were still being released, but the blocks
>>>>>> remained on disk.
>>>>>>
>>>>>> In order to properly discard unused block groups, we need
>>>>>> to filter out the superblocks from the discard range.
>>>>>> Superblocks are located at fixed locations on each device,
>>>>>> so it makes sense to filter them out in
>>>>>> btrfs_issue_discard, which is used by both -odiscard and
>>>>>> FITRIM.
>>>>>>
>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>>>>>> fs/btrfs/extent-tree.c | 50 
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file 
>>>>>> changed, 44 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/btrfs/extent-tree.c
>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 ---
>>>>>> a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@
>>>>>> -1884,10 +1884,47 @@ static int 
>>>>>> remove_extent_backref(struct btrfs_trans_handle *trans,
>>>>>> return ret; }
>>>>>>
>>>>>> -static int btrfs_issue_discard(struct block_device *bdev,
>>>>>> - u64 start, u64 len) +#define in_range(b, first, len)
>>>>>> ((b)
>>>>>>> = (first) && (b) < (first) + (len))
>>>>>
>>>>> Hi Jeff,
>>>>>
>>>>> So this will work if every caller behaves well and passes a
>>>>> region whose start and end offsets are a multiple of the
>>>>> sector size (4096) which currently matches the superblock
>>>>> size.
>>>>>
>>>>> However, I think it would be safer to check for the case
>>>>> where the start offset of a superblock mirror is < (first)
>>>>> and (sb_offset + sb_len) > (first).  Just to deal with cases
>>>>> where for example the 2nd half of the sb starts at offset
>>>>> (first).
>>>>>
>>>>> I guess this sectorsize becoming less than 4096 will happen
>>>>> sooner or later with the subpage sectorsize patch set, so it
>>>>> wouldn't hurt to make it more bullet proof already.
> 
>> Is that something anyone intends to support?  While I suppose the 
>> subpage sector patch /could/ be used to allow file systems with a
>> node size under 4k, the intention is the other way around --
>> systems that have higher order page sizes currently don't work with
>> btrfs file system created on systems with smaller order page sizes
>> like x86.

The best use of smaller node sizes is just to test the subpagesize
patches on more common hardware.  I wouldn't expect anyone to use a 1K
node size in production.

Thanks for doing these Jeff.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney June 11, 2015, 7:27 p.m. UTC | #7
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 6/11/15 3:24 PM, Chris Mason wrote:
> On 06/11/2015 03:15 PM, Jeff Mahoney wrote:
>> On 6/11/15 2:44 PM, Filipe David Manana wrote:
>>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> 
>>> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote:
>>>>>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com>
>>>>>> wrote:
>>>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>>>> 
>>>>>>> Btrfs doesn't track superblocks with extent records so 
>>>>>>> there is nothing persistent on-disk to indicate that
>>>>>>> those blocks are in use.  We track the superblocks in
>>>>>>> memory to ensure they don't get used by removing them
>>>>>>> from the free space cache when we load a block group
>>>>>>> from disk.  Prior to 47ab2a6c6a (Btrfs: remove empty
>>>>>>> block groups automatically), that was fine since the
>>>>>>> block group would never be reclaimed so the superblock
>>>>>>> was always safe. Once we started removing the empty
>>>>>>> block groups, we were protected by the fact that
>>>>>>> discards weren't being properly issued for unused space
>>>>>>> either via FITRIM or -odiscard. The block groups were
>>>>>>> still being released, but the blocks remained on disk.
>>>>>>> 
>>>>>>> In order to properly discard unused block groups, we
>>>>>>> need to filter out the superblocks from the discard
>>>>>>> range. Superblocks are located at fixed locations on
>>>>>>> each device, so it makes sense to filter them out in 
>>>>>>> btrfs_issue_discard, which is used by both -odiscard
>>>>>>> and FITRIM.
>>>>>>> 
>>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>>>>>>> fs/btrfs/extent-tree.c | 50 
>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1
>>>>>>> file changed, 44 insertions(+), 6 deletions(-)
>>>>>>> 
>>>>>>> diff --git a/fs/btrfs/extent-tree.c 
>>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644
>>>>>>> --- a/fs/btrfs/extent-tree.c +++
>>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static
>>>>>>> int remove_extent_backref(struct btrfs_trans_handle
>>>>>>> *trans, return ret; }
>>>>>>> 
>>>>>>> -static int btrfs_issue_discard(struct block_device
>>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b,
>>>>>>> first, len) ((b)
>>>>>>>> = (first) && (b) < (first) + (len))
>>>>>> 
>>>>>> Hi Jeff,
>>>>>> 
>>>>>> So this will work if every caller behaves well and passes
>>>>>> a region whose start and end offsets are a multiple of
>>>>>> the sector size (4096) which currently matches the
>>>>>> superblock size.
>>>>>> 
>>>>>> However, I think it would be safer to check for the case 
>>>>>> where the start offset of a superblock mirror is <
>>>>>> (first) and (sb_offset + sb_len) > (first).  Just to deal
>>>>>> with cases where for example the 2nd half of the sb
>>>>>> starts at offset (first).
>>>>>> 
>>>>>> I guess this sectorsize becoming less than 4096 will
>>>>>> happen sooner or later with the subpage sectorsize patch
>>>>>> set, so it wouldn't hurt to make it more bullet proof
>>>>>> already.
>> 
>>> Is that something anyone intends to support?  While I suppose
>>> the subpage sector patch /could/ be used to allow file systems
>>> with a node size under 4k, the intention is the other way
>>> around -- systems that have higher order page sizes currently
>>> don't work with btrfs file system created on systems with
>>> smaller order page sizes like x86.
> 
> The best use of smaller node sizes is just to test the subpagesize 
> patches on more common hardware.  I wouldn't expect anyone to use a
> 1K node size in production.

Any chance we can enforce that?  Like with a compile-time option? :)

- -Jeff

- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVeeE5AAoJEB57S2MheeWyGh8P/RvSEQxL0M7Od6YltKz3yg0H
b/pi1ZukSTNFzQQiK/tt2rgPd0XH8p139SmJ9bWiHMPD14KDN0OtsLlAIjurLUVd
Pf9jBBnaV+D8i8GVMfVBrvMqqK5xZ/6LGEFMgyqzET1WkhYZ1D7XQ5mSPKDNTqnk
YwHrbeJ7+wN0zaaqL7I2ed06Yt5e5GczpIVRXxWKsNvYsQFie4rSdnG/QUDjAReI
W4cynK6NRxQcjc0avqHRGdxFn7woyGe47tPdzr+eERE7yXr+hqrrXyzCXqWu8Nm0
f1mYX/RNRggPg6MmmjIxKsC/ySfs8p6CR6t6DejevtdQRw2uEFciqzx63y8/H3kX
L+wN/ITEOGfeCs5ndwlfL204FhZ73brY7dNIXd97yqHJhGBxadJlsoc7eI9J/fux
jNJy/wHF2/Epdfk9nBNJiRFUuL0eioYBkoE0pEdBkdoGtPAodQ+TIn+rEkxKC//n
ZovStjTJYUVuDfAi9Wpraxd6oaGmqWQU+eYxxLQBXc+ADk+lh15wmWFD9tYAw5uQ
kOwoz2PIxKCOVFBR3ixoPCy9nJsHRQX69L5xV76VqTDnGrBEDeGKHSy4Dhi8aTCN
pSJkF2STAgxxh/CHUT0FmRsIcUiLUp26pdewIjEgXQ8S6cok2/a/+HpOJRSfD0M3
bEPZLzdZREM2FhyrBcCY
=qoEa
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Mason June 11, 2015, 7:35 p.m. UTC | #8
On 06/11/2015 03:27 PM, Jeff Mahoney wrote:
> On 6/11/15 3:24 PM, Chris Mason wrote:
>> On 06/11/2015 03:15 PM, Jeff Mahoney wrote:
>>> On 6/11/15 2:44 PM, Filipe David Manana wrote:
>>>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> 
>>>> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote:
>>>>>>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com>
>>>>>>> wrote:
>>>>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>>>>>
>>>>>>>> Btrfs doesn't track superblocks with extent records so 
>>>>>>>> there is nothing persistent on-disk to indicate that
>>>>>>>> those blocks are in use.  We track the superblocks in
>>>>>>>> memory to ensure they don't get used by removing them
>>>>>>>> from the free space cache when we load a block group
>>>>>>>> from disk.  Prior to 47ab2a6c6a (Btrfs: remove empty
>>>>>>>> block groups automatically), that was fine since the
>>>>>>>> block group would never be reclaimed so the superblock
>>>>>>>> was always safe. Once we started removing the empty
>>>>>>>> block groups, we were protected by the fact that
>>>>>>>> discards weren't being properly issued for unused space
>>>>>>>> either via FITRIM or -odiscard. The block groups were
>>>>>>>> still being released, but the blocks remained on disk.
>>>>>>>>
>>>>>>>> In order to properly discard unused block groups, we
>>>>>>>> need to filter out the superblocks from the discard
>>>>>>>> range. Superblocks are located at fixed locations on
>>>>>>>> each device, so it makes sense to filter them out in 
>>>>>>>> btrfs_issue_discard, which is used by both -odiscard
>>>>>>>> and FITRIM.
>>>>>>>>
>>>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>>>>>>>> fs/btrfs/extent-tree.c | 50 
>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1
>>>>>>>> file changed, 44 insertions(+), 6 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/btrfs/extent-tree.c 
>>>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644
>>>>>>>> --- a/fs/btrfs/extent-tree.c +++
>>>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static
>>>>>>>> int remove_extent_backref(struct btrfs_trans_handle
>>>>>>>> *trans, return ret; }
>>>>>>>>
>>>>>>>> -static int btrfs_issue_discard(struct block_device
>>>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b,
>>>>>>>> first, len) ((b)
>>>>>>>>> = (first) && (b) < (first) + (len))
>>>>>>>
>>>>>>> Hi Jeff,
>>>>>>>
>>>>>>> So this will work if every caller behaves well and passes
>>>>>>> a region whose start and end offsets are a multiple of
>>>>>>> the sector size (4096) which currently matches the
>>>>>>> superblock size.
>>>>>>>
>>>>>>> However, I think it would be safer to check for the case 
>>>>>>> where the start offset of a superblock mirror is <
>>>>>>> (first) and (sb_offset + sb_len) > (first).  Just to deal
>>>>>>> with cases where for example the 2nd half of the sb
>>>>>>> starts at offset (first).
>>>>>>>
>>>>>>> I guess this sectorsize becoming less than 4096 will
>>>>>>> happen sooner or later with the subpage sectorsize patch
>>>>>>> set, so it wouldn't hurt to make it more bullet proof
>>>>>>> already.
>>>
>>>> Is that something anyone intends to support?  While I suppose
>>>> the subpage sector patch /could/ be used to allow file systems
>>>> with a node size under 4k, the intention is the other way
>>>> around -- systems that have higher order page sizes currently
>>>> don't work with btrfs file system created on systems with
>>>> smaller order page sizes like x86.
> 
>> The best use of smaller node sizes is just to test the subpagesize 
>> patches on more common hardware.  I wouldn't expect anyone to use a
>> 1K node size in production.
> 
> Any chance we can enforce that?  Like with a compile-time option? :)

We can make mkfs.btrfs advise strongly against it ;)

But, since I wasn't horribly clear, I'd love one extra if statement in
the discard function.  Silently eating bytes is horribly hard to track down.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney June 11, 2015, 7:46 p.m. UTC | #9
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 6/11/15 3:35 PM, Chris Mason wrote:
> On 06/11/2015 03:27 PM, Jeff Mahoney wrote:
>> On 6/11/15 3:24 PM, Chris Mason wrote:
>>> On 06/11/2015 03:15 PM, Jeff Mahoney wrote:
>>>> On 6/11/15 2:44 PM, Filipe David Manana wrote:
>>>>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney 
>>>>> <jeffm@suse.com> wrote: On 6/11/15 12:47 PM, Filipe David 
>>>>> Manana wrote:
>>>>>>>> On Thu, Jun 11, 2015 at 4:20 PM,  <jeffm@suse.com> 
>>>>>>>> wrote:
>>>>>>>>> From: Jeff Mahoney <jeffm@suse.com>
>>>>>>>>> 
>>>>>>>>> Btrfs doesn't track superblocks with extent
>>>>>>>>> records so there is nothing persistent on-disk to
>>>>>>>>> indicate that those blocks are in use.  We track
>>>>>>>>> the superblocks in memory to ensure they don't get
>>>>>>>>> used by removing them from the free space cache
>>>>>>>>> when we load a block group from disk.  Prior to
>>>>>>>>> 47ab2a6c6a (Btrfs: remove empty block groups
>>>>>>>>> automatically), that was fine since the block group
>>>>>>>>> would never be reclaimed so the superblock was
>>>>>>>>> always safe. Once we started removing the empty
>>>>>>>>> block groups, we were protected by the fact that
>>>>>>>>> discards weren't being properly issued for unused
>>>>>>>>> space either via FITRIM or -odiscard. The block
>>>>>>>>> groups were still being released, but the blocks
>>>>>>>>> remained on disk.
>>>>>>>>> 
>>>>>>>>> In order to properly discard unused block groups, 
>>>>>>>>> we need to filter out the superblocks from the 
>>>>>>>>> discard range. Superblocks are located at fixed 
>>>>>>>>> locations on each device, so it makes sense to 
>>>>>>>>> filter them out in btrfs_issue_discard, which is 
>>>>>>>>> used by both -odiscard and FITRIM.
>>>>>>>>> 
>>>>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- 
>>>>>>>>> fs/btrfs/extent-tree.c | 50 
>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 
>>>>>>>>> 1 file changed, 44 insertions(+), 6 deletions(-)
>>>>>>>>> 
>>>>>>>>> diff --git a/fs/btrfs/extent-tree.c 
>>>>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 
>>>>>>>>> 100644 --- a/fs/btrfs/extent-tree.c +++ 
>>>>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ 
>>>>>>>>> static int remove_extent_backref(struct 
>>>>>>>>> btrfs_trans_handle *trans, return ret; }
>>>>>>>>> 
>>>>>>>>> -static int btrfs_issue_discard(struct block_device
>>>>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b,
>>>>>>>>> first, len) ((b)
>>>>>>>>>> = (first) && (b) < (first) + (len))
>>>>>>>> 
>>>>>>>> Hi Jeff,
>>>>>>>> 
>>>>>>>> So this will work if every caller behaves well and 
>>>>>>>> passes a region whose start and end offsets are a 
>>>>>>>> multiple of the sector size (4096) which currently 
>>>>>>>> matches the superblock size.
>>>>>>>> 
>>>>>>>> However, I think it would be safer to check for the 
>>>>>>>> case where the start offset of a superblock mirror
>>>>>>>> is < (first) and (sb_offset + sb_len) > (first).
>>>>>>>> Just to deal with cases where for example the 2nd
>>>>>>>> half of the sb starts at offset (first).
>>>>>>>> 
>>>>>>>> I guess this sectorsize becoming less than 4096 will
>>>>>>>>  happen sooner or later with the subpage sectorsize 
>>>>>>>> patch set, so it wouldn't hurt to make it more
>>>>>>>> bullet proof already.
>>>> 
>>>>> Is that something anyone intends to support?  While I 
>>>>> suppose the subpage sector patch /could/ be used to allow 
>>>>> file systems with a node size under 4k, the intention is 
>>>>> the other way around -- systems that have higher order
>>>>> page sizes currently don't work with btrfs file system
>>>>> created on systems with smaller order page sizes like x86.
>> 
>>> The best use of smaller node sizes is just to test the 
>>> subpagesize patches on more common hardware.  I wouldn't
>>> expect anyone to use a 1K node size in production.
>> 
>> Any chance we can enforce that?  Like with a compile-time
>> option? :)
> 
> We can make mkfs.btrfs advise strongly against it ;)
> 
> But, since I wasn't horribly clear, I'd love one extra if
> statement in the discard function.  Silently eating bytes is
> horribly hard to track down.

Heh, yeah.  I'm making it bulletproof now.  If the goal is to also
catch potential misbehavior, I'm catching some other cases as well.  A
few extra conditionals will still take a small percentage of the time
a discard takes.

- -Jeff

- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVeeWDAAoJEB57S2MheeWygMEQAMPCNf8ZIMfYRDkzbpW0mezB
6Vbu7PM5WNAqOU2XdJXq47Z+jvLzsbBG0Z1hDLdavkQiOfjOQeBDvwVQQwFPizJ9
lRA4HB6P0nMKVl4x4PcXzgRinrIIy46nFY7VFZBe/cO0aEq7bsB3/vjlRj4LKvsp
eeMg212Sc4V6yuVbSfLSgYTtMGcAsmE9rUWl+2+kV6aTGqZr72YG1033YVu9Y+0F
vnelEIKFSmYF1y7FqO8Ejpk7G6fOoKYXGIxjcyC5v6kAKygZkxuUFYt9wPgpxl4X
eTYnPwjRwE3qRHlZtCGmb0SKvIkFMeKaI5Dy8KXUSHu6Q4NZ8q+kftgzNTGHcEzD
EgGrsbMa3N6necDYsmKYrIWVq21Nj2vSZc7YmLDKYtVQJRH2ScPOvHQlosEX8JsA
h4DfSp8fLVWu8hAORrUvByrGfw7DkFOlv1bF4B76MokP7sb4ITnpBUJtW+0Uiw3x
n1OJ94RiFOXpxWvEYquZUnK/9k1cg/eCwDpaFTCSDrTOVfW78lnoso1VKhQ1CJLg
Ed3I77RA0jPE004hpwtLdGE2AMiOZfAMKTAPkErnnWMfcrBh9O8DUBWVXds3IBSg
mv6lKPz24P28ymOINkqFC22D1vyXBH4Xiel0ZuPHHjnrxPUwovrF//XRbwcc7lCf
jzsGyTnEnAf00/R8s7sP
=v4r5
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd..75d0226 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1884,10 +1884,47 @@  static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-				u64 start, u64 len)
+#define in_range(b, first, len)	((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+			       u64 *discarded_bytes)
 {
-	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+	u64 skipped = 0;
+	u64 bytes_left = len;
+	int ret = 0 , j;
+
+	*discarded_bytes = 0;
+
+	/* Skip any superblocks on this device. */
+	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+		u64 sb_offset = btrfs_sb_offset(j);
+		u64 size = sb_offset - start;
+
+		if (!in_range(sb_offset, start, bytes_left))
+			continue;
+
+		if (size) {
+			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+						   GFP_NOFS, 0);
+			if (!ret)
+				*discarded_bytes += size;
+			else if (ret != -EOPNOTSUPP)
+				return ret;
+		}
+
+		start = sb_offset + BTRFS_SUPER_INFO_SIZE;
+		if (start > len)
+			start = len;
+		bytes_left = len - start;
+		skipped += BTRFS_SUPER_INFO_SIZE;
+	}
+
+	if (bytes_left) {
+		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+					   GFP_NOFS, 0);
+		if (!ret)
+			*discarded_bytes += bytes_left;
+	}
+	return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1906,16 +1943,17 @@  int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		struct btrfs_bio_stripe *stripe = bbio->stripes;
 		int i;
 
-
 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+			u64 bytes;
+
 			if (!stripe->dev->can_discard)
 				continue;
 
 			ret = btrfs_issue_discard(stripe->dev->bdev,
 						  stripe->physical,
-						  stripe->length);
+						  stripe->length, &bytes);
 			if (!ret)
-				discarded_bytes += stripe->length;
+				discarded_bytes += bytes;
 			else if (ret != -EOPNOTSUPP)
 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */