Message ID | 1434036062-21597-2-git-send-email-jeffm@suse.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 6/11/15 11:20 AM, jeffm@suse.com wrote: > From: Jeff Mahoney <jeffm@suse.com> > > Btrfs doesn't track superblocks with extent records so there is > nothing persistent on-disk to indicate that those blocks are in > use. We track the superblocks in memory to ensure they don't get > used by removing them from the free space cache when we load a > block group from disk. Prior to 47ab2a6c6a (Btrfs: remove empty > block groups automatically), that was fine since the block group > would never be reclaimed so the superblock was always safe. Once > we started removing the empty block groups, we were protected by > the fact that discards weren't being properly issued for unused > space either via FITRIM or -odiscard. The block groups were still > being released, but the blocks remained on disk. > > In order to properly discard unused block groups, we need to filter > out the superblocks from the discard range. Superblocks are > located at fixed locations on each device, so it makes sense to > filter them out in btrfs_issue_discard, which is used by both > -odiscard and FITRIM. > > Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- > fs/btrfs/extent-tree.c | 50 > ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, > 44 insertions(+), 6 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index > 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++ > b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int > remove_extent_backref(struct btrfs_trans_handle *trans, return > ret; } > > -static int btrfs_issue_discard(struct block_device *bdev, - u64 > start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && > (b) < (first) + (len)) +static int btrfs_issue_discard(struct > block_device *bdev, u64 start, u64 len, + u64 > *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, > len >> 9, GFP_NOFS, 0); + u64 skipped = 0; + u64 bytes_left = len; > + int ret = 0 , j; + + *discarded_bytes = 0; + + /* Skip any > superblocks on this device. */ + for (j = 0; j < > BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_offset = > btrfs_sb_offset(j); + u64 size = sb_offset - start; + + if > (!in_range(sb_offset, start, bytes_left)) + continue; + + if > (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> > 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += > size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + > start = sb_offset + BTRFS_SUPER_INFO_SIZE; + if (start > len) + > start = len; + bytes_left = len - start; + skipped += > BTRFS_SUPER_INFO_SIZE; Whoops. This is an unused artifact from an earlier version. - -Jeff > + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start > >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + > *discarded_bytes += bytes_left; + } + return ret; } > > int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ > -1906,16 +1943,17 @@ int btrfs_discard_extent(struct btrfs_root > *root, u64 bytenr, struct btrfs_bio_stripe *stripe = > bbio->stripes; int i; > > - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 > bytes; + if (!stripe->dev->can_discard) continue; > > ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - > stripe->length); + stripe->length, &bytes); if (!ret) - > discarded_bytes += stripe->length; + discarded_bytes += bytes; > else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or > -EIO but I don't know how that could happen JDM */ > > - -- Jeff Mahoney SUSE Labs -----BEGIN PGP SIGNATURE----- Version: GnuPG/MacGPG2 v2.0.19 (Darwin) iQIcBAEBAgAGBQJVeahbAAoJEB57S2MheeWyA3YP/00sK18TqJi6ohu0l18otsd2 vG2lAtPUDT6f4dFM1GvO4PYLssuwm0Vy98qFYO30lvcHbdbajhsGd1qk+IzEq3Dr +Qxze9lIrX8267nRE3aqu1I8/y2Gn5Wcy5Xr6fwhN0g+m5J+fWZHy1mfTvqbNt+M U7Cg4hgvTFDcuV5adrg3JoQ3W/w6IvgIid27oNZ0MJMaK4f2jcLfmhGVCTcCnydp 7fEUsMow2firGJew7161ORYkYjTn76JW2HXB8ETOgWUcMpZon1KWduVXOTD0r9sk UrfMJ8dLw6334T265gmQrBXvb3yOBLbyRYuksUFhXFUo1/xDSQMLGMBrkEf6T/Vo mwnXVw5hnPM41vZiJ4Pc56vg1Yy4JGMXs28XEyHrj7gQWo3b1jRw0YhosZE/TR5+ BsJb4KqBTPor/upPLmDwIOYU4Ia+GZ5V2k1jYsE9qs6khHXlX9is8WHeB/fMzfCe zkZEl8YI3ek50wXcqBk+QHFj8LT89Eyq59O3iaCFceIhvjO87ldRDBtTtBR34ogf mZphx9UzVTcY31wftDYKMMi2Xs+8BZgXq7H7IXc2muugFlK9AGsMv4iXPbVkmUmE T4pS2vKqCwfkW9bGf7ufQaOpVgwzQjDkC/Ngtybl8sULroOjI3lm6p+2/jMuqyYd twV2FaYAJAI/tiQkTken =1g/b -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> wrote: > From: Jeff Mahoney <jeffm@suse.com> > > Btrfs doesn't track superblocks with extent records so there is nothing > persistent on-disk to indicate that those blocks are in use. We track > the superblocks in memory to ensure they don't get used by removing them > from the free space cache when we load a block group from disk. Prior > to 47ab2a6c6a (Btrfs: remove empty block groups automatically), that > was fine since the block group would never be reclaimed so the superblock > was always safe. Once we started removing the empty block groups, we > were protected by the fact that discards weren't being properly issued > for unused space either via FITRIM or -odiscard. The block groups were > still being released, but the blocks remained on disk. > > In order to properly discard unused block groups, we need to filter out > the superblocks from the discard range. Superblocks are located at fixed > locations on each device, so it makes sense to filter them out in > btrfs_issue_discard, which is used by both -odiscard and FITRIM. > > Signed-off-by: Jeff Mahoney <jeffm@suse.com> > --- > fs/btrfs/extent-tree.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 44 insertions(+), 6 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 0ec3acd..75d0226 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -1884,10 +1884,47 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, > return ret; > } > > -static int btrfs_issue_discard(struct block_device *bdev, > - u64 start, u64 len) > +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) Hi Jeff, So this will work if every caller behaves well and passes a region whose start and end offsets are a multiple of the sector size (4096) which currently matches the superblock size. However, I think it would be safer to check for the case where the start offset of a superblock mirror is < (first) and (sb_offset + sb_len) > (first). Just to deal with cases where for example the 2nd half of the sb starts at offset (first). I guess this sectorsize becoming less than 4096 will happen sooner or later with the subpage sectorsize patch set, so it wouldn't hurt to make it more bullet proof already. Otherwise it looks good to me. I'll give a test on this patchset soon. thanks > +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, > + u64 *discarded_bytes) > { > - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); > + u64 skipped = 0; > + u64 bytes_left = len; > + int ret = 0 , j; > + > + *discarded_bytes = 0; > + > + /* Skip any superblocks on this device. */ > + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { > + u64 sb_offset = btrfs_sb_offset(j); > + u64 size = sb_offset - start; > + > + if (!in_range(sb_offset, start, bytes_left)) > + continue; > + > + if (size) { > + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, > + GFP_NOFS, 0); > + if (!ret) > + *discarded_bytes += size; > + else if (ret != -EOPNOTSUPP) > + return ret; > + } > + > + start = sb_offset + BTRFS_SUPER_INFO_SIZE; > + if (start > len) > + start = len; > + bytes_left = len - start; > + skipped += BTRFS_SUPER_INFO_SIZE; > + } > + > + if (bytes_left) { > + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, > + GFP_NOFS, 0); > + if (!ret) > + *discarded_bytes += bytes_left; > + } > + return ret; > } > > int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, > @@ -1906,16 +1943,17 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, > struct btrfs_bio_stripe *stripe = bbio->stripes; > int i; > > - > for (i = 0; i < bbio->num_stripes; i++, stripe++) { > + u64 bytes; > + > if (!stripe->dev->can_discard) > continue; > > ret = btrfs_issue_discard(stripe->dev->bdev, > stripe->physical, > - stripe->length); > + stripe->length, &bytes); > if (!ret) > - discarded_bytes += stripe->length; > + discarded_bytes += bytes; > else if (ret != -EOPNOTSUPP) > break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ > > -- > 1.8.4.5 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 6/11/15 12:47 PM, Filipe David Manana wrote: > On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> wrote: >> From: Jeff Mahoney <jeffm@suse.com> >> >> Btrfs doesn't track superblocks with extent records so there is >> nothing persistent on-disk to indicate that those blocks are in >> use. We track the superblocks in memory to ensure they don't get >> used by removing them from the free space cache when we load a >> block group from disk. Prior to 47ab2a6c6a (Btrfs: remove empty >> block groups automatically), that was fine since the block group >> would never be reclaimed so the superblock was always safe. Once >> we started removing the empty block groups, we were protected by >> the fact that discards weren't being properly issued for unused >> space either via FITRIM or -odiscard. The block groups were >> still being released, but the blocks remained on disk. >> >> In order to properly discard unused block groups, we need to >> filter out the superblocks from the discard range. Superblocks >> are located at fixed locations on each device, so it makes sense >> to filter them out in btrfs_issue_discard, which is used by both >> -odiscard and FITRIM. >> >> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >> fs/btrfs/extent-tree.c | 50 >> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file >> changed, 44 insertions(+), 6 deletions(-) >> >> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c >> index 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++ >> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int >> remove_extent_backref(struct btrfs_trans_handle *trans, return >> ret; } >> >> -static int btrfs_issue_discard(struct block_device *bdev, - >> u64 start, u64 len) +#define in_range(b, first, len) ((b) >> >= (first) && (b) < (first) + (len)) > > Hi Jeff, > > So this will work if every caller behaves well and passes a region > whose start and end offsets are a multiple of the sector size > (4096) which currently matches the superblock size. > > However, I think it would be safer to check for the case where the > start offset of a superblock mirror is < (first) and (sb_offset + > sb_len) > (first). Just to deal with cases where for example the > 2nd half of the sb starts at offset (first). > > I guess this sectorsize becoming less than 4096 will happen sooner > or later with the subpage sectorsize patch set, so it wouldn't hurt > to make it more bullet proof already. Is that something anyone intends to support? While I suppose the subpage sector patch /could/ be used to allow file systems with a node size under 4k, the intention is the other way around -- systems that have higher order page sizes currently don't work with btrfs file system created on systems with smaller order page sizes like x86. Btrfs already has high enough metadata overhead. Pretty much all new hardware has, at least, a native 4k sector size even if it's abstracted behind a RMW layer. The sectors are only going to get larger. With the metadata overhead that btrfs already incurs, I can't imagine any production use case with smaller sector sizes. Are we looking to support <4k nodes to test the subpage sector code on x86? If so, then I'll change this to handle the possibility of superblocks crossing sector boundaries. Otherwise, it's protecting against a use case that just shouldn't happen. > Otherwise it looks good to me. I'll give a test on this patchset > soon. Thanks, - -Jeff - -- Jeff Mahoney SUSE Labs -----BEGIN PGP SIGNATURE----- Version: GnuPG/MacGPG2 v2.0.19 (Darwin) iQIcBAEBAgAGBQJVedCiAAoJEB57S2MheeWymA4P/2Y+YGwAkcbYxrlMVH/wIGpN j+qO/y1YHNJQNiRHt/1ahM8wy2DEEbqrD36xUuX5lDYRREo9jeqIGGajOOHg/KFw 7q6zIWVEEwom/RKd9CX48TC2pHly5Fw8ESyi+A857KJ1ZHcpKdyNcIwle/Jsoe0q a+SX6hJPlHFVai/QZhBRO00ZgXlTwjXAGyfgmfuHERY6lS5PBmoA8tYxnigpnBOa PUrgw+Cdn4glrZUTpt3WN4H5oE+pD6cGMQ+GnFXQYaytssyQNuPpCWdQ1Aferg7u Af3E6iBj776bQIRTWZSwjXTMLWHVjnBmdU8D+wXE7Ar3oU1POL7NLvnGm4Yr9TWZ n1NXcBhJ4QQQXBprK3bI+WNSzMzMLdvJHIb5t2Z9BO8wd5Ws0QlmT8Gl+u/ZofZU 8eouhLgu1hZzPeJgXPuDu0S8QaFtpI0zlupOwJByHp9QSzpUw98xqlFXIRtnLc48 n/ZMFi98EMroaQx0hzFLGMwp/57tUFWUUroDkfP1NngoE482ohPHAdREtr7+RZe6 h+pfF+//5CycYxk8BciBmCyLWWTW1WdDL/MzQsXdKE+797cNk79+W5EKYLmQiAbW IGyRXMj+XPpWNC1VM12JJXdoOujSGocmJa28jKanZRzqw0HKJOlTUBqNoBV83Bph ChcTZyhAi3PCsib+mNnW =wb8l -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> wrote: > -----BEGIN PGP SIGNED MESSAGE----- > Hash: SHA1 > > On 6/11/15 12:47 PM, Filipe David Manana wrote: >> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> wrote: >>> From: Jeff Mahoney <jeffm@suse.com> >>> >>> Btrfs doesn't track superblocks with extent records so there is >>> nothing persistent on-disk to indicate that those blocks are in >>> use. We track the superblocks in memory to ensure they don't get >>> used by removing them from the free space cache when we load a >>> block group from disk. Prior to 47ab2a6c6a (Btrfs: remove empty >>> block groups automatically), that was fine since the block group >>> would never be reclaimed so the superblock was always safe. Once >>> we started removing the empty block groups, we were protected by >>> the fact that discards weren't being properly issued for unused >>> space either via FITRIM or -odiscard. The block groups were >>> still being released, but the blocks remained on disk. >>> >>> In order to properly discard unused block groups, we need to >>> filter out the superblocks from the discard range. Superblocks >>> are located at fixed locations on each device, so it makes sense >>> to filter them out in btrfs_issue_discard, which is used by both >>> -odiscard and FITRIM. >>> >>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>> fs/btrfs/extent-tree.c | 50 >>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file >>> changed, 44 insertions(+), 6 deletions(-) >>> >>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c >>> index 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++ >>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int >>> remove_extent_backref(struct btrfs_trans_handle *trans, return >>> ret; } >>> >>> -static int btrfs_issue_discard(struct block_device *bdev, - >>> u64 start, u64 len) +#define in_range(b, first, len) ((b) >>> >= (first) && (b) < (first) + (len)) >> >> Hi Jeff, >> >> So this will work if every caller behaves well and passes a region >> whose start and end offsets are a multiple of the sector size >> (4096) which currently matches the superblock size. >> >> However, I think it would be safer to check for the case where the >> start offset of a superblock mirror is < (first) and (sb_offset + >> sb_len) > (first). Just to deal with cases where for example the >> 2nd half of the sb starts at offset (first). >> >> I guess this sectorsize becoming less than 4096 will happen sooner >> or later with the subpage sectorsize patch set, so it wouldn't hurt >> to make it more bullet proof already. > > Is that something anyone intends to support? While I suppose the > subpage sector patch /could/ be used to allow file systems with a node > size under 4k, the intention is the other way around -- systems that > have higher order page sizes currently don't work with btrfs file > system created on systems with smaller order page sizes like x86. > Btrfs already has high enough metadata overhead. Pretty much all new > hardware has, at least, a native 4k sector size even if it's > abstracted behind a RMW layer. The sectors are only going to get > larger. With the metadata overhead that btrfs already incurs, I can't > imagine any production use case with smaller sector sizes. > > Are we looking to support <4k nodes to test the subpage sector code on > x86? If so, then I'll change this to handle the possibility of > superblocks crossing sector boundaries. Otherwise, it's protecting > against a use case that just shouldn't happen. I understand your point. I'm probably being too paranoid. But it's exactly because it's not supposed to happen that at least an assertion or something should be added imho. A lot of "not supposed not happen things" happen often, and that's often how people lose data, and get into other bad issues. And I think I've heard once of supporting <4k nodes (sectorsizes) for testing at least on x86 for e.g, but I might have not understood it correctly. Having such a check would help detect bugs during development where some caller passes a wrong range to discard - better to find it during development/RCs rather than in production. But anyway, just a personal preference. thanks > >> Otherwise it looks good to me. I'll give a test on this patchset >> soon. > > Thanks, > > - -Jeff > > > - -- > Jeff Mahoney > SUSE Labs > -----BEGIN PGP SIGNATURE----- > Version: GnuPG/MacGPG2 v2.0.19 (Darwin) > > iQIcBAEBAgAGBQJVedCiAAoJEB57S2MheeWymA4P/2Y+YGwAkcbYxrlMVH/wIGpN > j+qO/y1YHNJQNiRHt/1ahM8wy2DEEbqrD36xUuX5lDYRREo9jeqIGGajOOHg/KFw > 7q6zIWVEEwom/RKd9CX48TC2pHly5Fw8ESyi+A857KJ1ZHcpKdyNcIwle/Jsoe0q > a+SX6hJPlHFVai/QZhBRO00ZgXlTwjXAGyfgmfuHERY6lS5PBmoA8tYxnigpnBOa > PUrgw+Cdn4glrZUTpt3WN4H5oE+pD6cGMQ+GnFXQYaytssyQNuPpCWdQ1Aferg7u > Af3E6iBj776bQIRTWZSwjXTMLWHVjnBmdU8D+wXE7Ar3oU1POL7NLvnGm4Yr9TWZ > n1NXcBhJ4QQQXBprK3bI+WNSzMzMLdvJHIb5t2Z9BO8wd5Ws0QlmT8Gl+u/ZofZU > 8eouhLgu1hZzPeJgXPuDu0S8QaFtpI0zlupOwJByHp9QSzpUw98xqlFXIRtnLc48 > n/ZMFi98EMroaQx0hzFLGMwp/57tUFWUUroDkfP1NngoE482ohPHAdREtr7+RZe6 > h+pfF+//5CycYxk8BciBmCyLWWTW1WdDL/MzQsXdKE+797cNk79+W5EKYLmQiAbW > IGyRXMj+XPpWNC1VM12JJXdoOujSGocmJa28jKanZRzqw0HKJOlTUBqNoBV83Bph > ChcTZyhAi3PCsib+mNnW > =wb8l > -----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 6/11/15 2:44 PM, Filipe David Manana wrote: > On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> > wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote: >>>> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> wrote: >>>>> From: Jeff Mahoney <jeffm@suse.com> >>>>> >>>>> Btrfs doesn't track superblocks with extent records so >>>>> there is nothing persistent on-disk to indicate that those >>>>> blocks are in use. We track the superblocks in memory to >>>>> ensure they don't get used by removing them from the free >>>>> space cache when we load a block group from disk. Prior to >>>>> 47ab2a6c6a (Btrfs: remove empty block groups >>>>> automatically), that was fine since the block group would >>>>> never be reclaimed so the superblock was always safe. >>>>> Once we started removing the empty block groups, we were >>>>> protected by the fact that discards weren't being properly >>>>> issued for unused space either via FITRIM or -odiscard. >>>>> The block groups were still being released, but the blocks >>>>> remained on disk. >>>>> >>>>> In order to properly discard unused block groups, we need >>>>> to filter out the superblocks from the discard range. >>>>> Superblocks are located at fixed locations on each device, >>>>> so it makes sense to filter them out in >>>>> btrfs_issue_discard, which is used by both -odiscard and >>>>> FITRIM. >>>>> >>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>>>> fs/btrfs/extent-tree.c | 50 >>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file >>>>> changed, 44 insertions(+), 6 deletions(-) >>>>> >>>>> diff --git a/fs/btrfs/extent-tree.c >>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 --- >>>>> a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ >>>>> -1884,10 +1884,47 @@ static int >>>>> remove_extent_backref(struct btrfs_trans_handle *trans, >>>>> return ret; } >>>>> >>>>> -static int btrfs_issue_discard(struct block_device *bdev, >>>>> - u64 start, u64 len) +#define in_range(b, first, len) >>>>> ((b) >>>>>> = (first) && (b) < (first) + (len)) >>>> >>>> Hi Jeff, >>>> >>>> So this will work if every caller behaves well and passes a >>>> region whose start and end offsets are a multiple of the >>>> sector size (4096) which currently matches the superblock >>>> size. >>>> >>>> However, I think it would be safer to check for the case >>>> where the start offset of a superblock mirror is < (first) >>>> and (sb_offset + sb_len) > (first). Just to deal with cases >>>> where for example the 2nd half of the sb starts at offset >>>> (first). >>>> >>>> I guess this sectorsize becoming less than 4096 will happen >>>> sooner or later with the subpage sectorsize patch set, so it >>>> wouldn't hurt to make it more bullet proof already. > > Is that something anyone intends to support? While I suppose the > subpage sector patch /could/ be used to allow file systems with a > node size under 4k, the intention is the other way around -- > systems that have higher order page sizes currently don't work with > btrfs file system created on systems with smaller order page sizes > like x86. Btrfs already has high enough metadata overhead. Pretty > much all new hardware has, at least, a native 4k sector size even > if it's abstracted behind a RMW layer. The sectors are only going > to get larger. With the metadata overhead that btrfs already > incurs, I can't imagine any production use case with smaller sector > sizes. > > Are we looking to support <4k nodes to test the subpage sector code > on x86? If so, then I'll change this to handle the possibility of > superblocks crossing sector boundaries. Otherwise, it's > protecting against a use case that just shouldn't happen. > >> I understand your point. I'm probably being too paranoid. But >> it's exactly because it's not supposed to happen that at least an >> assertion or something should be added imho. A lot of "not >> supposed not happen things" happen often, and that's often how >> people lose data, and get into other bad issues. > >> And I think I've heard once of supporting <4k nodes (sectorsizes) >> for testing at least on x86 for e.g, but I might have not >> understood it correctly. Having such a check would help detect >> bugs during development where some caller passes a wrong range to >> discard - better to find it during development/RCs rather than in >> production. Yeah, you're right. I'll make it more bulletproof. I'm just looking to be done with this particular mess. :) - -Jeff - -- Jeff Mahoney SUSE Labs -----BEGIN PGP SIGNATURE----- Version: GnuPG/MacGPG2 v2.0.19 (Darwin) iQIcBAEBAgAGBQJVed5SAAoJEB57S2MheeWypa4QAJIrhdZNg3d1I972dkkBBFjk 723lHLXWdHkJifQKYa7b91hwtWplKxZlTuEoc7/AwacoemR7m6V3JMApykHDgHrV mmHrUvcJVUFjhbn1vXkeMhbz8COLbOUIUF6/hrwzkg00SSDwtjmv5IJQEIXikKEo j/iI2IyyNsA2JgCa5R/2JvWRssWUF8VhupLw/T6WOfjTwhEEDT/DqZIuD/avGrwk cxo8NqUkUNrFA7JxhO9TS/Mz9IIxiLu75XQiFzHF3IkLGUGAT4R1JyRMSZhFNjgc MPS9d9V1ZUZ5swKSRWR07LDtBntiRDYS7atChLsWrOuJs3lsyS3nC4LFkbcqkFdX pLFOsftfe/FX3OGz5Jry9LyKjr0m7hjveZQ2hnEHB1D/wtpEhRw8WQc8cpL6/Fyr 11g67vsdMbb+KeWsOSLyNk47oRuqULSoqRWRBqeyjfXVRPUBAHME31mZ3QN8+wJz 7Ua6xYoKV5RT8A3w2ceVEhpwqY/DuSGKJ/frteFyLiy12myZz4ZutKLIxwvww4uT sP3w3NnXFtf3UdA1D0hgm5PQuDk5vBtrFJ1f78nkMXHoKgbIfoSF/LE3Zxb7SFw7 1GscISYtTHumXQdiv6bPw0rvNt1NDj0vKwJG6aRf9IDukT+05a37e33SLBGHGRbv tXxOaVLT67hJK7jQIWAM =yNKP -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/11/2015 03:15 PM, Jeff Mahoney wrote: > On 6/11/15 2:44 PM, Filipe David Manana wrote: >> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> >> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote: >>>>> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> wrote: >>>>>> From: Jeff Mahoney <jeffm@suse.com> >>>>>> >>>>>> Btrfs doesn't track superblocks with extent records so >>>>>> there is nothing persistent on-disk to indicate that those >>>>>> blocks are in use. We track the superblocks in memory to >>>>>> ensure they don't get used by removing them from the free >>>>>> space cache when we load a block group from disk. Prior to >>>>>> 47ab2a6c6a (Btrfs: remove empty block groups >>>>>> automatically), that was fine since the block group would >>>>>> never be reclaimed so the superblock was always safe. >>>>>> Once we started removing the empty block groups, we were >>>>>> protected by the fact that discards weren't being properly >>>>>> issued for unused space either via FITRIM or -odiscard. >>>>>> The block groups were still being released, but the blocks >>>>>> remained on disk. >>>>>> >>>>>> In order to properly discard unused block groups, we need >>>>>> to filter out the superblocks from the discard range. >>>>>> Superblocks are located at fixed locations on each device, >>>>>> so it makes sense to filter them out in >>>>>> btrfs_issue_discard, which is used by both -odiscard and >>>>>> FITRIM. >>>>>> >>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>>>>> fs/btrfs/extent-tree.c | 50 >>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 file >>>>>> changed, 44 insertions(+), 6 deletions(-) >>>>>> >>>>>> diff --git a/fs/btrfs/extent-tree.c >>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 --- >>>>>> a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ >>>>>> -1884,10 +1884,47 @@ static int >>>>>> remove_extent_backref(struct btrfs_trans_handle *trans, >>>>>> return ret; } >>>>>> >>>>>> -static int btrfs_issue_discard(struct block_device *bdev, >>>>>> - u64 start, u64 len) +#define in_range(b, first, len) >>>>>> ((b) >>>>>>> = (first) && (b) < (first) + (len)) >>>>> >>>>> Hi Jeff, >>>>> >>>>> So this will work if every caller behaves well and passes a >>>>> region whose start and end offsets are a multiple of the >>>>> sector size (4096) which currently matches the superblock >>>>> size. >>>>> >>>>> However, I think it would be safer to check for the case >>>>> where the start offset of a superblock mirror is < (first) >>>>> and (sb_offset + sb_len) > (first). Just to deal with cases >>>>> where for example the 2nd half of the sb starts at offset >>>>> (first). >>>>> >>>>> I guess this sectorsize becoming less than 4096 will happen >>>>> sooner or later with the subpage sectorsize patch set, so it >>>>> wouldn't hurt to make it more bullet proof already. > >> Is that something anyone intends to support? While I suppose the >> subpage sector patch /could/ be used to allow file systems with a >> node size under 4k, the intention is the other way around -- >> systems that have higher order page sizes currently don't work with >> btrfs file system created on systems with smaller order page sizes >> like x86. The best use of smaller node sizes is just to test the subpagesize patches on more common hardware. I wouldn't expect anyone to use a 1K node size in production. Thanks for doing these Jeff. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 6/11/15 3:24 PM, Chris Mason wrote: > On 06/11/2015 03:15 PM, Jeff Mahoney wrote: >> On 6/11/15 2:44 PM, Filipe David Manana wrote: >>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> >>> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote: >>>>>> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> >>>>>> wrote: >>>>>>> From: Jeff Mahoney <jeffm@suse.com> >>>>>>> >>>>>>> Btrfs doesn't track superblocks with extent records so >>>>>>> there is nothing persistent on-disk to indicate that >>>>>>> those blocks are in use. We track the superblocks in >>>>>>> memory to ensure they don't get used by removing them >>>>>>> from the free space cache when we load a block group >>>>>>> from disk. Prior to 47ab2a6c6a (Btrfs: remove empty >>>>>>> block groups automatically), that was fine since the >>>>>>> block group would never be reclaimed so the superblock >>>>>>> was always safe. Once we started removing the empty >>>>>>> block groups, we were protected by the fact that >>>>>>> discards weren't being properly issued for unused space >>>>>>> either via FITRIM or -odiscard. The block groups were >>>>>>> still being released, but the blocks remained on disk. >>>>>>> >>>>>>> In order to properly discard unused block groups, we >>>>>>> need to filter out the superblocks from the discard >>>>>>> range. Superblocks are located at fixed locations on >>>>>>> each device, so it makes sense to filter them out in >>>>>>> btrfs_issue_discard, which is used by both -odiscard >>>>>>> and FITRIM. >>>>>>> >>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>>>>>> fs/btrfs/extent-tree.c | 50 >>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 >>>>>>> file changed, 44 insertions(+), 6 deletions(-) >>>>>>> >>>>>>> diff --git a/fs/btrfs/extent-tree.c >>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 >>>>>>> --- a/fs/btrfs/extent-tree.c +++ >>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static >>>>>>> int remove_extent_backref(struct btrfs_trans_handle >>>>>>> *trans, return ret; } >>>>>>> >>>>>>> -static int btrfs_issue_discard(struct block_device >>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b, >>>>>>> first, len) ((b) >>>>>>>> = (first) && (b) < (first) + (len)) >>>>>> >>>>>> Hi Jeff, >>>>>> >>>>>> So this will work if every caller behaves well and passes >>>>>> a region whose start and end offsets are a multiple of >>>>>> the sector size (4096) which currently matches the >>>>>> superblock size. >>>>>> >>>>>> However, I think it would be safer to check for the case >>>>>> where the start offset of a superblock mirror is < >>>>>> (first) and (sb_offset + sb_len) > (first). Just to deal >>>>>> with cases where for example the 2nd half of the sb >>>>>> starts at offset (first). >>>>>> >>>>>> I guess this sectorsize becoming less than 4096 will >>>>>> happen sooner or later with the subpage sectorsize patch >>>>>> set, so it wouldn't hurt to make it more bullet proof >>>>>> already. >> >>> Is that something anyone intends to support? While I suppose >>> the subpage sector patch /could/ be used to allow file systems >>> with a node size under 4k, the intention is the other way >>> around -- systems that have higher order page sizes currently >>> don't work with btrfs file system created on systems with >>> smaller order page sizes like x86. > > The best use of smaller node sizes is just to test the subpagesize > patches on more common hardware. I wouldn't expect anyone to use a > 1K node size in production. Any chance we can enforce that? Like with a compile-time option? :) - -Jeff - -- Jeff Mahoney SUSE Labs -----BEGIN PGP SIGNATURE----- Version: GnuPG/MacGPG2 v2.0.19 (Darwin) iQIcBAEBAgAGBQJVeeE5AAoJEB57S2MheeWyGh8P/RvSEQxL0M7Od6YltKz3yg0H b/pi1ZukSTNFzQQiK/tt2rgPd0XH8p139SmJ9bWiHMPD14KDN0OtsLlAIjurLUVd Pf9jBBnaV+D8i8GVMfVBrvMqqK5xZ/6LGEFMgyqzET1WkhYZ1D7XQ5mSPKDNTqnk YwHrbeJ7+wN0zaaqL7I2ed06Yt5e5GczpIVRXxWKsNvYsQFie4rSdnG/QUDjAReI W4cynK6NRxQcjc0avqHRGdxFn7woyGe47tPdzr+eERE7yXr+hqrrXyzCXqWu8Nm0 f1mYX/RNRggPg6MmmjIxKsC/ySfs8p6CR6t6DejevtdQRw2uEFciqzx63y8/H3kX L+wN/ITEOGfeCs5ndwlfL204FhZ73brY7dNIXd97yqHJhGBxadJlsoc7eI9J/fux jNJy/wHF2/Epdfk9nBNJiRFUuL0eioYBkoE0pEdBkdoGtPAodQ+TIn+rEkxKC//n ZovStjTJYUVuDfAi9Wpraxd6oaGmqWQU+eYxxLQBXc+ADk+lh15wmWFD9tYAw5uQ kOwoz2PIxKCOVFBR3ixoPCy9nJsHRQX69L5xV76VqTDnGrBEDeGKHSy4Dhi8aTCN pSJkF2STAgxxh/CHUT0FmRsIcUiLUp26pdewIjEgXQ8S6cok2/a/+HpOJRSfD0M3 bEPZLzdZREM2FhyrBcCY =qoEa -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/11/2015 03:27 PM, Jeff Mahoney wrote: > On 6/11/15 3:24 PM, Chris Mason wrote: >> On 06/11/2015 03:15 PM, Jeff Mahoney wrote: >>> On 6/11/15 2:44 PM, Filipe David Manana wrote: >>>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney <jeffm@suse.com> >>>> wrote: On 6/11/15 12:47 PM, Filipe David Manana wrote: >>>>>>> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> >>>>>>> wrote: >>>>>>>> From: Jeff Mahoney <jeffm@suse.com> >>>>>>>> >>>>>>>> Btrfs doesn't track superblocks with extent records so >>>>>>>> there is nothing persistent on-disk to indicate that >>>>>>>> those blocks are in use. We track the superblocks in >>>>>>>> memory to ensure they don't get used by removing them >>>>>>>> from the free space cache when we load a block group >>>>>>>> from disk. Prior to 47ab2a6c6a (Btrfs: remove empty >>>>>>>> block groups automatically), that was fine since the >>>>>>>> block group would never be reclaimed so the superblock >>>>>>>> was always safe. Once we started removing the empty >>>>>>>> block groups, we were protected by the fact that >>>>>>>> discards weren't being properly issued for unused space >>>>>>>> either via FITRIM or -odiscard. The block groups were >>>>>>>> still being released, but the blocks remained on disk. >>>>>>>> >>>>>>>> In order to properly discard unused block groups, we >>>>>>>> need to filter out the superblocks from the discard >>>>>>>> range. Superblocks are located at fixed locations on >>>>>>>> each device, so it makes sense to filter them out in >>>>>>>> btrfs_issue_discard, which is used by both -odiscard >>>>>>>> and FITRIM. >>>>>>>> >>>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>>>>>>> fs/btrfs/extent-tree.c | 50 >>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ 1 >>>>>>>> file changed, 44 insertions(+), 6 deletions(-) >>>>>>>> >>>>>>>> diff --git a/fs/btrfs/extent-tree.c >>>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 >>>>>>>> --- a/fs/btrfs/extent-tree.c +++ >>>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static >>>>>>>> int remove_extent_backref(struct btrfs_trans_handle >>>>>>>> *trans, return ret; } >>>>>>>> >>>>>>>> -static int btrfs_issue_discard(struct block_device >>>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b, >>>>>>>> first, len) ((b) >>>>>>>>> = (first) && (b) < (first) + (len)) >>>>>>> >>>>>>> Hi Jeff, >>>>>>> >>>>>>> So this will work if every caller behaves well and passes >>>>>>> a region whose start and end offsets are a multiple of >>>>>>> the sector size (4096) which currently matches the >>>>>>> superblock size. >>>>>>> >>>>>>> However, I think it would be safer to check for the case >>>>>>> where the start offset of a superblock mirror is < >>>>>>> (first) and (sb_offset + sb_len) > (first). Just to deal >>>>>>> with cases where for example the 2nd half of the sb >>>>>>> starts at offset (first). >>>>>>> >>>>>>> I guess this sectorsize becoming less than 4096 will >>>>>>> happen sooner or later with the subpage sectorsize patch >>>>>>> set, so it wouldn't hurt to make it more bullet proof >>>>>>> already. >>> >>>> Is that something anyone intends to support? While I suppose >>>> the subpage sector patch /could/ be used to allow file systems >>>> with a node size under 4k, the intention is the other way >>>> around -- systems that have higher order page sizes currently >>>> don't work with btrfs file system created on systems with >>>> smaller order page sizes like x86. > >> The best use of smaller node sizes is just to test the subpagesize >> patches on more common hardware. I wouldn't expect anyone to use a >> 1K node size in production. > > Any chance we can enforce that? Like with a compile-time option? :) We can make mkfs.btrfs advise strongly against it ;) But, since I wasn't horribly clear, I'd love one extra if statement in the discard function. Silently eating bytes is horribly hard to track down. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 6/11/15 3:35 PM, Chris Mason wrote: > On 06/11/2015 03:27 PM, Jeff Mahoney wrote: >> On 6/11/15 3:24 PM, Chris Mason wrote: >>> On 06/11/2015 03:15 PM, Jeff Mahoney wrote: >>>> On 6/11/15 2:44 PM, Filipe David Manana wrote: >>>>> On Thu, Jun 11, 2015 at 7:17 PM, Jeff Mahoney >>>>> <jeffm@suse.com> wrote: On 6/11/15 12:47 PM, Filipe David >>>>> Manana wrote: >>>>>>>> On Thu, Jun 11, 2015 at 4:20 PM, <jeffm@suse.com> >>>>>>>> wrote: >>>>>>>>> From: Jeff Mahoney <jeffm@suse.com> >>>>>>>>> >>>>>>>>> Btrfs doesn't track superblocks with extent >>>>>>>>> records so there is nothing persistent on-disk to >>>>>>>>> indicate that those blocks are in use. We track >>>>>>>>> the superblocks in memory to ensure they don't get >>>>>>>>> used by removing them from the free space cache >>>>>>>>> when we load a block group from disk. Prior to >>>>>>>>> 47ab2a6c6a (Btrfs: remove empty block groups >>>>>>>>> automatically), that was fine since the block group >>>>>>>>> would never be reclaimed so the superblock was >>>>>>>>> always safe. Once we started removing the empty >>>>>>>>> block groups, we were protected by the fact that >>>>>>>>> discards weren't being properly issued for unused >>>>>>>>> space either via FITRIM or -odiscard. The block >>>>>>>>> groups were still being released, but the blocks >>>>>>>>> remained on disk. >>>>>>>>> >>>>>>>>> In order to properly discard unused block groups, >>>>>>>>> we need to filter out the superblocks from the >>>>>>>>> discard range. Superblocks are located at fixed >>>>>>>>> locations on each device, so it makes sense to >>>>>>>>> filter them out in btrfs_issue_discard, which is >>>>>>>>> used by both -odiscard and FITRIM. >>>>>>>>> >>>>>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> --- >>>>>>>>> fs/btrfs/extent-tree.c | 50 >>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++------ >>>>>>>>> 1 file changed, 44 insertions(+), 6 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/fs/btrfs/extent-tree.c >>>>>>>>> b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 >>>>>>>>> 100644 --- a/fs/btrfs/extent-tree.c +++ >>>>>>>>> b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ >>>>>>>>> static int remove_extent_backref(struct >>>>>>>>> btrfs_trans_handle *trans, return ret; } >>>>>>>>> >>>>>>>>> -static int btrfs_issue_discard(struct block_device >>>>>>>>> *bdev, - u64 start, u64 len) +#define in_range(b, >>>>>>>>> first, len) ((b) >>>>>>>>>> = (first) && (b) < (first) + (len)) >>>>>>>> >>>>>>>> Hi Jeff, >>>>>>>> >>>>>>>> So this will work if every caller behaves well and >>>>>>>> passes a region whose start and end offsets are a >>>>>>>> multiple of the sector size (4096) which currently >>>>>>>> matches the superblock size. >>>>>>>> >>>>>>>> However, I think it would be safer to check for the >>>>>>>> case where the start offset of a superblock mirror >>>>>>>> is < (first) and (sb_offset + sb_len) > (first). >>>>>>>> Just to deal with cases where for example the 2nd >>>>>>>> half of the sb starts at offset (first). >>>>>>>> >>>>>>>> I guess this sectorsize becoming less than 4096 will >>>>>>>> happen sooner or later with the subpage sectorsize >>>>>>>> patch set, so it wouldn't hurt to make it more >>>>>>>> bullet proof already. >>>> >>>>> Is that something anyone intends to support? While I >>>>> suppose the subpage sector patch /could/ be used to allow >>>>> file systems with a node size under 4k, the intention is >>>>> the other way around -- systems that have higher order >>>>> page sizes currently don't work with btrfs file system >>>>> created on systems with smaller order page sizes like x86. >> >>> The best use of smaller node sizes is just to test the >>> subpagesize patches on more common hardware. I wouldn't >>> expect anyone to use a 1K node size in production. >> >> Any chance we can enforce that? Like with a compile-time >> option? :) > > We can make mkfs.btrfs advise strongly against it ;) > > But, since I wasn't horribly clear, I'd love one extra if > statement in the discard function. Silently eating bytes is > horribly hard to track down. Heh, yeah. I'm making it bulletproof now. If the goal is to also catch potential misbehavior, I'm catching some other cases as well. A few extra conditionals will still take a small percentage of the time a discard takes. - -Jeff - -- Jeff Mahoney SUSE Labs -----BEGIN PGP SIGNATURE----- Version: GnuPG/MacGPG2 v2.0.19 (Darwin) iQIcBAEBAgAGBQJVeeWDAAoJEB57S2MheeWygMEQAMPCNf8ZIMfYRDkzbpW0mezB 6Vbu7PM5WNAqOU2XdJXq47Z+jvLzsbBG0Z1hDLdavkQiOfjOQeBDvwVQQwFPizJ9 lRA4HB6P0nMKVl4x4PcXzgRinrIIy46nFY7VFZBe/cO0aEq7bsB3/vjlRj4LKvsp eeMg212Sc4V6yuVbSfLSgYTtMGcAsmE9rUWl+2+kV6aTGqZr72YG1033YVu9Y+0F vnelEIKFSmYF1y7FqO8Ejpk7G6fOoKYXGIxjcyC5v6kAKygZkxuUFYt9wPgpxl4X eTYnPwjRwE3qRHlZtCGmb0SKvIkFMeKaI5Dy8KXUSHu6Q4NZ8q+kftgzNTGHcEzD EgGrsbMa3N6necDYsmKYrIWVq21Nj2vSZc7YmLDKYtVQJRH2ScPOvHQlosEX8JsA h4DfSp8fLVWu8hAORrUvByrGfw7DkFOlv1bF4B76MokP7sb4ITnpBUJtW+0Uiw3x n1OJ94RiFOXpxWvEYquZUnK/9k1cg/eCwDpaFTCSDrTOVfW78lnoso1VKhQ1CJLg Ed3I77RA0jPE004hpwtLdGE2AMiOZfAMKTAPkErnnWMfcrBh9O8DUBWVXds3IBSg mv6lKPz24P28ymOINkqFC22D1vyXBH4Xiel0ZuPHHjnrxPUwovrF//XRbwcc7lCf jzsGyTnEnAf00/R8s7sP =v4r5 -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec3acd..75d0226 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1884,10 +1884,47 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + u64 skipped = 0; + u64 bytes_left = len; + int ret = 0 , j; + + *discarded_bytes = 0; + + /* Skip any superblocks on this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_offset = btrfs_sb_offset(j); + u64 size = sb_offset - start; + + if (!in_range(sb_offset, start, bytes_left)) + continue; + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_offset + BTRFS_SUPER_INFO_SIZE; + if (start > len) + start = len; + bytes_left = len - start; + skipped += BTRFS_SUPER_INFO_SIZE; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ -1906,16 +1943,17 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; + if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */