Message ID | 20240312144532.1044427-2-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/5] block: move discard checks into the ioctl handler | expand |
On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote: > @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev, > static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, > unsigned long arg) > { > + sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1; > + sector_t sector, nr_sects; > uint64_t range[2]; > uint64_t start, len; > struct inode *inode = bdev->bd_inode; > @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, > > if (!bdev_max_discard_sectors(bdev)) > return -EOPNOTSUPP; > + if (bdev_read_only(bdev)) > + return -EPERM; > > if (copy_from_user(range, (void __user *)arg, sizeof(range))) > return -EFAULT; > > start = range[0]; > len = range[1]; > + sector = start >> SECTOR_SHIFT; > + nr_sects = len >> SECTOR_SHIFT; > > - if (start & 511) > + if (!nr_sects) > return -EINVAL; > - if (len & 511) > + if ((sector | nr_sects) & bs_mask) > return -EINVAL; > - > if (start + len > bdev_nr_bytes(bdev)) > return -EINVAL; Maybe you want to shift lower bytes out of consideration, but it is different, right? For example, if I call this ioctl with start=5 and len=555, it would return EINVAL, but your change would let it succeed the same as if start=0, len=512.
On Tue, Mar 12, 2024 at 04:12:54PM -0600, Keith Busch wrote: > > + if (!nr_sects) > > return -EINVAL; > > + if ((sector | nr_sects) & bs_mask) > > return -EINVAL; > > - > > if (start + len > bdev_nr_bytes(bdev)) > > return -EINVAL; > > Maybe you want to shift lower bytes out of consideration, but it is > different, right? For example, if I call this ioctl with start=5 and > len=555, it would return EINVAL, but your change would let it succeed > the same as if start=0, len=512. We did the same before, just down in __blkdev_issue_discard instead of in the ioctl handler.
On Tue, Mar 12, 2024 at 11:31:31PM +0100, Christoph Hellwig wrote: > On Tue, Mar 12, 2024 at 04:12:54PM -0600, Keith Busch wrote: > > > + if (!nr_sects) > > > return -EINVAL; > > > + if ((sector | nr_sects) & bs_mask) > > > return -EINVAL; > > > - > > > if (start + len > bdev_nr_bytes(bdev)) > > > return -EINVAL; > > > > Maybe you want to shift lower bytes out of consideration, but it is > > different, right? For example, if I call this ioctl with start=5 and > > len=555, it would return EINVAL, but your change would let it succeed > > the same as if start=0, len=512. > > We did the same before, just down in __blkdev_issue_discard instead of > in the ioctl handler. Here's an example program demonstrating the difference: discard-test.c: --- #include <stdio.h> #include <stdint.h> #include <fcntl.h> #include <linux/fs.h> #include <sys/ioctl.h> int main(int argc, char **argv) { uint64_t range[2]; int fd; if (argc < 2) return -1; fd = open(argv[1], O_RDWR); if (fd < 0) return fd; range[0] = 5; range[1] = 555; ioctl(fd, BLKDISCARD, &range); perror("BLKDISCARD"); return 0; } -- Before: # ./discard-test /dev/nvme0n1 BLKDISCARD: Invalid argument After: # ./discard-test /dev/nvme0n1 BLKDISCARD: Success
On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote: > @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev, > static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, > unsigned long arg) > { > + sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1; > + sector_t sector, nr_sects; > uint64_t range[2]; > uint64_t start, len; > struct inode *inode = bdev->bd_inode; > @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, > > if (!bdev_max_discard_sectors(bdev)) > return -EOPNOTSUPP; > + if (bdev_read_only(bdev)) > + return -EPERM; > > if (copy_from_user(range, (void __user *)arg, sizeof(range))) > return -EFAULT; > > start = range[0]; > len = range[1]; > + sector = start >> SECTOR_SHIFT; > + nr_sects = len >> SECTOR_SHIFT; > > - if (start & 511) > + if (!nr_sects) > return -EINVAL; > - if (len & 511) > + if ((sector | nr_sects) & bs_mask) > return -EINVAL; > - > if (start + len > bdev_nr_bytes(bdev)) > return -EINVAL; > > @@ -124,7 +129,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, > err = truncate_bdev_range(bdev, mode, start, start + len - 1); > if (err) > goto fail; > - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); > + err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); > fail: > filemap_invalidate_unlock(inode->i_mapping); > return err; > -- The incremental change I think you want atop this patch to keep the previous behavior: -- >8 -- diff --git b/block/ioctl.c a/block/ioctl.c index 57c8171fda93c..e14388548ab97 100644 --- b/block/ioctl.c +++ a/block/ioctl.c @@ -95,7 +95,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { - sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1; + sector_t mask = bdev_logical_block_size(bdev) - 1; sector_t sector, nr_sects; uint64_t 
range[2]; uint64_t start, len; @@ -120,7 +120,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (!nr_sects) return -EINVAL; - if ((sector | nr_sects) & bs_mask) + if ((start | len) & mask) return -EINVAL; if (start + len > bdev_nr_bytes(bdev)) return -EINVAL;
On Wed, Mar 13, 2024 at 09:40:21AM -0600, Keith Busch wrote: > The incremental change I think you want atop this patch to keep the > previous behavior: Ah, yes, thanks. Can you submit your reproducer to blktests or at least throw a license on it that allows me to wire it up? Also I'm going to wait for more comments on the approach in this series before resending it, but we really should get a fix in in the next days for this regression.
On 3/13/24 2:06 PM, Christoph Hellwig wrote: > Also I'm going to wait for more comments on the approach in this > series before resending it, but we really should get a fix in in > the next days for this regression. Yes please, sooner rather than later. Too much fallout this merge window, and I've got other items that should go out soon too.
diff --git a/block/blk-lib.c b/block/blk-lib.c index dc8e35d0a51d6d..50923508a32466 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -59,26 +59,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - - /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", - bdev); - return -EOPNOTSUPP; - } - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - if (!nr_sects) - return -EINVAL; while (nr_sects) { sector_t req_sects = diff --git a/block/ioctl.c b/block/ioctl.c index 0c76137adcaaa5..57c8171fda93c5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { + sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1; + sector_t sector, nr_sects; uint64_t range[2]; uint64_t start, len; struct inode *inode = bdev->bd_inode; @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; + if (bdev_read_only(bdev)) + return -EPERM; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; + sector = start >> SECTOR_SHIFT; + nr_sects = len >> SECTOR_SHIFT; - if (start & 511) + if (!nr_sects) return -EINVAL; - if (len & 511) + if ((sector | nr_sects) & bs_mask) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; @@ -124,7 +129,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, err = 
truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); fail: filemap_invalidate_unlock(inode->i_mapping); return err;
Most bio operations get basic sanity checking in submit_bio and anything more complicated than that is done in the callers. Discards are a bit different from that in that a lot of checking is done in __blkdev_issue_discard, and the specific errnos for that are returned to userspace. Move the checks that require specific errnos to the ioctl handler instead and replace the existing kernel sector alignment check with the actual alignment check based on the logical block size. This leaves just the basic sanity checking in submit_bio for the other submitters of discards and introduces two changes in behavior: 1) the logical block size alignment check of the start and len is lost for non-ioctl callers. This matches what is done for other operations including reads and writes. We should probably verify this for all bios, but for now make discards match the normal flow. 2) for non-ioctl callers all errors are reported on I/O completion now instead of synchronously. Callers in general mostly ignore or log errors so this will actually simplify the code once cleaned up. Signed-off-by: Christoph Hellwig <hch@lst.de> --- block/blk-lib.c | 20 -------------------- block/ioctl.c | 13 +++++++++---- 2 files changed, 9 insertions(+), 24 deletions(-)