diff mbox series

[1/5] block: move discard checks into the ioctl handler

Message ID 20240312144532.1044427-2-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [1/5] block: move discard checks into the ioctl handler | expand

Commit Message

Christoph Hellwig March 12, 2024, 2:45 p.m. UTC
Most bio operations get basic sanity checking in submit_bio and anything
more complicated than that is done in the callers.  Discards are a bit
different from that in that a lot of checking is done in
__blkdev_issue_discard, and the specific errnos for that are returned
to userspace.  Move the checks that require specific errnos to the ioctl
handler instead and replace the existing kernel sector alignment check
with the actual alignment check based on the logical block size. This
leaves just the basic sanity checking in submit_bio for the other
submitters of discards and introduces two changes in behavior:

 1) the logical block size alignment check of the start and len is lost
    for non-ioctl callers.
    This matches what is done for other operations including reads and
    writes.  We should probably verify this for all bios, but for now
    make discards match the normal flow.
 2) for non-ioctl callers all errors are reported on I/O completion now
    instead of synchronously.  Callers in general mostly ignore or log
    errors so this will actually simplify the code once cleaned up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c | 20 --------------------
 block/ioctl.c   | 13 +++++++++----
 2 files changed, 9 insertions(+), 24 deletions(-)

Comments

Keith Busch March 12, 2024, 10:12 p.m. UTC | #1
On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote:
> @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
>  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  		unsigned long arg)
>  {
> +	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
> +	sector_t sector, nr_sects;
>  	uint64_t range[2];
>  	uint64_t start, len;
>  	struct inode *inode = bdev->bd_inode;
> @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  
>  	if (!bdev_max_discard_sectors(bdev))
>  		return -EOPNOTSUPP;
> +	if (bdev_read_only(bdev))
> +		return -EPERM;
>  
>  	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>  		return -EFAULT;
>  
>  	start = range[0];
>  	len = range[1];
> +	sector = start >> SECTOR_SHIFT;
> +	nr_sects = len >> SECTOR_SHIFT;
>  
> -	if (start & 511)
> +	if (!nr_sects)
>  		return -EINVAL;
> -	if (len & 511)
> +	if ((sector | nr_sects) & bs_mask)
>  		return -EINVAL;
> -
>  	if (start + len > bdev_nr_bytes(bdev))
>  		return -EINVAL;

Maybe you want to shift lower bytes out of consideration, but it is
different, right? For example, if I call this ioctl with start=5 and
len=555, it would return EINVAL, but your change would let it succeed
the same as if start=0, len=512.
Christoph Hellwig March 12, 2024, 10:31 p.m. UTC | #2
On Tue, Mar 12, 2024 at 04:12:54PM -0600, Keith Busch wrote:
> > +	if (!nr_sects)
> >  		return -EINVAL;
> > +	if ((sector | nr_sects) & bs_mask)
> >  		return -EINVAL;
> > -
> >  	if (start + len > bdev_nr_bytes(bdev))
> >  		return -EINVAL;
> 
> Maybe you want to shift lower bytes out of consideration, but it is
> different, right? For example, if I call this ioctl with start=5 and
> len=555, it would return EINVAL, but your change would let it succeed
> the same as if start=0, len=512.

We did the same before, just down in __blkdev_issue_discard instead of
in the ioctl handler.
Keith Busch March 13, 2024, 1:22 a.m. UTC | #3
On Tue, Mar 12, 2024 at 11:31:31PM +0100, Christoph Hellwig wrote:
> On Tue, Mar 12, 2024 at 04:12:54PM -0600, Keith Busch wrote:
> > > +	if (!nr_sects)
> > >  		return -EINVAL;
> > > +	if ((sector | nr_sects) & bs_mask)
> > >  		return -EINVAL;
> > > -
> > >  	if (start + len > bdev_nr_bytes(bdev))
> > >  		return -EINVAL;
> > 
> > Maybe you want to shift lower bytes out of consideration, but it is
> > different, right? For example, if I call this ioctl with start=5 and
> > len=555, it would return EINVAL, but your change would let it succeed
> > the same as if start=0, len=512.
> 
> We did the same before, just down in __blkdev_issue_discard instead of
> in the ioctl handler.

Here's an example program demonstrating the difference:

discard-test.c:
---
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>

/*
 * Reproducer: issue BLKDISCARD on the block device named by argv[1] with a
 * byte range that is not 512-byte aligned (start=5, len=555) and report the
 * outcome on stderr.
 *
 * Exit status: 0 if the ioctl succeeded, 1 on a usage error or if open()
 * or the ioctl failed, so the result can be checked from a script.
 */
int main(int argc, char **argv)
{
	/* BLKDISCARD takes { start, length }, both expressed in bytes. */
	uint64_t range[2] = { 5, 555 };
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <block-device>\n",
			argc > 0 ? argv[0] : "discard-test");
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror(argv[1]);
		return 1;
	}

	/*
	 * Only consult errno when the call actually failed; after a
	 * successful call its value is not meaningful.
	 */
	if (ioctl(fd, BLKDISCARD, range) < 0) {
		perror("BLKDISCARD");
		return 1;
	}

	/*
	 * Keep the success line the original printed via perror() with
	 * errno == 0 (glibc wording), so the before/after output stays
	 * comparable.
	 */
	fputs("BLKDISCARD: Success\n", stderr);

	return 0;
}
--

Before:

 # ./discard-test /dev/nvme0n1
 BLKDISCARD: Invalid argument

After:

 # ./discard-test /dev/nvme0n1
 BLKDISCARD: Success
Keith Busch March 13, 2024, 3:40 p.m. UTC | #4
On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote:
> @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
>  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  		unsigned long arg)
>  {
> +	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
> +	sector_t sector, nr_sects;
>  	uint64_t range[2];
>  	uint64_t start, len;
>  	struct inode *inode = bdev->bd_inode;
> @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  
>  	if (!bdev_max_discard_sectors(bdev))
>  		return -EOPNOTSUPP;
> +	if (bdev_read_only(bdev))
> +		return -EPERM;
>  
>  	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>  		return -EFAULT;
>  
>  	start = range[0];
>  	len = range[1];
> +	sector = start >> SECTOR_SHIFT;
> +	nr_sects = len >> SECTOR_SHIFT;
>  
> -	if (start & 511)
> +	if (!nr_sects)
>  		return -EINVAL;
> -	if (len & 511)
> +	if ((sector | nr_sects) & bs_mask)
>  		return -EINVAL;
> -
>  	if (start + len > bdev_nr_bytes(bdev))
>  		return -EINVAL;
>  
> @@ -124,7 +129,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
>  	if (err)
>  		goto fail;
> -	err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
> +	err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL);
>  fail:
>  	filemap_invalidate_unlock(inode->i_mapping);
>  	return err;
> -- 

The incremental change I think you want atop this patch to keep the
previous behavior:

-- >8 --
diff --git b/block/ioctl.c a/block/ioctl.c
index 57c8171fda93c..e14388548ab97 100644
--- b/block/ioctl.c
+++ a/block/ioctl.c
@@ -95,7 +95,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 		unsigned long arg)
 {
-	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
+	sector_t mask = bdev_logical_block_size(bdev) - 1;
 	sector_t sector, nr_sects;
 	uint64_t range[2];
 	uint64_t start, len;
@@ -120,7 +120,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 
 	if (!nr_sects)
 		return -EINVAL;
-	if ((sector | nr_sects) & bs_mask)
+	if ((start | len) & mask)
 		return -EINVAL;
 	if (start + len > bdev_nr_bytes(bdev))
 		return -EINVAL;
Christoph Hellwig March 13, 2024, 8:06 p.m. UTC | #5
On Wed, Mar 13, 2024 at 09:40:21AM -0600, Keith Busch wrote:
> The incremental change I think you want atop this patch to keep the
> previous behavior:

Ah, yes, thanks.  Can you submit your reproducer to blktests or at
least throw a license on it that allows me to wire it up?

Also I'm going to wait for more comments on the approach in this
series before resending it, but we really should get a fix in in
the next days for this regression.
Jens Axboe March 13, 2024, 8:08 p.m. UTC | #6
On 3/13/24 2:06 PM, Christoph Hellwig wrote:
> Also I'm going to wait for more comments on the approach in this
> series before resending it, but we really should get a fix in in
> the next days for this regression.

Yes please, sooner rather than later. Too much fallout this merge
window, and I've got other items that should go out soon too.
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index dc8e35d0a51d6d..50923508a32466 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -59,26 +59,6 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
 {
 	struct bio *bio = *biop;
-	sector_t bs_mask;
-
-	if (bdev_read_only(bdev))
-		return -EPERM;
-	if (!bdev_max_discard_sectors(bdev))
-		return -EOPNOTSUPP;
-
-	/* In case the discard granularity isn't set by buggy device driver */
-	if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
-		pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
-				   bdev);
-		return -EOPNOTSUPP;
-	}
-
-	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
-	if ((sector | nr_sects) & bs_mask)
-		return -EINVAL;
-
-	if (!nr_sects)
-		return -EINVAL;
 
 	while (nr_sects) {
 		sector_t req_sects =
diff --git a/block/ioctl.c b/block/ioctl.c
index 0c76137adcaaa5..57c8171fda93c5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -95,6 +95,8 @@  static int compat_blkpg_ioctl(struct block_device *bdev,
 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 		unsigned long arg)
 {
+	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
+	sector_t sector, nr_sects;
 	uint64_t range[2];
 	uint64_t start, len;
 	struct inode *inode = bdev->bd_inode;
@@ -105,18 +107,21 @@  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 
 	if (!bdev_max_discard_sectors(bdev))
 		return -EOPNOTSUPP;
+	if (bdev_read_only(bdev))
+		return -EPERM;
 
 	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 		return -EFAULT;
 
 	start = range[0];
 	len = range[1];
+	sector = start >> SECTOR_SHIFT;
+	nr_sects = len >> SECTOR_SHIFT;
 
-	if (start & 511)
+	if (!nr_sects)
 		return -EINVAL;
-	if (len & 511)
+	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
-
 	if (start + len > bdev_nr_bytes(bdev))
 		return -EINVAL;
 
@@ -124,7 +129,7 @@  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 	if (err)
 		goto fail;
-	err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL);
 fail:
 	filemap_invalidate_unlock(inode->i_mapping);
 	return err;