Message ID | 20240620125359.2684798-8-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | block atomic writes | expand |
On Thu, Jun 20, 2024 at 12:53:56PM +0000, John Garry wrote:
> Support atomic writes by submitting a single BIO with the REQ_ATOMIC set.
>
> It must be ensured that the atomic write adheres to its rules, like
> naturally aligned offset, so call blkdev_dio_invalid() ->
> blkdev_atomic_write_valid() [with renaming blkdev_dio_unaligned() to
> blkdev_dio_invalid()] for this purpose. The BIO submission path currently
> checks for atomic writes which are too large, so no need to check here.
>
> In blkdev_direct_IO(), if the nr_pages exceeds BIO_MAX_VECS, then we cannot
> produce a single BIO, so error in this case.
>
> Finally set FMODE_CAN_ATOMIC_WRITE when the bdev can support atomic writes
> and the associated file flag is for O_DIRECT.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>
On 6/20/24 14:53, John Garry wrote:
> Support atomic writes by submitting a single BIO with the REQ_ATOMIC set.
>
> It must be ensured that the atomic write adheres to its rules, like
> naturally aligned offset, so call blkdev_dio_invalid() ->
> blkdev_atomic_write_valid() [with renaming blkdev_dio_unaligned() to
> blkdev_dio_invalid()] for this purpose. The BIO submission path currently
> checks for atomic writes which are too large, so no need to check here.
>
> In blkdev_direct_IO(), if the nr_pages exceeds BIO_MAX_VECS, then we cannot
> produce a single BIO, so error in this case.
>
> Finally set FMODE_CAN_ATOMIC_WRITE when the bdev can support atomic writes
> and the associated file flag is for O_DIRECT.
>
> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  block/fops.c | 20 +++++++++++++++++---
>  1 file changed, 17 insertions(+), 3 deletions(-)
>
> diff --git a/block/fops.c b/block/fops.c
> index 376265935714..be36c9fbd500 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
>  	return opf;
>  }
>
> -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
> -			      struct iov_iter *iter)
> +static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
> +				struct iov_iter *iter, bool is_atomic)
>  {
> +	if (is_atomic && !generic_atomic_write_valid(iter, pos))
> +		return true;
> +
>  	return pos & (bdev_logical_block_size(bdev) - 1) ||
>  		!bdev_iter_is_aligned(bdev, iter);
>  }
> @@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
>  	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
>  	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
>  	bio.bi_ioprio = iocb->ki_ioprio;
> +	if (iocb->ki_flags & IOCB_ATOMIC)
> +		bio.bi_opf |= REQ_ATOMIC;
>
>  	ret = bio_iov_iter_get_pages(&bio, iter);
>  	if (unlikely(ret))
> @@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>  		task_io_account_write(bio->bi_iter.bi_size);
>  	}
>
> +	if (iocb->ki_flags & IOCB_ATOMIC)
> +		bio->bi_opf |= REQ_ATOMIC;
> +
>  	if (iocb->ki_flags & IOCB_NOWAIT)
>  		bio->bi_opf |= REQ_NOWAIT;
>
> @@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>  static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  {
>  	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
> +	bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
>  	unsigned int nr_pages;
>
>  	if (!iov_iter_count(iter))
>  		return 0;
>
> -	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
> +	if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))

Why not passing in iocb->ki_flags here?
Or, indeed, the entire iocb?

Cheers,

Hannes
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
On 21/06/2024 07:13, Hannes Reinecke wrote:
> On 6/20/24 14:53, John Garry wrote:
>> Support atomic writes by submitting a single BIO with the REQ_ATOMIC set.
>>
>> It must be ensured that the atomic write adheres to its rules, like
>> naturally aligned offset, so call blkdev_dio_invalid() ->
>> blkdev_atomic_write_valid() [with renaming blkdev_dio_unaligned() to
>> blkdev_dio_invalid()] for this purpose. The BIO submission path currently
>> checks for atomic writes which are too large, so no need to check here.
>>
>> In blkdev_direct_IO(), if the nr_pages exceeds BIO_MAX_VECS, then we
>> cannot
>> produce a single BIO, so error in this case.
>>
>> Finally set FMODE_CAN_ATOMIC_WRITE when the bdev can support atomic
>> writes
>> and the associated file flag is for O_DIRECT.
>>
>> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>  block/fops.c | 20 +++++++++++++++++---
>>  1 file changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/fops.c b/block/fops.c
>> index 376265935714..be36c9fbd500 100644
>> --- a/block/fops.c
>> +++ b/block/fops.c
>> @@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
>>  	return opf;
>>  }
>> -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
>> -			      struct iov_iter *iter)
>> +static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
>> +				struct iov_iter *iter, bool is_atomic)
>>  {
>> +	if (is_atomic && !generic_atomic_write_valid(iter, pos))
>> +		return true;
>> +
>>  	return pos & (bdev_logical_block_size(bdev) - 1) ||
>>  		!bdev_iter_is_aligned(bdev, iter);
>>  }
>> @@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct
>> kiocb *iocb,
>>  	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
>>  	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
>>  	bio.bi_ioprio = iocb->ki_ioprio;
>> +	if (iocb->ki_flags & IOCB_ATOMIC)
>> +		bio.bi_opf |= REQ_ATOMIC;
>>  	ret = bio_iov_iter_get_pages(&bio, iter);
>>  	if (unlikely(ret))
>> @@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct
>> kiocb *iocb,
>>  		task_io_account_write(bio->bi_iter.bi_size);
>>  	}
>> +	if (iocb->ki_flags & IOCB_ATOMIC)
>> +		bio->bi_opf |= REQ_ATOMIC;
>> +
>>  	if (iocb->ki_flags & IOCB_NOWAIT)
>>  		bio->bi_opf |= REQ_NOWAIT;
>> @@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct
>> kiocb *iocb,
>>  static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
>> *iter)
>>  {
>>  	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
>> +	bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
>>  	unsigned int nr_pages;
>>  	if (!iov_iter_count(iter))
>>  		return 0;
>> -	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
>> +	if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
>
> Why not passing in iocb->ki_flags here?
> Or, indeed, the entire iocb?

We could (pass the iocb), but we only need to look up one thing - ki_pos. We
already have is_atomic local. I am just trying to make things as efficient
as possible. If you really think it's better (to pass iocb), then it can be
changed.

Thanks,
John
On Fri, Jun 21, 2024 at 01:02:34PM +0100, John Garry wrote:
> On 21/06/2024 07:13, Hannes Reinecke wrote:
> > On 6/20/24 14:53, John Garry wrote:
> > > Support atomic writes by submitting a single BIO with the REQ_ATOMIC set.
> > >
> > > It must be ensured that the atomic write adheres to its rules, like
> > > naturally aligned offset, so call blkdev_dio_invalid() ->
> > > blkdev_atomic_write_valid() [with renaming blkdev_dio_unaligned() to
> > > blkdev_dio_invalid()] for this purpose. The BIO submission path currently
> > > checks for atomic writes which are too large, so no need to check here.
> > >
> > > In blkdev_direct_IO(), if the nr_pages exceeds BIO_MAX_VECS, then we
> > > cannot
> > > produce a single BIO, so error in this case.
> > >
> > > Finally set FMODE_CAN_ATOMIC_WRITE when the bdev can support atomic
> > > writes
> > > and the associated file flag is for O_DIRECT.
> > >
> > > Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >  block/fops.c | 20 +++++++++++++++++---
> > >  1 file changed, 17 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/block/fops.c b/block/fops.c
> > > index 376265935714..be36c9fbd500 100644
> > > --- a/block/fops.c
> > > +++ b/block/fops.c
> > > @@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
> > >  	return opf;
> > >  }
> > > -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
> > > -			      struct iov_iter *iter)
> > > +static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
> > > +				struct iov_iter *iter, bool is_atomic)
> > >  {
> > > +	if (is_atomic && !generic_atomic_write_valid(iter, pos))
> > > +		return true;
> > > +
> > >  	return pos & (bdev_logical_block_size(bdev) - 1) ||
> > >  		!bdev_iter_is_aligned(bdev, iter);
> > >  }
> > > @@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct
> > > kiocb *iocb,
> > >  	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
> > >  	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
> > >  	bio.bi_ioprio = iocb->ki_ioprio;
> > > +	if (iocb->ki_flags & IOCB_ATOMIC)
> > > +		bio.bi_opf |= REQ_ATOMIC;
> > >  	ret = bio_iov_iter_get_pages(&bio, iter);
> > >  	if (unlikely(ret))
> > > @@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct
> > > kiocb *iocb,
> > >  		task_io_account_write(bio->bi_iter.bi_size);
> > >  	}
> > > +	if (iocb->ki_flags & IOCB_ATOMIC)
> > > +		bio->bi_opf |= REQ_ATOMIC;
> > > +
> > >  	if (iocb->ki_flags & IOCB_NOWAIT)
> > >  		bio->bi_opf |= REQ_NOWAIT;
> > > @@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct
> > > kiocb *iocb,
> > >  static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct
> > > iov_iter *iter)
> > >  {
> > >  	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
> > > +	bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
> > >  	unsigned int nr_pages;
> > >  	if (!iov_iter_count(iter))
> > >  		return 0;
> > > -	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
> > > +	if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
> >
> > Why not passing in iocb->ki_flags here?
> > Or, indeed, the entire iocb?
>
> We could (pass the iocb), but we only need to look up one thing - ki_pos. We
> already have is_atomic local. I am just trying to make things as efficient
> as possible. If you really think it's better (to pass iocb), then it can be
> changed.

I certainly do. ;)

https://lore.kernel.org/linux-xfs/20240620212401.GA3058325@frogsfrogsfrogs/

--D

> Thanks,
> John
>
>
diff --git a/block/fops.c b/block/fops.c
index 376265935714..be36c9fbd500 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 	return opf;
 }
 
-static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
-			      struct iov_iter *iter)
+static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
+				struct iov_iter *iter, bool is_atomic)
 {
+	if (is_atomic && !generic_atomic_write_valid(iter, pos))
+		return true;
+
 	return pos & (bdev_logical_block_size(bdev) - 1) ||
 		!bdev_iter_is_aligned(bdev, iter);
 }
@@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
 	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
 	bio.bi_ioprio = iocb->ki_ioprio;
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		bio.bi_opf |= REQ_ATOMIC;
 
 	ret = bio_iov_iter_get_pages(&bio, iter);
 	if (unlikely(ret))
@@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}
 
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		bio->bi_opf |= REQ_ATOMIC;
+
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		bio->bi_opf |= REQ_NOWAIT;
 
@@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+	bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
 	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
 
-	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+	if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
 		return -EINVAL;
 
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@@ -373,6 +382,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			return __blkdev_direct_IO_simple(iocb, iter, bdev,
 							nr_pages);
 		return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
+	} else if (is_atomic) {
+		return -EINVAL;
 	}
 	return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
 }
@@ -612,6 +623,9 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	if (!bdev)
 		return -ENXIO;
 
+	if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
+		filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
 	ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
 	if (ret)
 		blkdev_put_no_open(bdev);