
[10/13] iomap: fix iomap_dio_zero() for fs bs > system page size

Message ID: 20240226094936.2677493-11-kernel@pankajraghav.com (mailing list archive)
State: New
Series: enable bs > ps in XFS

Commit Message

Pankaj Raghav (Samsung) Feb. 26, 2024, 9:49 a.m. UTC
From: Pankaj Raghav <p.raghav@samsung.com>

iomap_dio_zero() will pad a fs block with zeroes if the direct IO size
is smaller than the fs block size. iomap_dio_zero() has an implicit
assumption that the fs block size is smaller than the page size. This is
true for most filesystems at the moment.

If the block size > page size, this will send the contents of the page
next to the zero page (as len > PAGE_SIZE) to the underlying block
device, causing FS corruption.

iomap is generic infrastructure and it should not make any assumptions
about the fs block size and the page size of the system.
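
As a concrete illustration (the numbers are mine, not from the patch):
with a 16k block filesystem on a 4k page system, a 4k sub-block direct
write leaves 12k of the block to pad, so len is three times the size of
ZERO_PAGE(0):

```
/*
 * Illustrative numbers only, assuming a 16k fs block and 4k pages.
 * A 4k direct write into a 16k block leaves 12k to zero-fill.
 */
unsigned int len = 16384 - 4096;	/* 12288 bytes of padding */

/* Old code: one 12k bvec built on a single 4k zero page, so the bio
 * also maps the 8k that happens to sit after ZERO_PAGE(0) in memory. */
__bio_add_page(bio, ZERO_PAGE(0), len, 0);

/* Fixed code: cap each bvec at PAGE_SIZE so only the zero page itself
 * is ever added -- three times in this example. */
while (len) {
	unsigned int io_len = min_t(unsigned int, len, PAGE_SIZE);

	__bio_add_page(bio, ZERO_PAGE(0), io_len, 0);
	len -= io_len;
}
```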

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/direct-io.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Comments

Matthew Wilcox Feb. 26, 2024, 5:58 p.m. UTC | #1
On Mon, Feb 26, 2024 at 10:49:33AM +0100, Pankaj Raghav (Samsung) wrote:
> +++ b/fs/iomap/direct-io.c
> @@ -239,14 +239,23 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>  	struct page *page = ZERO_PAGE(0);
>  	struct bio *bio;
>  
> -	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
> +	WARN_ON_ONCE(len > (BIO_MAX_VECS * PAGE_SIZE));
> +
> +	bio = iomap_dio_alloc_bio(iter, dio, BIO_MAX_VECS,
> +				  REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
>  	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
>  				  GFP_KERNEL);
> +
>  	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
>  	bio->bi_private = dio;
>  	bio->bi_end_io = iomap_dio_bio_end_io;
>  
> -	__bio_add_page(bio, page, len, 0);
> +	while (len) {
> +		unsigned int io_len = min_t(unsigned int, len, PAGE_SIZE);
> +
> +		__bio_add_page(bio, page, io_len, 0);
> +		len -= io_len;
> +	}

I thought we were going to use the huge_zero_page for this?
Pankaj Raghav (Samsung) Feb. 27, 2024, 9:33 a.m. UTC | #2
> I thought we were going to use the huge_zero_page for this?

Yes. We discussed that huge_zero_page might fail, so we concluded that
we needed an API that returns a folio of arbitrary order but that will
not fail:
```
your point about it possibly failing is correct.  so i think we need an
api which definitely returns a folio, but it might be of arbitrary
order.
```

I couldn't come up with an implementation of your latter suggestion, so
I told Darrick that we should use this patch for now and add the
arbitrary-order zero folio as a later enhancement.
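
For illustration, such a helper could look like the sketch below; note
that get_zero_folio() is hypothetical, not an existing kernel API -- it
just spells out the fallback idea:

```
/*
 * Hypothetical sketch, not an existing kernel API: always returns a
 * zero-filled folio, of whatever order is available.  Falling back to
 * the order-0 ZERO_PAGE(0) means the call itself cannot fail.
 */
static struct folio *get_zero_folio(struct mm_struct *mm)
{
	struct page *page = mm_get_huge_zero_page(mm);

	if (!page)
		page = ZERO_PAGE(0);
	return page_folio(page);
}
```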

If we want to use mm_get_huge_zero_page(), then this should work:

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 04f6c5548136..b6a3f52f48da 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -237,10 +237,17 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 {
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct page *page = ZERO_PAGE(0);
+       struct folio *folio = NULL;
        struct bio *bio;
 
        WARN_ON_ONCE(len > (BIO_MAX_VECS * PAGE_SIZE));
 
+       if (len > PAGE_SIZE) {
+               page = mm_get_huge_zero_page(current->mm);
+               if (!page)
+                       page = ZERO_PAGE(0);
+       }
+
        bio = iomap_dio_alloc_bio(iter, dio, BIO_MAX_VECS,
                                  REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
@@ -249,13 +256,15 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
        bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
        bio->bi_private = dio;
        bio->bi_end_io = iomap_dio_bio_end_io;
+       folio = page_folio(page);
 
        while (len) {
-               unsigned int io_len = min_t(unsigned int, len, PAGE_SIZE);
+               size_t size = min(len, folio_size(folio));
 
-               __bio_add_page(bio, page, io_len, 0);
-               len -= io_len;
+               bio_add_folio_nofail(bio, folio, size, 0);
+               len -= size;
        }
+
        iomap_dio_submit_bio(iter, dio, bio, pos);
 }
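
One behavioral note on this version (my observation, not from the
thread): if mm_get_huge_zero_page() fails, folio_size() of the
ZERO_PAGE(0) fallback is just PAGE_SIZE, so the loop degenerates into
the per-page loop of the original patch; if it succeeds, the huge zero
page is PMD-sized (2M on x86-64) and a single bio_add_folio_nofail()
call covers the whole range.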

Let me know if we should go with this, or keep the original patch and
add a ZERO_FOLIO_ORDER API that will not fail as a later enhancement.

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..04f6c5548136 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -239,14 +239,23 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 	struct page *page = ZERO_PAGE(0);
 	struct bio *bio;
 
-	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
+	WARN_ON_ONCE(len > (BIO_MAX_VECS * PAGE_SIZE));
+
+	bio = iomap_dio_alloc_bio(iter, dio, BIO_MAX_VECS,
+				  REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 				  GFP_KERNEL);
+
 	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	__bio_add_page(bio, page, len, 0);
+	while (len) {
+		unsigned int io_len = min_t(unsigned int, len, PAGE_SIZE);
+
+		__bio_add_page(bio, page, io_len, 0);
+		len -= io_len;
+	}
 	iomap_dio_submit_bio(iter, dio, bio, pos);
 }
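
Sizing note (my arithmetic, not part of the patch): with order-0 zero
pages each bvec carries at most PAGE_SIZE bytes, so one bio can pad at
most BIO_MAX_VECS * PAGE_SIZE bytes, i.e. 256 * 4096 = 1 MiB on a
4k-page system; the WARN_ON_ONCE() fires for any larger fs block.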