[06/16] iomap: support block size > page size for direct IO

Message ID: 20181107063127.3902-7-david@fromorbit.com (mailing list archive)
State: New, archived
Series: xfs: Block size > PAGE_SIZE support

Commit Message

Dave Chinner Nov. 7, 2018, 6:31 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

iomap_dio_rw() has all the infrastructure in place to support block
size > page size filesystems because it is essentially just
sub-block DIO. It needs help, however, with the sub-block zeroing
code (which needs multi-page IOs) and with page cache invalidation
over the block being written.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/iomap.c | 65 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 16 deletions(-)

Comments

Nikolay Borisov Nov. 8, 2018, 11:28 a.m. UTC | #1
On 7.11.18 8:31, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> iomap_dio_rw() has all the infrastructure in place to support block
> size > page size filesystems because it is essentially just
> sub-block DIO. It needs help, however, with the sub-block zeroing
> code (which needs multi-page IOs) and with page cache invalidation
> over the block being written.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/iomap.c | 65 ++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 49 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 16d16596b00f..8878b1f1f9c7 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -1548,21 +1548,34 @@ static void iomap_dio_bio_end_io(struct bio *bio)
>  	}
>  }
>  
> +/*
> + * With zeroing for block size larger than page size, the zeroing length can
> + * span multiple pages.
> + */
> +#define howmany(x, y) (((x)+((y)-1))/(y))

nit: This could be replaced by DIV_ROUND_UP
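
For reference, an untested sketch of that replacement; DIV_ROUND_UP comes
from <linux/kernel.h> and expands to the same arithmetic as the local
howmany() macro:

	/* DIV_ROUND_UP(n, d) is ((n) + (d) - 1) / (d) */
	int npages = DIV_ROUND_UP(len, PAGE_SIZE);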

>  static blk_qc_t
>  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>  		unsigned len)
>  {
>  	struct page *page = ZERO_PAGE(0);
>  	struct bio *bio;
> +	int npages = howmany(len, PAGE_SIZE);
> +
> +	WARN_ON_ONCE(npages > 16);
>  
> -	bio = bio_alloc(GFP_KERNEL, 1);
> +	bio = bio_alloc(GFP_KERNEL, npages);
>  	bio_set_dev(bio, iomap->bdev);
>  	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  	bio->bi_private = dio;
>  	bio->bi_end_io = iomap_dio_bio_end_io;
>  
> -	get_page(page);
> -	__bio_add_page(bio, page, len, 0);
> +	while (npages-- > 0) {
> +		unsigned plen = min_t(unsigned, PAGE_SIZE, len);
> +		get_page(page);
> +		__bio_add_page(bio, page, plen, 0);
> +		len -= plen;
> +	}
> +	WARN_ON(len != 0);
>  	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
>  
>  	atomic_inc(&dio->ref);
> @@ -1752,6 +1765,38 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
>  	}
>  }
>  
> +/*
> + * This is lifted almost straight from xfs_flush_unmap_range(). Need a generic
> + * version of the block size rounding for these purposes.
> + */
> +static int
> +iomap_flush_unmap_range(struct file *f, loff_t offset, loff_t len)
> +{
> +	struct inode *inode = file_inode(f);
> +	loff_t rounding, start, end;
> +	int ret;
> +
> +	rounding = max_t(loff_t, i_blocksize(inode), PAGE_SIZE);
> +	start = round_down(offset, rounding);
> +	end = round_up(offset + len, rounding) - 1;
> +
> +	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * Try to invalidate cache pages for the range we're direct
> +	 * writing.  If this invalidation fails, tough, the write will
> +	 * still work, but racing two incompatible write paths is a
> +	 * pretty crazy thing to do, so we don't support it 100%.
> +	 */
> +	ret = invalidate_inode_pages2_range(inode->i_mapping,
> +			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> +	if (ret)
> +		dio_warn_stale_pagecache(f);
> +	return 0;
> +}
> +
>  /*
>   * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
>   * is being issued as AIO or not.  This allows us to optimise pure data writes
> @@ -1829,22 +1874,10 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		flags |= IOMAP_NOWAIT;
>  	}
>  
> -	ret = filemap_write_and_wait_range(mapping, start, end);
> +	ret = iomap_flush_unmap_range(iocb->ki_filp, start, end);
>  	if (ret)
>  		goto out_free_dio;
>  
> -	/*
> -	 * Try to invalidate cache pages for the range we're direct
> -	 * writing.  If this invalidation fails, tough, the write will
> -	 * still work, but racing two incompatible write paths is a
> -	 * pretty crazy thing to do, so we don't support it 100%.
> -	 */
> -	ret = invalidate_inode_pages2_range(mapping,
> -			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -	if (ret)
> -		dio_warn_stale_pagecache(iocb->ki_filp);
> -	ret = 0;
> -
>  	if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
>  	    !inode->i_sb->s_dio_done_wq) {
>  		ret = sb_init_dio_done_wq(inode->i_sb);
>
Christoph Hellwig Nov. 9, 2018, 3:18 p.m. UTC | #2
>  static blk_qc_t
>  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>  		unsigned len)
>  {
>  	struct page *page = ZERO_PAGE(0);
>  	struct bio *bio;
> +	int npages = howmany(len, PAGE_SIZE);
> +
> +	WARN_ON_ONCE(npages > 16);

Where does this magic 16 come from?

> +	WARN_ON(len != 0);

WARN_ON_ONCE please to avoid making the log unreadable if it ever
triggers.

> +/*
> + * This is lifted almost straight from xfs_flush_unmap_range(). Need a generic
> + * version of the block size rounding for these purposes.
> + */

Can you just add a generic version of this in a separate patch and
also switch XFS over to it?

> +static int
> +iomap_flush_unmap_range(struct file *f, loff_t offset, loff_t len)

Can we please spell out file in the parameter name?
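
i.e., as a sketch of just the renamed signature, with the body otherwise
unchanged:

	static int
	iomap_flush_unmap_range(struct file *file, loff_t offset, loff_t len)
	{
		struct inode *inode = file_inode(file);
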
Dave Chinner Nov. 11, 2018, 1:12 a.m. UTC | #3
On Fri, Nov 09, 2018 at 07:18:19AM -0800, Christoph Hellwig wrote:
> >  static blk_qc_t
> >  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> >  		unsigned len)
> >  {
> >  	struct page *page = ZERO_PAGE(0);
> >  	struct bio *bio;
> > +	int npages = howmany(len, PAGE_SIZE);
> > +
> > +	WARN_ON_ONCE(npages > 16);
> 
> Where does this magic 16 come from?

4k page size, 64k block size. Debug code, essentially.
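
That is, howmany(65536, 4096) == 16. If the check were to stay, the limit
could be spelled out; an untested sketch, where IOMAP_ZERO_MAX_PAGES is a
made-up name and SZ_64K comes from <linux/sizes.h>:

	/* Largest supported fs block is 64k, i.e. 16 pages at 4k PAGE_SIZE. */
	#define IOMAP_ZERO_MAX_PAGES	howmany(SZ_64K, PAGE_SIZE)

	WARN_ON_ONCE(npages > IOMAP_ZERO_MAX_PAGES);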

> > +	WARN_ON(len != 0);
> 
> WARN_ON_ONCE please to avoid making the log unreadable if it ever
> triggers.

Debug code, too, so it'll get removed eventually.

> > +/*
> > + * This is lifted almost straight from xfs_flush_unmap_range(). Need a generic
> > + * version of the block size rounding for these purposes.
> > + */
> 
> Can you just add a generic version of this in a separate patch and
> also switch XFS over to it?

Well, they do different things. The xfs code must truncate the page
cache over the range (because we are removing the underlying
storage) while this just attempts to invalidate the pages and simply
says "have a nice day" if it fails. 

So they really are two different functions. The comment was written
when I expected that I was going to need to do lots more block size
rounding for invalidation in the generic code, but it seems that
may actually not be necessary...

Cheers,

Dave.

Patch

diff --git a/fs/iomap.c b/fs/iomap.c
index 16d16596b00f..8878b1f1f9c7 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1548,21 +1548,34 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	}
 }
 
+/*
+ * With zeroing for block size larger than page size, the zeroing length can
+ * span multiple pages.
+ */
+#define howmany(x, y) (((x)+((y)-1))/(y))
 static blk_qc_t
 iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 		unsigned len)
 {
 	struct page *page = ZERO_PAGE(0);
 	struct bio *bio;
+	int npages = howmany(len, PAGE_SIZE);
+
+	WARN_ON_ONCE(npages > 16);
 
-	bio = bio_alloc(GFP_KERNEL, 1);
+	bio = bio_alloc(GFP_KERNEL, npages);
 	bio_set_dev(bio, iomap->bdev);
 	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	get_page(page);
-	__bio_add_page(bio, page, len, 0);
+	while (npages-- > 0) {
+		unsigned plen = min_t(unsigned, PAGE_SIZE, len);
+		get_page(page);
+		__bio_add_page(bio, page, plen, 0);
+		len -= plen;
+	}
+	WARN_ON(len != 0);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
 
 	atomic_inc(&dio->ref);
@@ -1752,6 +1765,38 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	}
 }
 
+/*
+ * This is lifted almost straight from xfs_flush_unmap_range(). Need a generic
+ * version of the block size rounding for these purposes.
+ */
+static int
+iomap_flush_unmap_range(struct file *f, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(f);
+	loff_t rounding, start, end;
+	int ret;
+
+	rounding = max_t(loff_t, i_blocksize(inode), PAGE_SIZE);
+	start = round_down(offset, rounding);
+	end = round_up(offset + len, rounding) - 1;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	/*
+	 * Try to invalidate cache pages for the range we're direct
+	 * writing.  If this invalidation fails, tough, the write will
+	 * still work, but racing two incompatible write paths is a
+	 * pretty crazy thing to do, so we don't support it 100%.
+	 */
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	if (ret)
+		dio_warn_stale_pagecache(f);
+	return 0;
+}
+
 /*
  * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
  * is being issued as AIO or not.  This allows us to optimise pure data writes
@@ -1829,22 +1874,10 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		flags |= IOMAP_NOWAIT;
 	}
 
-	ret = filemap_write_and_wait_range(mapping, start, end);
+	ret = iomap_flush_unmap_range(iocb->ki_filp, start, end);
 	if (ret)
 		goto out_free_dio;
 
-	/*
-	 * Try to invalidate cache pages for the range we're direct
-	 * writing.  If this invalidation fails, tough, the write will
-	 * still work, but racing two incompatible write paths is a
-	 * pretty crazy thing to do, so we don't support it 100%.
-	 */
-	ret = invalidate_inode_pages2_range(mapping,
-			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-	if (ret)
-		dio_warn_stale_pagecache(iocb->ki_filp);
-	ret = 0;
-
 	if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
 	    !inode->i_sb->s_dio_done_wq) {
 		ret = sb_init_dio_done_wq(inode->i_sb);