Message ID: 20201014030357.21898-10-willy@infradead.org (mailing list archive)
State: Deferred, archived
Series: Transparent Huge Page support for XFS
On Wed, Oct 14, 2020 at 04:03:52AM +0100, Matthew Wilcox (Oracle) wrote:
> Pass (up to) the remaining length of the extent to iomap_write_begin()
> and have it return the number of bytes that will fit in the page.
> That lets us copy more bytes per call to iomap_write_begin() if the page
> cache has already allocated a THP (and will in future allow us to pass
> a hint to the page cache that it should try to allocate a larger page
> if there are none in the cache).
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  fs/iomap/buffered-io.c | 61 +++++++++++++++++++++++-------------------
>  1 file changed, 33 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 4ef02afaedc5..397795db3ce5 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -616,14 +616,14 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
>  	return submit_bio_wait(&bio);
>  }
>  
> -static int
> -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
> -		struct page *page, struct iomap *srcmap)
> +static ssize_t __iomap_write_begin(struct inode *inode, loff_t pos,
> +		size_t len, int flags, struct page *page, struct iomap *srcmap)
>  {
>  	loff_t block_size = i_blocksize(inode);
>  	loff_t block_start = pos & ~(block_size - 1);
>  	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> -	unsigned from = offset_in_page(pos), to = from + len;
> +	size_t from = offset_in_thp(page, pos);
> +	size_t to = from + len;
>  	size_t poff, plen;
>  
>  	if (PageUptodate(page))
> @@ -658,12 +658,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
>  	return 0;
>  }
>  
> -static int
> -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> -		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> +static ssize_t iomap_write_begin(struct inode *inode, loff_t pos, loff_t len,
> +		unsigned flags, struct page **pagep, struct iomap *iomap,

loff_t len?  You've been using size_t (ssize_t?) for length elsewhere,
can't return more than ssize_t, and afaik MAX_RW_COUNT will never go
larger than 2GB so I'm confused about types here...?

Mostly because my brain has been trained to think that if it sees
"size_t len" as an input parameter and a ssize_t return value, then
probably the return value is however much of @len we managed to process.

> +		struct iomap *srcmap)
>  {
>  	const struct iomap_page_ops *page_ops = iomap->page_ops;
>  	struct page *page;
> +	size_t offset;
>  	int status = 0;
>  
>  	BUG_ON(pos + len > iomap->offset + iomap->length);
> @@ -674,6 +675,8 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  		return -EINTR;
>  
>  	if (page_ops && page_ops->page_prepare) {
> +		if (len > UINT_MAX)
> +			len = UINT_MAX;

I'm not especially familiar with page_prepare (since it's a gfs2 thing);
why do you clamp len to UINT_MAX here?

--D

>  		status = page_ops->page_prepare(inode, pos, len, iomap);
>  		if (status)
>  			return status;
> @@ -685,6 +688,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  		status = -ENOMEM;
>  		goto out_no_page;
>  	}
> +	page = thp_head(page);
> +	offset = offset_in_thp(page, pos);
> +	if (len > thp_size(page) - offset)
> +		len = thp_size(page) - offset;
>  
>  	if (srcmap->type == IOMAP_INLINE)
>  		iomap_read_inline_data(inode, page, srcmap);
> @@ -694,11 +701,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  		status = __iomap_write_begin(inode, pos, len, flags, page,
>  				srcmap);
>  
> -	if (unlikely(status))
> +	if (status < 0)
>  		goto out_unlock;
>  
>  	*pagep = page;
> -	return 0;
> +	return len;
>  
>  out_unlock:
>  	unlock_page(page);
> @@ -854,8 +861,10 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  
>  		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
>  				srcmap);
> -		if (unlikely(status))
> +		if (status < 0)
>  			break;
> +		/* We may be partway through a THP */
> +		offset = offset_in_thp(page, pos);
>  
>  		if (mapping_writably_mapped(inode->i_mapping))
>  			flush_dcache_page(page);
> @@ -915,7 +924,6 @@ static loff_t
>  iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  		struct iomap *iomap, struct iomap *srcmap)
>  {
> -	long status = 0;
>  	loff_t written = 0;
>  
>  	/* don't bother with blocks that are not shared to start with */
> @@ -926,25 +934,24 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  		return length;
>  
>  	do {
> -		unsigned long offset = offset_in_page(pos);
> -		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
>  		struct page *page;
> +		ssize_t bytes;
>  
> -		status = iomap_write_begin(inode, pos, bytes,
> +		bytes = iomap_write_begin(inode, pos, length,
>  				IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
> -		if (unlikely(status))
> -			return status;
> +		if (bytes < 0)
> +			return bytes;
>  
> -		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
> +		bytes = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
>  				srcmap);
> -		if (WARN_ON_ONCE(status == 0))
> +		if (WARN_ON_ONCE(bytes == 0))
>  			return -EIO;
>  
>  		cond_resched();
>  
> -		pos += status;
> -		written += status;
> -		length -= status;
> +		pos += bytes;
> +		written += bytes;
> +		length -= bytes;
>  
>  		balance_dirty_pages_ratelimited(inode->i_mapping);
>  	} while (length);
> @@ -975,15 +982,13 @@ static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
>  		struct iomap *iomap, struct iomap *srcmap)
>  {
>  	struct page *page;
> -	int status;
> -	unsigned offset = offset_in_page(pos);
> -	unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
> +	ssize_t bytes;
>  
> -	status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
> -	if (status)
> -		return status;
> +	bytes = iomap_write_begin(inode, pos, length, 0, &page, iomap, srcmap);
> +	if (bytes < 0)
> +		return bytes;
>  
> -	zero_user(page, offset, bytes);
> +	zero_user(page, offset_in_thp(page, pos), bytes);
>  	mark_page_accessed(page);
>  
>  	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
> -- 
> 2.28.0
> 
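The convention being invoked here is the usual kernel one, sketched
below with a made-up helper (process_chunk() is not an iomap function;
this is an illustration of the type convention, not real code):

static ssize_t process_chunk(struct inode *inode, loff_t pos, size_t len)
{
	/*
	 * Failure is a negative errno; success returns how much of @len
	 * was actually processed.  That is why the return type must be
	 * the signed ssize_t, and why callers cap I/O sizes (e.g. at
	 * MAX_RW_COUNT) so that a length always fits in the return value.
	 */
	if (len == 0)
		return -EINVAL;
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	/* ... do the work on [pos, pos + len) ... */
	return len;
}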
On Wed, Oct 14, 2020 at 09:47:44AM -0700, Darrick J. Wong wrote:
> > -static int
> > -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> > -		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> > +static ssize_t iomap_write_begin(struct inode *inode, loff_t pos, loff_t len,
> > +		unsigned flags, struct page **pagep, struct iomap *iomap,
> 
> loff_t len?  You've been using size_t (ssize_t?) for length elsewhere,
> can't return more than ssize_t, and afaik MAX_RW_COUNT will never go
> larger than 2GB so I'm confused about types here...?

Yes, you're right.  This one should be size_t.

> >  	if (page_ops && page_ops->page_prepare) {
> > +		if (len > UINT_MAX)
> > +			len = UINT_MAX;
> 
> I'm not especially familiar with page_prepare (since it's a gfs2 thing);
> why do you clamp len to UINT_MAX here?

The len parameter of ->page_prepare is an unsigned int.  I don't want
a 1<<32+1 byte I/O to be seen as a 1 byte I/O.  We could upgrade the
parameter to size_t from unsigned int?
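The narrowing described above is ordinary C integer conversion, and is
easy to demonstrate standalone (a userspace illustration assuming a
64-bit size_t; callback() stands in for a callee whose length parameter
is unsigned int, like the old ->page_prepare prototype):

#include <stdio.h>

/* Stand-in for a callback whose length parameter is unsigned int. */
static void callback(unsigned int len)
{
	printf("callback sees a %u byte I/O\n", len);
}

int main(void)
{
	size_t len = (1ULL << 32) + 1;	/* a 4GiB+1 byte I/O */

	/*
	 * The implicit conversion keeps only the low 32 bits, so the
	 * callback sees a 1 byte I/O -- hence the choice between
	 * clamping @len and widening the parameter to size_t.
	 */
	callback(len);
	return 0;
}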
On Wed, Oct 14, 2020 at 06:41:58PM +0100, Matthew Wilcox wrote:
> On Wed, Oct 14, 2020 at 09:47:44AM -0700, Darrick J. Wong wrote:
> > > -static int
> > > -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> > > -		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> > > +static ssize_t iomap_write_begin(struct inode *inode, loff_t pos, loff_t len,
> > > +		unsigned flags, struct page **pagep, struct iomap *iomap,
> > 
> > loff_t len?  You've been using size_t (ssize_t?) for length elsewhere,
> > can't return more than ssize_t, and afaik MAX_RW_COUNT will never go
> > larger than 2GB so I'm confused about types here...?
> 
> Yes, you're right.  This one should be size_t.
> 
> > >  	if (page_ops && page_ops->page_prepare) {
> > > +		if (len > UINT_MAX)
> > > +			len = UINT_MAX;
> > 
> > I'm not especially familiar with page_prepare (since it's a gfs2 thing);
> > why do you clamp len to UINT_MAX here?
> 
> The len parameter of ->page_prepare is an unsigned int.  I don't want
> a 1<<32+1 byte I/O to be seen as a 1 byte I/O.  We could upgrade the
> parameter to size_t from unsigned int?

OK, here's what I have -- two patches (copy-and-wasted):

iomap: Prepare page ops for larger I/Os

Just adjust the types for the page_prepare() and page_done() callbacks
to size_t from unsigned int.  While I don't see individual pages that
large in our near future, it's convenient to be able to pass in larger
lengths and has no cost.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0f69fbd4af66..9a94d7e58199 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1028,7 +1028,7 @@ static void gfs2_write_unlock(struct inode *inode)
 }
 
 static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
-				   unsigned len, struct iomap *iomap)
+				   size_t len, struct iomap *iomap)
 {
 	unsigned int blockmask = i_blocksize(inode) - 1;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1039,7 +1039,7 @@ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
 }
 
 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
-				 unsigned copied, struct page *page,
+				 size_t copied, struct page *page,
 				 struct iomap *iomap)
 {
 	struct gfs2_trans *tr = current->journal_info;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d1d3c3469e9..d3b06bffd996 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -106,9 +106,9 @@ iomap_sector(struct iomap *iomap, loff_t pos)
  * associated page could not be obtained.
  */
 struct iomap_page_ops {
-	int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len,
+	int (*page_prepare)(struct inode *inode, loff_t pos, size_t len,
 			struct iomap *iomap);
-	void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
+	void (*page_done)(struct inode *inode, loff_t pos, size_t copied,
 			struct page *page, struct iomap *iomap);
 };

The other is an amendment of this:

-static int
-iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
+static ssize_t iomap_write_begin(struct inode *inode, loff_t pos, size_t len,
+		unsigned flags, struct page **pagep, struct iomap *iomap,
+		struct iomap *srcmap)

and I drop the clamp to UINT_MAX.

I also added:

-	unsigned long offset;	/* Offset into pagecache page */
-	unsigned long bytes;	/* Bytes to write to page */
+	size_t offset;		/* Offset into pagecache page */
+	size_t bytes;		/* Bytes to write to page */

which won't change anything, but does fit the general pattern of using
size_t for a byte count of in-memory things.  It matches
iomap_unshare_actor(), for example.
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 4ef02afaedc5..397795db3ce5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -616,14 +616,14 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
 	return submit_bio_wait(&bio);
 }
 
-static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
-		struct page *page, struct iomap *srcmap)
+static ssize_t __iomap_write_begin(struct inode *inode, loff_t pos,
+		size_t len, int flags, struct page *page, struct iomap *srcmap)
 {
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned from = offset_in_page(pos), to = from + len;
+	size_t from = offset_in_thp(page, pos);
+	size_t to = from + len;
 	size_t poff, plen;
 
 	if (PageUptodate(page))
@@ -658,12 +658,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
 	return 0;
 }
 
-static int
-iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
+static ssize_t iomap_write_begin(struct inode *inode, loff_t pos, loff_t len,
+		unsigned flags, struct page **pagep, struct iomap *iomap,
+		struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
 	struct page *page;
+	size_t offset;
 	int status = 0;
 
 	BUG_ON(pos + len > iomap->offset + iomap->length);
@@ -674,6 +675,8 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		return -EINTR;
 
 	if (page_ops && page_ops->page_prepare) {
+		if (len > UINT_MAX)
+			len = UINT_MAX;
 		status = page_ops->page_prepare(inode, pos, len, iomap);
 		if (status)
 			return status;
@@ -685,6 +688,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		status = -ENOMEM;
 		goto out_no_page;
 	}
+	page = thp_head(page);
+	offset = offset_in_thp(page, pos);
+	if (len > thp_size(page) - offset)
+		len = thp_size(page) - offset;
 
 	if (srcmap->type == IOMAP_INLINE)
 		iomap_read_inline_data(inode, page, srcmap);
@@ -694,11 +701,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		status = __iomap_write_begin(inode, pos, len, flags, page,
 				srcmap);
 
-	if (unlikely(status))
+	if (status < 0)
 		goto out_unlock;
 
 	*pagep = page;
-	return 0;
+	return len;
 
 out_unlock:
 	unlock_page(page);
@@ -854,8 +861,10 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
 				srcmap);
-		if (unlikely(status))
+		if (status < 0)
 			break;
+		/* We may be partway through a THP */
+		offset = offset_in_thp(page, pos);
 
 		if (mapping_writably_mapped(inode->i_mapping))
 			flush_dcache_page(page);
@@ -915,7 +924,6 @@ static loff_t
 iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap, struct iomap *srcmap)
 {
-	long status = 0;
 	loff_t written = 0;
 
 	/* don't bother with blocks that are not shared to start with */
@@ -926,25 +934,24 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		return length;
 
 	do {
-		unsigned long offset = offset_in_page(pos);
-		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
 		struct page *page;
+		ssize_t bytes;
 
-		status = iomap_write_begin(inode, pos, bytes,
+		bytes = iomap_write_begin(inode, pos, length,
 				IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
-		if (unlikely(status))
-			return status;
+		if (bytes < 0)
+			return bytes;
 
-		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
+		bytes = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
 				srcmap);
-		if (WARN_ON_ONCE(status == 0))
+		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
 
 		cond_resched();
 
-		pos += status;
-		written += status;
-		length -= status;
+		pos += bytes;
+		written += bytes;
+		length -= bytes;
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (length);
@@ -975,15 +982,13 @@ static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
 		struct iomap *iomap, struct iomap *srcmap)
 {
 	struct page *page;
-	int status;
-	unsigned offset = offset_in_page(pos);
-	unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
+	ssize_t bytes;
 
-	status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
-	if (status)
-		return status;
+	bytes = iomap_write_begin(inode, pos, length, 0, &page, iomap, srcmap);
+	if (bytes < 0)
+		return bytes;
 
-	zero_user(page, offset, bytes);
+	zero_user(page, offset_in_thp(page, pos), bytes);
 	mark_page_accessed(page);
 
 	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
Pass (up to) the remaining length of the extent to iomap_write_begin()
and have it return the number of bytes that will fit in the page.
That lets us copy more bytes per call to iomap_write_begin() if the page
cache has already allocated a THP (and will in future allow us to pass
a hint to the page cache that it should try to allocate a larger page
if there are none in the cache).

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/iomap/buffered-io.c | 61 +++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 28 deletions(-)
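Taken together, the caller-side shape this patch creates looks roughly
like the following (a condensed sketch of the iomap_write_actor() hunks
above, not the final code; the copy-in, dirty-page accounting and
short-write handling are elided, and write_loop_sketch() is a made-up
name):

static loff_t write_loop_sketch(struct inode *inode, loff_t pos,
		loff_t length, struct iomap *iomap, struct iomap *srcmap)
{
	loff_t written = 0;

	while (length > 0) {
		struct page *page;
		size_t offset;
		ssize_t bytes;

		/* Ask for everything left in the extent; get back
		 * however much fits in the (possibly huge) page. */
		bytes = iomap_write_begin(inode, pos, length, 0, &page,
				iomap, srcmap);
		if (bytes < 0)
			return written ? written : bytes;

		/* We may be partway through a THP */
		offset = offset_in_thp(page, pos);
		/* ... copy @bytes bytes into @page at @offset ... */

		bytes = iomap_write_end(inode, pos, bytes, bytes, page,
				iomap, srcmap);
		pos += bytes;
		written += bytes;
		length -= bytes;
	}
	return written;
}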