| Message ID | 20250305140532.158563-11-hch@lst.de (mailing list archive) |
|---|---|
| State | New |
| Series | [01/12] xfs: unmapped buffer item size straddling mismatch |
On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space. Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good now,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D
On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space. Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....
> @@ -1500,29 +1373,43 @@ static void
>  xfs_buf_submit_bio(
>          struct xfs_buf          *bp)
>  {
> -        unsigned int            size = BBTOB(bp->b_length);
> -        unsigned int            map = 0, p;
> +        unsigned int            map = 0;
>          struct blk_plug         plug;
>          struct bio              *bio;
>
> -        bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> -                        xfs_buf_bio_op(bp), GFP_NOIO);
> -        bio->bi_private = bp;
> -        bio->bi_end_io = xfs_buf_bio_end_io;
> +        if (is_vmalloc_addr(bp->b_addr)) {
> +                unsigned int    size = BBTOB(bp->b_length);
> +                unsigned int    alloc_size = roundup(size, PAGE_SIZE);
> +                void            *data = bp->b_addr;
>
> -        if (bp->b_page_count == 1) {
> -                __bio_add_page(bio, virt_to_page(bp->b_addr), size,
> -                                offset_in_page(bp->b_addr));
> -        } else {
> -                for (p = 0; p < bp->b_page_count; p++)
> -                        __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> -                bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +                bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> +                                xfs_buf_bio_op(bp), GFP_NOIO);
> +
> +                do {
> +                        unsigned int    len = min(size, PAGE_SIZE);
>
> -                if (is_vmalloc_addr(bp->b_addr))
> -                        flush_kernel_vmap_range(bp->b_addr,
> -                                        xfs_buf_vmap_len(bp));
> +                        ASSERT(offset_in_page(data) == 0);
> +                        __bio_add_page(bio, vmalloc_to_page(data), len, 0);
> +                        data += len;
> +                        size -= len;
> +                } while (size);
> +
> +                flush_kernel_vmap_range(bp->b_addr, alloc_size);
> +        } else {
> +                /*
> +                 * Single folio or slab allocation. Must be contiguous and thus
> +                 * only a single bvec is needed.
> +                 */
> +                bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> +                                GFP_NOIO);
> +                __bio_add_page(bio, virt_to_page(bp->b_addr),
> +                                BBTOB(bp->b_length),
> +                                offset_in_page(bp->b_addr));
>          }

How does offset_in_page() work with a high order folio? It can only
return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
be:

        folio = kmem_to_folio(bp->b_addr);

        bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
                        offset_in_folio(folio, bp->b_addr));

-Dave.
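The distinction Dave is drawing can be made concrete with a small illustrative sketch. This is not part of the patch; the function name is invented here, and it assumes a high-order (order >= 1) folio so that an address one page into the folio is still inside it.

        #include <linux/mm.h>
        #include <linux/printk.h>

        /* Illustrative only: how the two helpers differ inside a large folio. */
        static void example_offsets(struct folio *folio)
        {
                void *addr = folio_address(folio) + PAGE_SIZE + 64;

                /* prints 64: the page index within the folio is masked away */
                pr_info("offset_in_page:  %lu\n", offset_in_page(addr));

                /* prints PAGE_SIZE + 64: the true byte offset into the folio */
                pr_info("offset_in_folio: %lu\n", offset_in_folio(folio, addr));
        }

virt_to_page() on the same address likewise returns the interior (tail) page rather than the folio's head page, which is the page/folio type confusion the rest of the thread is about.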
On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> > The fallback buffer allocation path currently open codes a suboptimal
> > version of vmalloc to allocate pages that are then mapped into
> > vmalloc space. Switch to using vmalloc instead, which uses all the
> > optimizations in the common vmalloc code, and removes the need to
> > track the backing pages in the xfs_buf structure.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> .....
> > +        } else {
> > +                /*
> > +                 * Single folio or slab allocation. Must be contiguous and thus
> > +                 * only a single bvec is needed.
> > +                 */
> > +                bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> > +                                GFP_NOIO);
> > +                __bio_add_page(bio, virt_to_page(bp->b_addr),
> > +                                BBTOB(bp->b_length),
> > +                                offset_in_page(bp->b_addr));
> >          }
>
> How does offset_in_page() work with a high order folio? It can only
> return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
> be:
>
>         folio = kmem_to_folio(bp->b_addr);
>
>         bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
>                         offset_in_folio(folio, bp->b_addr));

I think offset_in_folio() returns 0 in the !kmem && !vmalloc case
because we allocate the folio and set b_addr to folio_address(folio);
and we never call the kmem alloc code for sizes greater than PAGE_SIZE.

--D

> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
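The invariant Darrick is relying on can be spelled out as an assert. This is a sketch only, not something proposed in the thread, and it assumes the post-patch xfs_buf layout, where a !kmem, !vmalloc buffer is backed by a single folio whose base address was stored in b_addr.

        /* Sketch only: the !kmem && !vmalloc case is always folio-aligned. */
        if (!is_vmalloc_addr(bp->b_addr) && !(bp->b_flags & _XBF_KMEM)) {
                struct folio    *folio = virt_to_folio(bp->b_addr);

                /* b_addr was set to folio_address(folio) at allocation time */
                ASSERT(offset_in_folio(folio, bp->b_addr) == 0);
        }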
On Wed, Mar 05, 2025 at 02:54:07PM -0800, Darrick J. Wong wrote:
> On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > How does offset_in_page() work with a high order folio? It can only
> > return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
> > be:
> >
> >         folio = kmem_to_folio(bp->b_addr);
> >
> >         bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> >                         offset_in_folio(folio, bp->b_addr));
>
> I think offset_in_folio() returns 0 in the !kmem && !vmalloc case
> because we allocate the folio and set b_addr to folio_address(folio);
> and we never call the kmem alloc code for sizes greater than PAGE_SIZE.

Yes, but that misses my point: this is a folio conversion, whilst
this treats a folio as a page. We're trying to get rid of this sort
of page/folio type confusion (i.e. stuff like "does offset_in_page()
work correctly on large folios"). New code shouldn't be adding new
issues like these, especially when there are existing folio-based
APIs that are guaranteed to work correctly and won't need fixing in
future before pages and folios can be fully separated.

-Dave.
On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > +                __bio_add_page(bio, virt_to_page(bp->b_addr),
> > +                                BBTOB(bp->b_length),
> > +                                offset_in_page(bp->b_addr));
> >          }
>
> How does offset_in_page() work with a high order folio? It can only
> return a value between 0 and (PAGE_SIZE - 1).

Yes.

> i.e. shouldn't this
> be:
>
>         folio = kmem_to_folio(bp->b_addr);
>
>         bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
>                         offset_in_folio(folio, bp->b_addr));

That is also correct, but does a lot more work underneath as the
bio_vecs work in terms of pages. In the long run this should use
a bio_add_virt that hides all that (and the bio_vecs should move to
store physical addresses). For now the above is the simplest and
most efficient version.
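For illustration, a helper along the lines of the bio_add_virt idea hch mentions might look like the sketch below. No such interface exists in the block layer at the time of this thread, the name and signature are invented here, and the eventual API may differ; the point is only that the virt-to-page translation would live in one place instead of being open coded at call sites.

        /*
         * Hypothetical sketch only: centralise the translation from a kernel
         * virtual address of physically contiguous (non-vmalloc) memory to
         * the page/offset pair that the bvec machinery currently wants.
         */
        static inline void bio_add_virt_nofail(struct bio *bio, void *vaddr,
                        unsigned int len)
        {
                __bio_add_page(bio, virt_to_page(vaddr), len,
                                offset_in_page(vaddr));
        }

With something like that in place, the XFS branch quoted above would collapse to a single bio_add_virt_nofail(bio, bp->b_addr, BBTOB(bp->b_length)) call.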
On Thu, Mar 06, 2025 at 10:28:30AM +1100, Dave Chinner wrote:
> Yes, but that misses my point: this is a folio conversion, whilst
> this treats a folio as a page.

The code covers both folios and slab. willy has explicitly NAKed
using folio helpers for slab. So if we'd really want to use folio
helpers here we'd need to special case them vs slab. That's why I
keep using the existing API for now until the block layer grows
better helpers, on which I'm waiting.

And this is not new code, it's relatively minor refactoring of the
existing code.
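To illustrate the special-casing Christoph says he would rather avoid, the branch would have to distinguish slab-backed buffers from folio-backed ones, roughly as in the sketch below. This is illustrative only, written against the post-patch _XBF_KMEM flag; nobody in the thread is proposing to merge it.

        } else if (bp->b_flags & _XBF_KMEM) {
                /* slab memory: folio helpers must not be used here */
                __bio_add_page(bio, virt_to_page(bp->b_addr),
                                BBTOB(bp->b_length),
                                offset_in_page(bp->b_addr));
        } else {
                /* single high-order folio allocated by the buffer cache */
                struct folio    *folio = virt_to_folio(bp->b_addr);

                bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
                                offset_in_folio(folio, bp->b_addr));
        }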
On Thu, Mar 06, 2025 at 12:35:36AM +0100, Christoph Hellwig wrote:
> On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > i.e. shouldn't this
> > be:
> >
> >         folio = kmem_to_folio(bp->b_addr);
> >
> >         bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> >                         offset_in_folio(folio, bp->b_addr));
>
> That is also correct, but does a lot more work underneath as the
> bio_vecs work in terms of pages. In the long run this should use
> a bio_add_virt that hides all that (and the bio_vecs should move to
> store physical addresses). For now the above is the simplest and
> most efficient version.

Can you add a comment that the code is done this way because
of the mismatch between block layer API and mm object (folio/slab)
handling APIs? Otherwise this code is going to look wrong every time
I look at in future....

-Dave.
On Thu, Mar 06, 2025 at 11:57:29AM +1100, Dave Chinner wrote:
> Can you add a comment that the code is done this way because
> of the mismatch between block layer API and mm object (folio/slab)
> handling APIs? Otherwise this code is going to look wrong every time
> I look at in future....

Sure. Although I hope we'll have the bio_add_virt helper by the
following merge window.
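The comment being asked for might read something like the sketch below; this is illustrative wording only, and whatever was eventually committed may differ.

        } else {
                /*
                 * Single folio or slab allocation. Must be contiguous and
                 * thus only a single bvec is needed.
                 *
                 * This uses the plain page interface on purpose: bvecs are
                 * page based, and b_addr may point into slab memory, where
                 * folio helpers must not be used. Revisit once the block
                 * layer grows a bio_add_virt() style helper.
                 */
                bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
                                GFP_NOIO);
                __bio_add_page(bio, virt_to_page(bp->b_addr),
                                BBTOB(bp->b_length),
                                offset_in_page(bp->b_addr));
        }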
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 2b4b8c104b0c..f28ca5cb5bd8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
         return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-        struct xfs_buf  *bp)
-{
-        return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -190,29 +183,6 @@ _xfs_buf_alloc(
         return 0;
 }
 
-static void
-xfs_buf_free_pages(
-        struct xfs_buf  *bp)
-{
-        uint            i;
-
-        ASSERT(bp->b_flags & _XBF_PAGES);
-
-        if (is_vmalloc_addr(bp->b_addr))
-                vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-        for (i = 0; i < bp->b_page_count; i++) {
-                if (bp->b_pages[i])
-                        folio_put(page_folio(bp->b_pages[i]));
-        }
-        mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
-
-        if (bp->b_pages != bp->b_page_array)
-                kfree(bp->b_pages);
-        bp->b_pages = NULL;
-        bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
         struct callback_head    *cb)
@@ -227,16 +197,23 @@ static void
 xfs_buf_free(
         struct xfs_buf          *bp)
 {
+        unsigned int            size = BBTOB(bp->b_length);
+
         trace_xfs_buf_free(bp, _RET_IP_);
 
         ASSERT(list_empty(&bp->b_lru));
 
+        if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+                mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
         if (xfs_buftarg_is_mem(bp->b_target))
                 xmbuf_unmap_page(bp);
-        else if (bp->b_flags & _XBF_PAGES)
-                xfs_buf_free_pages(bp);
+        else if (is_vmalloc_addr(bp->b_addr))
+                vfree(bp->b_addr);
         else if (bp->b_flags & _XBF_KMEM)
                 kfree(bp->b_addr);
+        else
+                folio_put(virt_to_folio(bp->b_addr));
 
         call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
                 bp->b_addr = NULL;
                 return -ENOMEM;
         }
-        bp->b_pages = bp->b_page_array;
-        bp->b_pages[0] = kmem_to_page(bp->b_addr);
-        bp->b_page_count = 1;
         bp->b_flags |= _XBF_KMEM;
         return 0;
 }
@@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
         size_t          size = BBTOB(bp->b_length);
         gfp_t           gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
         struct folio    *folio;
-        long            filled = 0;
 
         if (xfs_buftarg_is_mem(bp->b_target))
                 return xmbuf_map_page(bp);
@@ -347,98 +320,18 @@ xfs_buf_alloc_backing_mem(
                 goto fallback;
         }
         bp->b_addr = folio_address(folio);
-        bp->b_page_array[0] = &folio->page;
-        bp->b_pages = bp->b_page_array;
-        bp->b_page_count = 1;
-        bp->b_flags |= _XBF_PAGES;
         return 0;
 
 fallback:
-        /* Fall back to allocating an array of single page folios. */
-        bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-        if (bp->b_page_count <= XB_PAGES) {
-                bp->b_pages = bp->b_page_array;
-        } else {
-                bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-                                        gfp_mask);
-                if (!bp->b_pages)
-                        return -ENOMEM;
-        }
-        bp->b_flags |= _XBF_PAGES;
-
-        /*
-         * Bulk filling of pages can take multiple calls. Not filling the entire
-         * array is not an allocation failure, so don't back off if we get at
-         * least one extra page.
-         */
         for (;;) {
-                long    last = filled;
-
-                filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-                                          bp->b_pages);
-                if (filled == bp->b_page_count) {
-                        XFS_STATS_INC(bp->b_mount, xb_page_found);
+                bp->b_addr = __vmalloc(size, gfp_mask);
+                if (bp->b_addr)
                         break;
-                }
-
-                if (filled != last)
-                        continue;
-
-                if (flags & XBF_READ_AHEAD) {
-                        xfs_buf_free_pages(bp);
+                if (flags & XBF_READ_AHEAD)
                         return -ENOMEM;
-                }
-
                 XFS_STATS_INC(bp->b_mount, xb_page_retries);
                 memalloc_retry_wait(gfp_mask);
         }
-        return 0;
-}
-
-/*
- * Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-        struct xfs_buf          *bp,
-        xfs_buf_flags_t         flags)
-{
-        ASSERT(bp->b_flags & _XBF_PAGES);
-        if (bp->b_page_count == 1) {
-                /* A single page buffer is always mappable */
-                bp->b_addr = page_address(bp->b_pages[0]);
-        } else {
-                int retried = 0;
-                unsigned nofs_flag;
-
-                /*
-                 * vm_map_ram() will allocate auxiliary structures (e.g.
-                 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-                 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-                 * from the same call site that can be run from both above and
-                 * below memory reclaim causes lockdep false positives. Hence we
-                 * always need to force this allocation to nofs context because
-                 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-                 * prevent false positive lockdep reports.
-                 *
-                 * XXX(dgc): I think dquot reclaim is the only place we can get
-                 * to this function from memory reclaim context now. If we fix
-                 * that like we've fixed inode reclaim to avoid writeback from
-                 * reclaim, this nofs wrapping can go away.
-                 */
-                nofs_flag = memalloc_nofs_save();
-                do {
-                        bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                                -1);
-                        if (bp->b_addr)
-                                break;
-                        vm_unmap_aliases();
-                } while (retried++ <= 1);
-                memalloc_nofs_restore(nofs_flag);
-
-                if (!bp->b_addr)
-                        return -ENOMEM;
-        }
 
         return 0;
 }
@@ -558,7 +451,7 @@ xfs_buf_find_lock(
                         return -ENOENT;
                 }
                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-                bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+                bp->b_flags &= _XBF_KMEM;
                 bp->b_ops = NULL;
         }
         return 0;
@@ -744,18 +637,6 @@ xfs_buf_get_map(
                 xfs_perag_put(pag);
         }
 
-        /* We do not hold a perag reference anymore. */
-        if (!bp->b_addr) {
-                error = _xfs_buf_map_pages(bp, flags);
-                if (unlikely(error)) {
-                        xfs_warn_ratelimited(btp->bt_mount,
-                                "%s: failed to map %u pages", __func__,
-                                bp->b_page_count);
-                        xfs_buf_relse(bp);
-                        return error;
-                }
-        }
-
         /*
          * Clear b_error if this is a lookup from a caller that doesn't expect
          * valid data to be found in the buffer.
@@ -998,14 +879,6 @@ xfs_buf_get_uncached(
         if (error)
                 goto fail_free_buf;
 
-        if (!bp->b_addr)
-                error = _xfs_buf_map_pages(bp, 0);
-        if (unlikely(error)) {
-                xfs_warn(target->bt_mount,
-                        "%s: failed to map pages", __func__);
-                goto fail_free_buf;
-        }
-
         trace_xfs_buf_get_uncached(bp, _RET_IP_);
         *bpp = bp;
         return 0;
@@ -1339,7 +1212,7 @@ __xfs_buf_ioend(
         if (bp->b_flags & XBF_READ) {
                 if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
                         invalidate_kernel_vmap_range(bp->b_addr,
-                                        xfs_buf_vmap_len(bp));
+                                        roundup(BBTOB(bp->b_length), PAGE_SIZE));
                 if (!bp->b_error && bp->b_ops)
                         bp->b_ops->verify_read(bp);
                 if (!bp->b_error)
@@ -1500,29 +1373,43 @@ static void
 xfs_buf_submit_bio(
         struct xfs_buf          *bp)
 {
-        unsigned int            size = BBTOB(bp->b_length);
-        unsigned int            map = 0, p;
+        unsigned int            map = 0;
         struct blk_plug         plug;
         struct bio              *bio;
 
-        bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-                        xfs_buf_bio_op(bp), GFP_NOIO);
-        bio->bi_private = bp;
-        bio->bi_end_io = xfs_buf_bio_end_io;
+        if (is_vmalloc_addr(bp->b_addr)) {
+                unsigned int    size = BBTOB(bp->b_length);
+                unsigned int    alloc_size = roundup(size, PAGE_SIZE);
+                void            *data = bp->b_addr;
 
-        if (bp->b_page_count == 1) {
-                __bio_add_page(bio, virt_to_page(bp->b_addr), size,
-                                offset_in_page(bp->b_addr));
-        } else {
-                for (p = 0; p < bp->b_page_count; p++)
-                        __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-                bio->bi_iter.bi_size = size; /* limit to the actual size used */
+                bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+                                xfs_buf_bio_op(bp), GFP_NOIO);
+
+                do {
+                        unsigned int    len = min(size, PAGE_SIZE);
 
-                if (is_vmalloc_addr(bp->b_addr))
-                        flush_kernel_vmap_range(bp->b_addr,
-                                        xfs_buf_vmap_len(bp));
+                        ASSERT(offset_in_page(data) == 0);
+                        __bio_add_page(bio, vmalloc_to_page(data), len, 0);
+                        data += len;
+                        size -= len;
+                } while (size);
+
+                flush_kernel_vmap_range(bp->b_addr, alloc_size);
+        } else {
+                /*
+                 * Single folio or slab allocation. Must be contiguous and thus
+                 * only a single bvec is needed.
+                 */
+                bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+                                GFP_NOIO);
+                __bio_add_page(bio, virt_to_page(bp->b_addr),
+                                BBTOB(bp->b_length),
+                                offset_in_page(bp->b_addr));
         }
 
+        bio->bi_private = bp;
+        bio->bi_end_io = xfs_buf_bio_end_io;
+
         /*
          * If there is more than one map segment, split out a new bio for each
          * map except of the last one. The last map is handled by the
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8db522f19b0c..db43bdc17f55 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY        (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES              (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM               (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q           (1u << 22)/* buffer on a delwri queue */
 
@@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
         { XBF_STALE,            "STALE" }, \
         { XBF_WRITE_FAIL,       "WRITE_FAIL" }, \
         { _XBF_LOGRECOVERY,     "LOG_RECOVERY" }, \
-        { _XBF_PAGES,           "PAGES" }, \
         { _XBF_KMEM,            "KMEM" }, \
         { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
         /* The following interface flags should never be set */ \
@@ -122,8 +120,6 @@ struct xfs_buftarg {
         struct xfs_buf_cache    bt_cache[];
 };
 
-#define XB_PAGES        2
-
 struct xfs_buf_map {
         xfs_daddr_t             bm_bn;  /* block number for I/O */
         int                     bm_len; /* size of I/O */
@@ -185,13 +181,10 @@ struct xfs_buf {
         struct xfs_buf_log_item *b_log_item;
         struct list_head        b_li_list;      /* Log items list head */
         struct xfs_trans        *b_transp;
-        struct page             **b_pages;      /* array of page pointers */
-        struct page             *b_page_array[XB_PAGES]; /* inline pages */
         struct xfs_buf_map      *b_maps;        /* compound buffer map */
         struct xfs_buf_map      __b_map;        /* inline compound buffer map */
         int                     b_map_count;
         atomic_t                b_pin_count;    /* pin count */
-        unsigned int            b_page_count;   /* size of page array */
         int                     b_error;        /* error code on I/O */
         void                    (*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113..b207754d2ee0 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
         unlock_page(page);
 
         bp->b_addr = page_address(page);
-        bp->b_pages = bp->b_page_array;
-        bp->b_pages[0] = page;
-        bp->b_page_count = 1;
         return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
         struct xfs_buf          *bp)
 {
-        struct page             *page = bp->b_pages[0];
-
         ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-        put_page(page);
-
+        put_page(virt_to_page(bp->b_addr));
         bp->b_addr = NULL;
-        bp->b_pages[0] = NULL;
-        bp->b_pages = NULL;
-        bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */
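One detail worth noting in the hunks above: with xfs_buf_vmap_len() gone, the length passed to the vmap cache maintenance helpers is now derived by rounding the buffer length up to a page multiple. The pairing the patch preserves for vmalloc()d buffers looks roughly like the sketch below, with submission and completion collapsed into one place; these calls are no-ops on most architectures and only matter where vmalloc mappings can alias in the cache.

        size_t  vmap_len = roundup(BBTOB(bp->b_length), PAGE_SIZE);

        /* before handing a dirty vmalloc()d buffer to the block layer */
        flush_kernel_vmap_range(bp->b_addr, vmap_len);

        /* ... I/O completes ... */

        /* before reading data the device just wrote into the backing pages */
        invalidate_kernel_vmap_range(bp->b_addr, vmap_len);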
The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space. Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c     | 207 ++++++++++---------------------------------
 fs/xfs/xfs_buf.h     |   7 --
 fs/xfs/xfs_buf_mem.c |  11 +--
 3 files changed, 48 insertions(+), 177 deletions(-)
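Condensed to its essence, the change swaps the open-coded page array plus vm_map_ram() mapping for a single __vmalloc() call. The sketch below paraphrases the diff above with error handling, statistics and retry details trimmed; it is not a literal copy of either version.

        /* before: allocate single pages, then map them contiguously */
        bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
        alloc_pages_bulk(gfp_mask, bp->b_page_count, bp->b_pages);
        bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1);

        /* after: let the common vmalloc code do all of the above */
        bp->b_addr = __vmalloc(size, gfp_mask);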