Message ID: 20250106095613.847700-10-hch@lst.de
State:      New
Series:     [01/15] xfs: fix a double completion for buffers on in-memory targets
On Mon, Jan 06, 2025 at 10:54:46AM +0100, Christoph Hellwig wrote:
> The code in _xfs_buf_ioapply is unnecessarily complicated because it
> doesn't take advantage of modern bio features.
>
> Simplify it by making use of bio splitting and chaining, that is build
> a single bio for the pages in the buffer using a simple loop, and then
> split that bio on the map boundaries for discontiguous multi-FSB buffers
> and chain the split bios to the main one so that there is only a single
> I/O completion.
>
> This not only simplifies the code to build the buffer, but also removes
> the need for the b_io_remaining field as buffer ownership is granted
> to the bio on submit of the final bio with no chance for a completion
> before that as well as the b_io_error field that is now superfluous
> because there always is exactly one completion.

So I guess b_io_remaining was the count of in-flight bios? And making a
chain of bios basically just moves all that to the bio layer, so all
xfs_buf needs to know is whether or not a bio is in progress, right?

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c | 207 ++++++++++++++---------------------------------
>  fs/xfs/xfs_buf.h |   2 -
>  2 files changed, 60 insertions(+), 149 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index e886605b5721..094f16457998 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1364,13 +1364,6 @@ xfs_buf_ioend(
>  {
>  	trace_xfs_buf_iodone(bp, _RET_IP_);
>
> -	/*
> -	 * Pull in IO completion errors now. We are guaranteed to be running
> -	 * single threaded, so we don't need the lock to read b_io_error.
> -	 */
> -	if (!bp->b_error && bp->b_io_error)
> -		xfs_buf_ioerror(bp, bp->b_io_error);
> -
>  	if (bp->b_flags & XBF_READ) {
>  		if (!bp->b_error && bp->b_ops)
>  			bp->b_ops->verify_read(bp);
> @@ -1494,118 +1487,26 @@ static void
>  xfs_buf_bio_end_io(
>  	struct bio	*bio)
>  {
> -	struct xfs_buf	*bp = (struct xfs_buf *)bio->bi_private;
> +	struct xfs_buf	*bp = bio->bi_private;
>
> -	if (!bio->bi_status &&
> -	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> -	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> -		bio->bi_status = BLK_STS_IOERR;
> -
> -	/*
> -	 * don't overwrite existing errors - otherwise we can lose errors on
> -	 * buffers that require multiple bios to complete.
> -	 */
> -	if (bio->bi_status) {
> -		int error = blk_status_to_errno(bio->bi_status);
> -
> -		cmpxchg(&bp->b_io_error, 0, error);
> -	}
> +	if (bio->bi_status)
> +		xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
> +	else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> +		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> +		xfs_buf_ioerror(bp, -EIO);
>
>  	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
>  		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
>
> -	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
> -		xfs_buf_ioend_async(bp);
> +	xfs_buf_ioend_async(bp);
>  	bio_put(bio);
>  }
>
> -static void
> -xfs_buf_ioapply_map(
> -	struct xfs_buf	*bp,
> -	int		map,
> -	int		*buf_offset,
> -	int		*count,
> -	blk_opf_t	op)
> -{
> -	int		page_index;
> -	unsigned int	total_nr_pages = bp->b_page_count;
> -	int		nr_pages;
> -	struct bio	*bio;
> -	sector_t	sector = bp->b_maps[map].bm_bn;
> -	int		size;
> -	int		offset;
> -
> -	/* skip the pages in the buffer before the start offset */
> -	page_index = 0;
> -	offset = *buf_offset;
> -	while (offset >= PAGE_SIZE) {
> -		page_index++;
> -		offset -= PAGE_SIZE;
> -	}
> -
> -	/*
> -	 * Limit the IO size to the length of the current vector, and update the
> -	 * remaining IO count for the next time around.
> -	 */
> -	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
> -	*count -= size;
> -	*buf_offset += size;
> -
> -next_chunk:
> -	atomic_inc(&bp->b_io_remaining);
> -	nr_pages = bio_max_segs(total_nr_pages);
> -
> -	bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
> -	bio->bi_iter.bi_sector = sector;
> -	bio->bi_end_io = xfs_buf_bio_end_io;
> -	bio->bi_private = bp;
> -
> -	for (; size && nr_pages; nr_pages--, page_index++) {
> -		int	rbytes, nbytes = PAGE_SIZE - offset;
> -
> -		if (nbytes > size)
> -			nbytes = size;
> -
> -		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
> -				      offset);
> -		if (rbytes < nbytes)
> -			break;
> -
> -		offset = 0;
> -		sector += BTOBB(nbytes);
> -		size -= nbytes;
> -		total_nr_pages--;
> -	}
> -
> -	if (likely(bio->bi_iter.bi_size)) {
> -		if (xfs_buf_is_vmapped(bp)) {
> -			flush_kernel_vmap_range(bp->b_addr,
> -						xfs_buf_vmap_len(bp));
> -		}
> -		submit_bio(bio);
> -		if (size)
> -			goto next_chunk;
> -	} else {
> -		/*
> -		 * This is guaranteed not to be the last io reference count
> -		 * because the caller (xfs_buf_submit) holds a count itself.
> -		 */
> -		atomic_dec(&bp->b_io_remaining);
> -		xfs_buf_ioerror(bp, -EIO);
> -		bio_put(bio);
> -	}
> -
> -}
> -
> -STATIC void
> -_xfs_buf_ioapply(
> -	struct xfs_buf	*bp)
> +static inline blk_opf_t
> +xfs_buf_bio_op(
> +	struct xfs_buf	*bp)
>  {
> -	struct blk_plug	plug;
> -	blk_opf_t	op;
> -	int		offset;
> -	int		size;
> -	int		i;
> +	blk_opf_t	op;
>
>  	if (bp->b_flags & XBF_WRITE) {
>  		op = REQ_OP_WRITE;
> @@ -1615,25 +1516,52 @@ _xfs_buf_ioapply(
>  			op |= REQ_RAHEAD;
>  	}
>
> -	/* we only use the buffer cache for meta-data */
> -	op |= REQ_META;
> +	return op | REQ_META;
> +}
> +
> +static void
> +xfs_buf_submit_bio(
> +	struct xfs_buf	*bp)
> +{
> +	unsigned int	size = BBTOB(bp->b_length);
> +	unsigned int	map = 0, p;
> +	struct blk_plug	plug;
> +	struct bio	*bio;
> +
> +	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> +			xfs_buf_bio_op(bp), GFP_NOIO);
> +	bio->bi_private = bp;
> +	bio->bi_end_io = xfs_buf_bio_end_io;
> +
> +	if (bp->b_flags & _XBF_KMEM) {
> +		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> +				bp->b_offset);
> +	} else {
> +		for (p = 0; p < bp->b_page_count; p++)
> +			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> +		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +
> +		if (xfs_buf_is_vmapped(bp))
> +			flush_kernel_vmap_range(bp->b_addr,
> +					xfs_buf_vmap_len(bp));
> +	}
>
>  	/*
> -	 * Walk all the vectors issuing IO on them. Set up the initial offset
> -	 * into the buffer and the desired IO size before we start -
> -	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
> -	 * subsequent call.
> +	 * If there is more than one map segment, split the original bio to
> +	 * submit a separate bio for each discontiguous range.
>  	 */
> -	offset = bp->b_offset;
> -	size = BBTOB(bp->b_length);
>  	blk_start_plug(&plug);
> -	for (i = 0; i < bp->b_map_count; i++) {
> -		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
> -		if (bp->b_error)
> -			break;
> -		if (size <= 0)
> -			break;	/* all done */

Eerrugh, I spent quite a while trying to wrap my head around the old
code when I was writing the in-memory xfs_buf support. This is much
less weird to look at...

> +	for (map = 0; map < bp->b_map_count - 1; map++) {

...but why isn't this "map < bp->b_map_count"?

--D

> +		struct bio	*split;
> +
> +		split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
> +				&fs_bio_set);
> +		split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> +		bio_chain(split, bio);
> +		submit_bio(split);
>  	}
> +	bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> +	submit_bio(bio);
>  	blk_finish_plug(&plug);
>  }
>
> @@ -1693,8 +1621,6 @@ static int
>  xfs_buf_submit(
>  	struct xfs_buf	*bp)
>  {
> -	int		error = 0;
> -
>  	trace_xfs_buf_submit(bp, _RET_IP_);
>
>  	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> @@ -1735,14 +1661,7 @@ xfs_buf_submit(
>  	 * left over from previous use of the buffer (e.g. failed readahead).
>  	 */
>  	bp->b_error = 0;
> -	bp->b_io_error = 0;
>
> -	/*
> -	 * Set the count to 1 initially, this will stop an I/O completion
> -	 * callout which happens before we have started all the I/O from calling
> -	 * xfs_buf_ioend too early.
> -	 */
> -	atomic_set(&bp->b_io_remaining, 1);
>  	if (bp->b_flags & XBF_ASYNC)
>  		xfs_buf_ioacct_inc(bp);
>
> @@ -1755,28 +1674,22 @@ xfs_buf_submit(
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		goto done;
>
> -	_xfs_buf_ioapply(bp);
> +	xfs_buf_submit_bio(bp);
> +	goto rele;
>
>  done:
> -	/*
> -	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
> -	 * reference we took above. If we drop it to zero, run completion so
> -	 * that we don't return to the caller with completion still pending.
> -	 */
> -	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
> -		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> -			xfs_buf_ioend(bp);
> -		else
> -			xfs_buf_ioend_async(bp);
> -	}
> -
> +	if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> +		xfs_buf_ioend(bp);
> +	else
> +		xfs_buf_ioend_async(bp);
> +rele:
>  	/*
>  	 * Release the hold that keeps the buffer referenced for the entire
>  	 * I/O. Note that if the buffer is async, it is not safe to reference
>  	 * after this release.
>  	 */
>  	xfs_buf_rele(bp);
> -	return error;
> +	return 0;
>  }
>
>  void *
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index da80399c7457..c53d27439ff2 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -184,7 +184,6 @@ struct xfs_buf {
>  	struct list_head	b_lru;		/* lru list */
>  	spinlock_t		b_lock;		/* internal state lock */
>  	unsigned int		b_state;	/* internal state flags */
> -	int			b_io_error;	/* internal IO error state */
>  	wait_queue_head_t	b_waiters;	/* unpin waiters */
>  	struct list_head	b_list;
>  	struct xfs_perag	*b_pag;
> @@ -202,7 +201,6 @@ struct xfs_buf {
>  	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
>  	int			b_map_count;
>  	atomic_t		b_pin_count;	/* pin count */
> -	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
>  	unsigned int		b_page_count;	/* size of page array */
>  	unsigned int		b_offset;	/* page offset of b_addr,
>  						   only for _XBF_KMEM buffers */
> --
> 2.45.2
>
On Mon, Jan 06, 2025 at 10:42:24PM -0800, Darrick J. Wong wrote:
> So I guess b_io_remaining was the count of in-flight bios?

Yes. Plus a bias of 1 for the submitting context so that the
completion isn't run until all bios are submitted.

> And making a
> chain of bios basically just moves all that to the bio layer, so all
> xfs_buf needs to know is whether or not a bio is in progress, right?

Yes.

> Eerrugh, I spent quite a while trying to wrap my head around the old
> code when I was writing the in-memory xfs_buf support. This is much
> less weird to look at...
>
> > +	for (map = 0; map < bp->b_map_count - 1; map++) {
>
> ...but why isn't this "map < bp->b_map_count"?

Because the final ("main") bio is submitted outside the loop as the
loop body chains the ones before to it. I guess this should go into
a comment to confuse the readers a little less.
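The "bias of 1" pattern described above is a common kernel idiom: the
submitter holds one extra count on b_io_remaining so that bios completing
while later bios are still being issued can never drop the count to zero
and run the buffer completion early. Below is a minimal, self-contained
sketch of that idiom using C11 atomics rather than the kernel's atomic_t;
all names are illustrative stand-ins, not the XFS code.

	#include <stdatomic.h>
	#include <stdio.h>

	struct buf {
		atomic_int io_remaining;	/* in-flight bios + submitter bias */
	};

	static void buf_ioend(struct buf *bp)
	{
		puts("all I/O complete, running buffer completion");
	}

	/* Runs once per finished bio, like the old xfs_buf_bio_end_io(). */
	static void bio_done(struct buf *bp)
	{
		/* fetch_sub returns the old value; old == 1 means we hit zero */
		if (atomic_fetch_sub(&bp->io_remaining, 1) == 1)
			buf_ioend(bp);
	}

	static void submit(struct buf *bp, int nr_bios)
	{
		/* Bias of 1 held by the submitting context. */
		atomic_store(&bp->io_remaining, 1);

		for (int i = 0; i < nr_bios; i++) {
			atomic_fetch_add(&bp->io_remaining, 1);
			bio_done(bp);	/* simulate a bio completing immediately */
		}

		/*
		 * Drop the bias: only now can the count reach zero, so the
		 * completion cannot run before all bios were issued.
		 */
		bio_done(bp);
	}

	int main(void)
	{
		struct buf b;

		submit(&b, 3);
		return 0;
	}

Bio chaining gives the same guarantee without the hand-rolled counter:
the block layer keeps the per-bio remaining count, and the parent bio's
completion can only run after the final submit_bio().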
On Tue, Jan 07, 2025 at 07:46:37AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 06, 2025 at 10:42:24PM -0800, Darrick J. Wong wrote:
> > So I guess b_io_remaining was the count of in-flight bios?
>
> Yes. Plus a bias of 1 for the submitting context so that the
> completion isn't run until all bios are submitted.
>
> > And making a
> > chain of bios basically just moves all that to the bio layer, so all
> > xfs_buf needs to know is whether or not a bio is in progress, right?
>
> Yes.
>
> > Eerrugh, I spent quite a while trying to wrap my head around the old
> > code when I was writing the in-memory xfs_buf support. This is much
> > less weird to look at...
> >
> > > +	for (map = 0; map < bp->b_map_count - 1; map++) {
> >
> > ...but why isn't this "map < bp->b_map_count"?
>
> Because the final ("main") bio is submitted outside the loop as the
> loop body chains the ones before to it. I guess this should go into
> a comment to confuse the readers a little less.

Ah, right, because we start with one bio for N maps, which means we need
to split it N-1 times to end up with N (chained) bios, one for each map.
Yes, comment please. :)

--D
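To make the requested comment concrete, the submission loop could be
annotated roughly as follows. The code is verbatim from the patch; the
comment wording is only a suggestion, not something posted in this
thread.

	/*
	 * For a buffer with N discontiguous maps, peel off the first
	 * N - 1 map ranges with bio_split() and chain each split bio
	 * to the remainder, so that the final ("main") bio submitted
	 * after the loop covers the last map and carries the only
	 * I/O completion for the whole buffer.
	 */
	for (map = 0; map < bp->b_map_count - 1; map++) {
		struct bio	*split;

		split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
				&fs_bio_set);
		split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
		bio_chain(split, bio);
		submit_bio(split);
	}
	bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
	submit_bio(bio);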
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e886605b5721..094f16457998 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1364,13 +1364,6 @@ xfs_buf_ioend(
 {
 	trace_xfs_buf_iodone(bp, _RET_IP_);

-	/*
-	 * Pull in IO completion errors now. We are guaranteed to be running
-	 * single threaded, so we don't need the lock to read b_io_error.
-	 */
-	if (!bp->b_error && bp->b_io_error)
-		xfs_buf_ioerror(bp, bp->b_io_error);
-
 	if (bp->b_flags & XBF_READ) {
 		if (!bp->b_error && bp->b_ops)
 			bp->b_ops->verify_read(bp);
@@ -1494,118 +1487,26 @@ static void
 xfs_buf_bio_end_io(
 	struct bio	*bio)
 {
-	struct xfs_buf	*bp = (struct xfs_buf *)bio->bi_private;
+	struct xfs_buf	*bp = bio->bi_private;

-	if (!bio->bi_status &&
-	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
-	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
-		bio->bi_status = BLK_STS_IOERR;
-
-	/*
-	 * don't overwrite existing errors - otherwise we can lose errors on
-	 * buffers that require multiple bios to complete.
-	 */
-	if (bio->bi_status) {
-		int error = blk_status_to_errno(bio->bi_status);
-
-		cmpxchg(&bp->b_io_error, 0, error);
-	}
+	if (bio->bi_status)
+		xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
+	else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+		xfs_buf_ioerror(bp, -EIO);

 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-		xfs_buf_ioend_async(bp);
+	xfs_buf_ioend_async(bp);
 	bio_put(bio);
 }

-static void
-xfs_buf_ioapply_map(
-	struct xfs_buf	*bp,
-	int		map,
-	int		*buf_offset,
-	int		*count,
-	blk_opf_t	op)
-{
-	int		page_index;
-	unsigned int	total_nr_pages = bp->b_page_count;
-	int		nr_pages;
-	struct bio	*bio;
-	sector_t	sector = bp->b_maps[map].bm_bn;
-	int		size;
-	int		offset;
-
-	/* skip the pages in the buffer before the start offset */
-	page_index = 0;
-	offset = *buf_offset;
-	while (offset >= PAGE_SIZE) {
-		page_index++;
-		offset -= PAGE_SIZE;
-	}
-
-	/*
-	 * Limit the IO size to the length of the current vector, and update the
-	 * remaining IO count for the next time around.
-	 */
-	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
-	*count -= size;
-	*buf_offset += size;
-
-next_chunk:
-	atomic_inc(&bp->b_io_remaining);
-	nr_pages = bio_max_segs(total_nr_pages);
-
-	bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
-	bio->bi_iter.bi_sector = sector;
-	bio->bi_end_io = xfs_buf_bio_end_io;
-	bio->bi_private = bp;
-
-	for (; size && nr_pages; nr_pages--, page_index++) {
-		int	rbytes, nbytes = PAGE_SIZE - offset;
-
-		if (nbytes > size)
-			nbytes = size;
-
-		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
-				      offset);
-		if (rbytes < nbytes)
-			break;
-
-		offset = 0;
-		sector += BTOBB(nbytes);
-		size -= nbytes;
-		total_nr_pages--;
-	}
-
-	if (likely(bio->bi_iter.bi_size)) {
-		if (xfs_buf_is_vmapped(bp)) {
-			flush_kernel_vmap_range(bp->b_addr,
-						xfs_buf_vmap_len(bp));
-		}
-		submit_bio(bio);
-		if (size)
-			goto next_chunk;
-	} else {
-		/*
-		 * This is guaranteed not to be the last io reference count
-		 * because the caller (xfs_buf_submit) holds a count itself.
-		 */
-		atomic_dec(&bp->b_io_remaining);
-		xfs_buf_ioerror(bp, -EIO);
-		bio_put(bio);
-	}
-
-}
-
-STATIC void
-_xfs_buf_ioapply(
-	struct xfs_buf	*bp)
+static inline blk_opf_t
+xfs_buf_bio_op(
+	struct xfs_buf	*bp)
 {
-	struct blk_plug	plug;
-	blk_opf_t	op;
-	int		offset;
-	int		size;
-	int		i;
+	blk_opf_t	op;

 	if (bp->b_flags & XBF_WRITE) {
 		op = REQ_OP_WRITE;
@@ -1615,25 +1516,52 @@ _xfs_buf_ioapply(
 			op |= REQ_RAHEAD;
 	}

-	/* we only use the buffer cache for meta-data */
-	op |= REQ_META;
+	return op | REQ_META;
+}
+
+static void
+xfs_buf_submit_bio(
+	struct xfs_buf	*bp)
+{
+	unsigned int	size = BBTOB(bp->b_length);
+	unsigned int	map = 0, p;
+	struct blk_plug	plug;
+	struct bio	*bio;
+
+	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
+			xfs_buf_bio_op(bp), GFP_NOIO);
+	bio->bi_private = bp;
+	bio->bi_end_io = xfs_buf_bio_end_io;
+
+	if (bp->b_flags & _XBF_KMEM) {
+		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
+				bp->b_offset);
+	} else {
+		for (p = 0; p < bp->b_page_count; p++)
+			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
+		bio->bi_iter.bi_size = size; /* limit to the actual size used */
+
+		if (xfs_buf_is_vmapped(bp))
+			flush_kernel_vmap_range(bp->b_addr,
+					xfs_buf_vmap_len(bp));
+	}

 	/*
-	 * Walk all the vectors issuing IO on them. Set up the initial offset
-	 * into the buffer and the desired IO size before we start -
-	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
-	 * subsequent call.
+	 * If there is more than one map segment, split the original bio to
+	 * submit a separate bio for each discontiguous range.
 	 */
-	offset = bp->b_offset;
-	size = BBTOB(bp->b_length);
 	blk_start_plug(&plug);
-	for (i = 0; i < bp->b_map_count; i++) {
-		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
-		if (bp->b_error)
-			break;
-		if (size <= 0)
-			break;	/* all done */
+	for (map = 0; map < bp->b_map_count - 1; map++) {
+		struct bio	*split;
+
+		split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
+				&fs_bio_set);
+		split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
+		bio_chain(split, bio);
+		submit_bio(split);
 	}
+	bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
+	submit_bio(bio);
 	blk_finish_plug(&plug);
 }

@@ -1693,8 +1621,6 @@ static int
 xfs_buf_submit(
 	struct xfs_buf	*bp)
 {
-	int		error = 0;
-
 	trace_xfs_buf_submit(bp, _RET_IP_);

 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
@@ -1735,14 +1661,7 @@ xfs_buf_submit(
 	 * left over from previous use of the buffer (e.g. failed readahead).
 	 */
 	bp->b_error = 0;
-	bp->b_io_error = 0;

-	/*
-	 * Set the count to 1 initially, this will stop an I/O completion
-	 * callout which happens before we have started all the I/O from calling
-	 * xfs_buf_ioend too early.
-	 */
-	atomic_set(&bp->b_io_remaining, 1);
 	if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_ioacct_inc(bp);

@@ -1755,28 +1674,22 @@ xfs_buf_submit(
 	if (xfs_buftarg_is_mem(bp->b_target))
 		goto done;

-	_xfs_buf_ioapply(bp);
+	xfs_buf_submit_bio(bp);
+	goto rele;

 done:
-	/*
-	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
-	 * reference we took above. If we drop it to zero, run completion so
-	 * that we don't return to the caller with completion still pending.
-	 */
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
-		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
-			xfs_buf_ioend(bp);
-		else
-			xfs_buf_ioend_async(bp);
-	}
-
+	if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+		xfs_buf_ioend(bp);
+	else
+		xfs_buf_ioend_async(bp);
+rele:
 	/*
 	 * Release the hold that keeps the buffer referenced for the entire
 	 * I/O. Note that if the buffer is async, it is not safe to reference
 	 * after this release.
 	 */
 	xfs_buf_rele(bp);
-	return error;
+	return 0;
 }

 void *
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index da80399c7457..c53d27439ff2 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -184,7 +184,6 @@ struct xfs_buf {
 	struct list_head	b_lru;		/* lru list */
 	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_state;	/* internal state flags */
-	int			b_io_error;	/* internal IO error state */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;
@@ -202,7 +201,6 @@ struct xfs_buf {
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset of b_addr,
 						   only for _XBF_KMEM buffers */
The code in _xfs_buf_ioapply is unnecessarily complicated because it
doesn't take advantage of modern bio features.

Simplify it by making use of bio splitting and chaining, that is build
a single bio for the pages in the buffer using a simple loop, and then
split that bio on the map boundaries for discontiguous multi-FSB buffers
and chain the split bios to the main one so that there is only a single
I/O completion.

This not only simplifies the code to build the buffer, but also removes
the need for the b_io_remaining field as buffer ownership is granted
to the bio on submit of the final bio with no chance for a completion
before that as well as the b_io_error field that is now superfluous
because there always is exactly one completion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 207 ++++++++++++++---------------------------------
 fs/xfs/xfs_buf.h |   2 -
 2 files changed, 60 insertions(+), 149 deletions(-)
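The "exactly one completion" property the last paragraph relies on comes
from the block layer's chaining rules: a bio that has been passed to
bio_chain() does not invoke its own ->bi_end_io when it finishes, it only
drops the remaining-count of the bio it was chained to, so the parent's
end_io runs once, after every bio in the chain has completed. Below is a
minimal sketch of that split-and-chain idiom, independent of this patch;
the function name and the nsect parameter are illustrative.

	#include <linux/bio.h>

	/*
	 * Split the first nsect sectors off @bio and submit both halves
	 * so that only @bio (the remainder) carries the completion
	 * callback.
	 */
	static void submit_front_split(struct bio *bio, unsigned int nsect)
	{
		struct bio	*split;

		split = bio_split(bio, nsect, GFP_NOIO, &fs_bio_set);
		bio_chain(split, bio);	/* @bio's end_io now waits on @split too */
		submit_bio(split);
		submit_bio(bio);	/* ->bi_end_io fires exactly once */
	}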