diff mbox series

[02/15] btrfs: fix double __endio_write_update_ordered in direct I/O

Message ID b4b45179cc951dde98feea48723572683daf7fb3.1583789410.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show
Series btrfs: read repair/direct I/O improvements | expand

Commit Message

Omar Sandoval March 9, 2020, 9:32 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

In btrfs_submit_direct(), if we fail to allocate the btrfs_dio_private,
we complete the ordered extent range. However, we don't mark that the
range doesn't need to be cleaned up from btrfs_direct_IO() until later.
Therefore, if we fail to allocate the btrfs_dio_private, we complete the
ordered extent range twice. We could fix this by updating
unsubmitted_oe_range earlier, but it's simpler to always clean up via
the bio once the btrfs_dio_private is allocated and leave it for
btrfs_direct_IO() before that.

Fixes: f28a49287817 ("Btrfs: fix leaking of ordered extents after direct IO write error")
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/inode.c | 92 ++++++++++++++----------------------------------
 1 file changed, 26 insertions(+), 66 deletions(-)

Comments

Christoph Hellwig March 10, 2020, 4:30 p.m. UTC | #1
On Mon, Mar 09, 2020 at 02:32:28PM -0700, Omar Sandoval wrote:
> +static void btrfs_submit_direct_hook(struct btrfs_dio_private *dip)

Just curious: why is this routine called btrfs_submit_direct_hook?
it doesn't seem to hook up anything, but just contains the guts of
btrfs_submit_direct.  Any reason to keep the two functions separate?

Also instead of the separate bip allocation and the bio clone, why
not clone into a private bio_set that contains the private data
as part of the allocation?
Omar Sandoval March 11, 2020, 9:03 a.m. UTC | #2
On Tue, Mar 10, 2020 at 05:30:24PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 09, 2020 at 02:32:28PM -0700, Omar Sandoval wrote:
> > +static void btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
> 
> Just curious: why is this routine called btrfs_submit_direct_hook?

No idea. The name goes back to commit e65e15355429 ("btrfs: fix panic
caused by direct IO"), and it didn't make any sense then, either.

> it doesn't seem to hook up anything, but just contains the guts of
> btrfs_submit_direct.  Any reason to keep the two functions separate?

The only reason I didn't combine them is that it would make for a pretty
big, unwieldy function. Maybe folding btrfs_submit_direct_hook() into
btrfs_submit_direct() and splitting the dip setup into something like
btrfs_create_dio_private() would be more natural.

> Also instead of the separate bip allocation and the bio clone, why
> not clone into a private bio_set that contains the private data
> as part of the allocation?

Mainly because dip is not tied to any one bio, as we might need to split
up or retry bios. We also already clone the bios from a btrfs bioset,
although that doesn't have everything we need for direct I/O.

Thanks!
Nikolay Borisov March 17, 2020, 2:04 p.m. UTC | #3
On 9.03.20 г. 23:32 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> In btrfs_submit_direct(), if we fail to allocate the btrfs_dio_private,
> we complete the ordered extent range. However, we don't mark that the
> range doesn't need to be cleaned up from btrfs_direct_IO() until later.
> Therefore, if we fail to allocate the btrfs_dio_private, we complete the
> ordered extent range twice. We could fix this by updating
> unsubmitted_oe_range earlier, but it's simpler to always clean up via
> the bio once the btrfs_dio_private is allocated and leave it for
> btrfs_direct_IO() before that.
> 
> Fixes: f28a49287817 ("Btrfs: fix leaking of ordered extents after direct IO write error")
> Signed-off-by: Omar Sandoval <osandov@fb.com>

The result code is much nicer and also I think your suggestion of having
just btrfs_submit_direct and factoring out the dip setup code into a
separate function makes sense. In any case:

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d48a2010f24a..8e986056be3c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7912,7 +7912,7 @@  static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 	return ret;
 }
 
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
+static void btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -7932,7 +7932,7 @@  static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
 	ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
 				    start_sector << 9, submit_len, &geom);
 	if (ret)
-		return -EIO;
+		goto out_err;
 
 	if (geom.len >= submit_len) {
 		bio = orig_bio;
@@ -7995,7 +7995,7 @@  static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
 submit:
 	status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
 	if (!status)
-		return 0;
+		return;
 
 	if (bio != orig_bio)
 		bio_put(bio);
@@ -8009,9 +8009,6 @@  static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
 	 */
 	if (atomic_dec_and_test(&dip->pending_bios))
 		bio_io_error(dip->orig_bio);
-
-	/* bio_end_io() will handle error, so we needn't return it */
-	return 0;
 }
 
 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
@@ -8021,14 +8018,24 @@  static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
 	struct bio *bio = NULL;
 	struct btrfs_io_bio *io_bio;
 	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
-	int ret = 0;
 
 	bio = btrfs_bio_clone(dio_bio);
 
 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
 	if (!dip) {
-		ret = -ENOMEM;
-		goto free_ordered;
+		if (!write) {
+			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+				file_offset + dio_bio->bi_iter.bi_size - 1);
+		}
+
+		dio_bio->bi_status = BLK_STS_RESOURCE;
+		/*
+		 * Releases and cleans up our dio_bio, no need to bio_put() nor
+		 * bio_endio()/bio_io_error() against dio_bio.
+		 */
+		dio_end_io(dio_bio);
+		bio_put(bio);
+		return;
 	}
 
 	dip->private = dio_bio->bi_private;
@@ -8044,72 +8051,25 @@  static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
 	io_bio->logical = file_offset;
 
 	if (write) {
-		bio->bi_end_io = btrfs_endio_direct_write;
-	} else {
-		bio->bi_end_io = btrfs_endio_direct_read;
-		dip->subio_endio = btrfs_subio_endio_read;
-	}
-
-	/*
-	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
-	 * even if we fail to submit a bio, because in such case we do the
-	 * corresponding error handling below and it must not be done a second
-	 * time by btrfs_direct_IO().
-	 */
-	if (write) {
+		/*
+		 * At this point, the btrfs_dio_private is responsible for
+		 * cleaning up the ordered extents whether or not we submit any
+		 * bios.
+		 */
 		struct btrfs_dio_data *dio_data = current->journal_info;
 
 		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
 			dip->bytes;
 		dio_data->unsubmitted_oe_range_start =
 			dio_data->unsubmitted_oe_range_end;
-	}
-
-	ret = btrfs_submit_direct_hook(dip);
-	if (!ret)
-		return;
-
-	btrfs_io_bio_free_csum(io_bio);
 
-free_ordered:
-	/*
-	 * If we arrived here it means either we failed to submit the dip
-	 * or we either failed to clone the dio_bio or failed to allocate the
-	 * dip. If we cloned the dio_bio and allocated the dip, we can just
-	 * call bio_endio against our io_bio so that we get proper resource
-	 * cleanup if we fail to submit the dip, otherwise, we must do the
-	 * same as btrfs_endio_direct_[write|read] because we can't call these
-	 * callbacks - they require an allocated dip and a clone of dio_bio.
-	 */
-	if (bio && dip) {
-		bio_io_error(bio);
-		/*
-		 * The end io callbacks free our dip, do the final put on bio
-		 * and all the cleanup and final put for dio_bio (through
-		 * dio_end_io()).
-		 */
-		dip = NULL;
-		bio = NULL;
+		bio->bi_end_io = btrfs_endio_direct_write;
 	} else {
-		if (write)
-			__endio_write_update_ordered(inode,
-						file_offset,
-						dio_bio->bi_iter.bi_size,
-						false);
-		else
-			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
-			      file_offset + dio_bio->bi_iter.bi_size - 1);
-
-		dio_bio->bi_status = BLK_STS_IOERR;
-		/*
-		 * Releases and cleans up our dio_bio, no need to bio_put()
-		 * nor bio_endio()/bio_io_error() against dio_bio.
-		 */
-		dio_end_io(dio_bio);
+		bio->bi_end_io = btrfs_endio_direct_read;
+		dip->subio_endio = btrfs_subio_endio_read;
 	}
-	if (bio)
-		bio_put(bio);
-	kfree(dip);
+
+	btrfs_submit_direct_hook(dip);
 }
 
 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,