diff mbox

[v2] Btrfs: fix memory corruption on failure to submit bio for direct IO

Message ID 1435702656-16995-1-git-send-email-fdmanana@kernel.org (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Filipe Manana June 30, 2015, 10:17 p.m. UTC
From: Filipe Manana <fdmanana@suse.com>

If we fail to submit a bio for a direct IO request, we were grabbing the
corresponding ordered extent and decrementing its reference count twice,
once for our lookup reference and once for the ordered tree reference.
This was a problem because it caused the ordered extent to be freed
without removing it from the ordered tree and any lists it might be
attached to, leaving dangling pointers to the ordered extent around.
Example trace with CONFIG_DEBUG_PAGEALLOC=y:

[161779.858707] BUG: unable to handle kernel paging request at 0000000087654330
[161779.859983] IP: [<ffffffff8124ca68>] rb_prev+0x22/0x3b
[161779.860636] PGD 34d818067 PUD 0
[161779.860636] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
(...)
[161779.860636] Call Trace:
[161779.860636]  [<ffffffffa06b36a6>] __tree_search+0xd9/0xf9 [btrfs]
[161779.860636]  [<ffffffffa06b3708>] tree_search+0x42/0x63 [btrfs]
[161779.860636]  [<ffffffffa06b4868>] ? btrfs_lookup_ordered_range+0x2d/0xa5 [btrfs]
[161779.860636]  [<ffffffffa06b4873>] btrfs_lookup_ordered_range+0x38/0xa5 [btrfs]
[161779.860636]  [<ffffffffa06aab8e>] btrfs_get_blocks_direct+0x11b/0x615 [btrfs]
[161779.860636]  [<ffffffff8119727f>] do_blockdev_direct_IO+0x5ff/0xb43
[161779.860636]  [<ffffffffa06aaa73>] ? btrfs_page_exists_in_range+0x1ad/0x1ad [btrfs]
[161779.860636]  [<ffffffffa06a2c9a>] ? btrfs_get_extent_fiemap+0x1bc/0x1bc [btrfs]
[161779.860636]  [<ffffffff811977f5>] __blockdev_direct_IO+0x32/0x34
[161779.860636]  [<ffffffffa06a2c9a>] ? btrfs_get_extent_fiemap+0x1bc/0x1bc [btrfs]
[161779.860636]  [<ffffffffa06a10ae>] btrfs_direct_IO+0x198/0x21f [btrfs]
[161779.860636]  [<ffffffffa06a2c9a>] ? btrfs_get_extent_fiemap+0x1bc/0x1bc [btrfs]
[161779.860636]  [<ffffffff81112ca1>] generic_file_direct_write+0xb3/0x128
[161779.860636]  [<ffffffffa06affaa>] ? btrfs_file_write_iter+0x15f/0x3e0 [btrfs]
[161779.860636]  [<ffffffffa06b004c>] btrfs_file_write_iter+0x201/0x3e0 [btrfs]
(...)

We were also not freeing the btrfs_dio_private we allocated previously,
which kmemleak reported with the following trace in its sysfs file:

unreferenced object 0xffff8803f553bf80 (size 96):
  comm "xfs_io", pid 4501, jiffies 4295039588 (age 173.936s)
  hex dump (first 32 bytes):
    88 6c 9b f5 02 88 ff ff 00 00 00 00 00 00 00 00  .l..............
    00 00 00 00 00 00 00 00 00 00 c4 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81161ffe>] create_object+0x172/0x29a
    [<ffffffff8145870f>] kmemleak_alloc+0x25/0x41
    [<ffffffff81154e64>] kmemleak_alloc_recursive.constprop.40+0x16/0x18
    [<ffffffff811579ed>] kmem_cache_alloc_trace+0xfb/0x148
    [<ffffffffa03d8cff>] btrfs_submit_direct+0x65/0x16a [btrfs]
    [<ffffffff811968dc>] dio_bio_submit+0x62/0x8f
    [<ffffffff811975fe>] do_blockdev_direct_IO+0x97e/0xb43
    [<ffffffff811977f5>] __blockdev_direct_IO+0x32/0x34
    [<ffffffffa03d70ae>] btrfs_direct_IO+0x198/0x21f [btrfs]
    [<ffffffff81112ca1>] generic_file_direct_write+0xb3/0x128
    [<ffffffffa03e604d>] btrfs_file_write_iter+0x201/0x3e0 [btrfs]
    [<ffffffff8116586a>] __vfs_write+0x7c/0xa5
    [<ffffffff81165da9>] vfs_write+0xa0/0xe4
    [<ffffffff81166675>] SyS_pwrite64+0x64/0x82
    [<ffffffff81464fd7>] system_call_fastpath+0x12/0x6f
    [<ffffffffffffffff>] 0xffffffffffffffff

For read requests we weren't doing any cleanup either (none of the work
done by btrfs_endio_direct_read()), so a failure submitting a bio for a
read request would leave a range in the inode's io_tree locked forever,
blocking any future operations (both reads and writes) against that range.

We were also doing an extra bio_put() call against io_bio in some cases -
btrfs_submit_direct_hook() often does that final bio_put() against our
io_bio (pointed to by dip->orig_bio).

So fix this by making sure we do the same cleanup that we do for the case
where the bio submission succeeds.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Fixed a case where it's possible to do an extra bio_put() against
    io_bio, which results in a use-after-free bug.

 fs/btrfs/inode.c        | 70 +++++++++++++++++++++++++++++++++----------------
 fs/btrfs/ordered-data.c |  5 ++++
 2 files changed, 53 insertions(+), 22 deletions(-)
diff mbox

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f569d26..6ec1b34 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8046,7 +8046,7 @@  err:
 }
 
 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
-				    int skip_sum)
+				    int skip_sum, bool *end_io_done)
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -8143,7 +8143,9 @@  submit:
 	if (!ret)
 		return 0;
 
-	bio_put(bio);
+	/* Our caller is responsible for doing the final put for orig_bio. */
+	if (bio != orig_bio)
+		bio_put(bio);
 out_err:
 	dip->errors = 1;
 	/*
@@ -8151,8 +8153,10 @@  out_err:
 	 * make sure dip->errors is perceived to be set.
 	 */
 	smp_mb__before_atomic();
-	if (atomic_dec_and_test(&dip->pending_bios))
+	if (atomic_dec_and_test(&dip->pending_bios)) {
 		bio_io_error(dip->orig_bio);
+		*end_io_done = true;
+	}
 
 	/* bio_end_io() will handle error, so we needn't return it */
 	return 0;
@@ -8161,13 +8165,13 @@  out_err:
 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 				struct inode *inode, loff_t file_offset)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_dio_private *dip;
-	struct bio *io_bio;
+	struct btrfs_dio_private *dip = NULL;
+	struct bio *io_bio = NULL;
 	struct btrfs_io_bio *btrfs_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
+	bool end_io_done = false;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -8180,7 +8184,7 @@  static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
 	if (!dip) {
 		ret = -ENOMEM;
-		goto free_io_bio;
+		goto free_ordered;
 	}
 
 	dip->private = dio_bio->bi_private;
@@ -8202,31 +8206,53 @@  static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
-	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
+	ret = btrfs_submit_direct_hook(rw, dip, skip_sum, &end_io_done);
 	if (!ret)
 		return;
 
 	if (btrfs_bio->end_io)
 		btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
-	bio_put(io_bio);
 
 free_ordered:
 	/*
-	 * If this is a write, we need to clean up the reserved space and kill
-	 * the ordered extent.
+	 * If we arrived here it means either we failed to submit the dip
+	 * or we either failed to clone the dio_bio or failed to allocate the
+	 * dip. If we cloned the dio_bio and allocated the dip, we can just
+	 * call bio_endio against our io_bio so that we get proper resource
+	 * cleanup if we fail to submit the dip, otherwise, we must do the
+	 * same as btrfs_endio_direct_[write|read] because we can't call these
+	 * callbacks - they require an allocated dip and a clone of dio_bio.
 	 */
-	if (write) {
-		struct btrfs_ordered_extent *ordered;
-		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
-			btrfs_free_reserved_extent(root, ordered->start,
-						   ordered->disk_len, 1);
-		btrfs_put_ordered_extent(ordered);
-		btrfs_put_ordered_extent(ordered);
+	if (io_bio && dip) {
+		if (!end_io_done)
+			bio_endio(io_bio, ret);
+		/* end io callbacks free our dip and do final put on io_bio */
+		dip = NULL;
+		io_bio = NULL;
+	} else {
+		if (write) {
+			struct btrfs_ordered_extent *ordered;
+
+			ordered = btrfs_lookup_ordered_extent(inode,
+							      file_offset);
+			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+			/*
+			 * Decrements our ref on the ordered extent and removes
+			 * the ordered extent from the inode's ordered tree,
+			 * doing all the proper resource cleanup such as for the
+			 * reserved space.
+			 */
+			btrfs_finish_ordered_io(ordered);
+		} else {
+			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+			      file_offset + dio_bio->bi_iter.bi_size - 1);
+		}
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+		dio_end_io(dio_bio, ret);
 	}
-	bio_endio(dio_bio, ret);
+	if (io_bio)
+		bio_put(io_bio);
+	kfree(dip);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8f391fd..6c095ad 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -559,6 +559,10 @@  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	trace_btrfs_ordered_extent_put(entry->inode, entry);
 
 	if (atomic_dec_and_test(&entry->refs)) {
+		ASSERT(list_empty(&entry->log_list));
+		ASSERT(list_empty(&entry->trans_list));
+		ASSERT(list_empty(&entry->root_extent_list));
+		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
 		if (entry->inode)
 			btrfs_add_delayed_iput(entry->inode);
 		while (!list_empty(&entry->list)) {
@@ -586,6 +590,7 @@  void btrfs_remove_ordered_extent(struct inode *inode,
 	spin_lock_irq(&tree->lock);
 	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
+	RB_CLEAR_NODE(node);
 	if (tree->last == node)
 		tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);