diff mbox series

btrfs: dio iomap DSYNC workaround

Message ID 82c9a7d696fa353738c6998a33b581d76414e4f9.1599146199.git.josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series btrfs: dio iomap DSYNC workaround | expand

Commit Message

Josef Bacik Sept. 3, 2020, 3:16 p.m. UTC
iomap dio will run generic_write_sync() for us if the iocb is DSYNC.
This is problematic for us because of 2 reason, 1 we hold the
inode_lock() during this operation, and we take it in
generic_write_sync().  Secondly we hold a read lock on the dio_sem but
take the write lock in fsync.

Since we don't want to rip out this code right now, but reworking the
locking is a bit much to do on such short of a notice, work around this
problem with this masterpiece of a patch.

First, we clear DSYNC on the iocb so that the iomap stuff doesn't know
that it needs to handle the sync.  We save this fact in
current->journal_info, because we need to see do special things once
we're in iomap_begin, and we have no way to pass private information
into iomap_dio_rw().

Next we specify a separate iomap_dio_ops for sync, which implements an
->end_io() callback that gets called when the dio completes.  This is
important for AIO, because we really do need to run generic_write_sync()
if we complete asynchronously.  However if we're still in the submitting
context when we enter ->end_io() we clear the flag so that the submitter
knows they're the ones that needs to run generic_write_sync().

This is meant to be temporary.  We need to work out how to eliminate the
inode_lock() and the dio_sem in our fsync and use another mechanism to
protect these operations.

Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/file.c        | 33 ++++++++++++++++++++++
 fs/btrfs/inode.c       | 62 ++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.h |  1 +
 3 files changed, 94 insertions(+), 2 deletions(-)

Comments

David Sterba Sept. 4, 2020, 2:59 p.m. UTC | #1
On Thu, Sep 03, 2020 at 11:16:51AM -0400, Josef Bacik wrote:
> iomap dio will run generic_write_sync() for us if the iocb is DSYNC.
> This is problematic for us because of 2 reason, 1 we hold the
> inode_lock() during this operation, and we take it in
> generic_write_sync().  Secondly we hold a read lock on the dio_sem but
> take the write lock in fsync.
> 
> Since we don't want to rip out this code right now, but reworking the
> locking is a bit much to do on such short of a notice, work around this
> problem with this masterpiece of a patch.
> 
> First, we clear DSYNC on the iocb so that the iomap stuff doesn't know
> that it needs to handle the sync.  We save this fact in
> current->journal_info, because we need to see do special things once
> we're in iomap_begin, and we have no way to pass private information
> into iomap_dio_rw().
> 
> Next we specify a separate iomap_dio_ops for sync, which implements an
> ->end_io() callback that gets called when the dio completes.  This is
> important for AIO, because we really do need to run generic_write_sync()
> if we complete asynchronously.  However if we're still in the submitting
> context when we enter ->end_io() we clear the flag so that the submitter
> knows they're the ones that needs to run generic_write_sync().
> 
> This is meant to be temporary.  We need to work out how to eliminate the
> inode_lock() and the dio_sem in our fsync and use another mechanism to
> protect these operations.
> 
> Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

Added to misc-next, thanks.
diff mbox series

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2cacb4424cd4..af4eab9cbc51 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2023,7 +2023,40 @@  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
+		/*
+		 * 1. We must always clear IOCB_DSYNC in order to not deadlock
+		 *    in iomap, as it calls generic_write_sync() in this case.
+		 * 2. If we are async, we can call iomap_dio_complete() either
+		 *    in
+		 *
+		 *    2.1. A worker thread from the last bio completed.  In this
+		 *	   case we need to mark the btrfs_dio_data that it is
+		 *	   async in order to call generic_write_sync() properly.
+		 *	   This is handled by setting BTRFS_DIO_SYNC_STUB in the
+		 *	   current->journal_info.
+		 *    2.2  The submitter context, because all IO completed
+		 *         before we exited iomap_dio_rw().  In this case we can
+		 *         just re-set the IOCB_DSYNC on the iocb and we'll do
+		 *         the sync below.  If our ->end_io() gets called and
+		 *         current->journal_info is set, then we know we're in
+		 *         our current context and we will clear
+		 *         current->journal_info to indicate that we need to
+		 *         sync below.
+		 */
+		if (sync) {
+			ASSERT(current->journal_info == NULL);
+			iocb->ki_flags &= ~IOCB_DSYNC;
+			current->journal_info = BTRFS_DIO_SYNC_STUB;
+		}
 		num_written = __btrfs_direct_write(iocb, from);
+
+		/*
+		 * As stated above, we cleared journal_info, so we need to do
+		 * the sync ourselves.
+		 */
+		if (sync && current->journal_info == NULL)
+			iocb->ki_flags |= IOCB_DSYNC;
+		current->journal_info = NULL;
 	} else {
 		num_written = btrfs_buffered_write(iocb, from);
 		if (num_written > 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b63b858546e..170a929eec81 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,6 +62,7 @@  struct btrfs_dio_data {
 	loff_t length;
 	ssize_t submitted;
 	struct extent_changeset *data_reserved;
+	bool sync;
 };
 
 static const struct inode_operations btrfs_dir_inode_operations;
@@ -7334,6 +7335,17 @@  static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	int ret = 0;
 	u64 len = length;
 	bool unlock_extents = false;
+	bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+	/*
+	 * We used current->journal_info here to see if we were sync, but
+	 * there's a lot of tests in the enospc machinery to not do flushing if
+	 * we have a journal_info set, so we need to clear this out and re-set
+	 * it in iomap_end.
+	 */
+	ASSERT(current->journal_info == NULL ||
+	       current->journal_info == BTRFS_DIO_SYNC_STUB);
+	current->journal_info = NULL;
 
 	if (!write)
 		len = min_t(u64, len, fs_info->sectorsize);
@@ -7359,6 +7371,7 @@  static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	if (!dio_data)
 		return -ENOMEM;
 
+	dio_data->sync = sync;
 	dio_data->length = length;
 	if (write) {
 		dio_data->reserve = round_up(length, fs_info->sectorsize);
@@ -7506,6 +7519,14 @@  static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 		extent_changeset_free(dio_data->data_reserved);
 	}
 out:
+	/*
+	 * We're all done, we can re-set the current->journal_info now safely
+	 * for our endio.
+	 */
+	if (dio_data->sync) {
+		ASSERT(current->journal_info == NULL);
+		current->journal_info = BTRFS_DIO_SYNC_STUB;
+	}
 	kfree(dio_data);
 	iomap->private = NULL;
 
@@ -7914,6 +7935,30 @@  static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 	return retval;
 }
 
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+					   int error, unsigned flags)
+{
+	/*
+	 * Now if we're still in the context of our submitter we know we can't
+	 * safely run generic_write_sync(), so clear our flag here so that the
+	 * caller knows to follow up with a sync.
+	 */
+	if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+		current->journal_info = NULL;
+		return error;
+	}
+
+	if (error)
+		return error;
+
+	if (size) {
+		iocb->ki_flags |= IOCB_DSYNC;
+		return generic_write_sync(iocb, size);
+	}
+
+	return 0;
+}
+
 static const struct iomap_ops btrfs_dio_iomap_ops = {
 	.iomap_begin            = btrfs_dio_iomap_begin,
 	.iomap_end              = btrfs_dio_iomap_end,
@@ -7923,6 +7968,11 @@  static const struct iomap_dio_ops btrfs_dops = {
 	.submit_io		= btrfs_submit_direct,
 };
 
+static const struct iomap_dio_ops btrfs_sync_dops = {
+	.submit_io		= btrfs_submit_direct,
+	.end_io			= btrfs_maybe_fsync_end_io,
+};
+
 ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
@@ -7951,8 +8001,16 @@  ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		down_read(&BTRFS_I(inode)->dio_sem);
 	}
 
-	ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dops,
-			is_sync_kiocb(iocb));
+	/*
+	 * We have are actually a sync iocb, so we need our fancy endio to know
+	 * if we need to sync.
+	 */
+	if (current->journal_info)
+		ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+				   &btrfs_sync_dops, is_sync_kiocb(iocb));
+	else
+		ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+				   &btrfs_dops, is_sync_kiocb(iocb));
 
 	if (ret == -ENOTBLK)
 		ret = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 8241c050ba71..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -112,6 +112,7 @@  struct btrfs_transaction {
 #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
 
 #define BTRFS_SEND_TRANS_STUB	((void *)1)
+#define BTRFS_DIO_SYNC_STUB	((void *)2)
 
 struct btrfs_trans_handle {
 	u64 transid;