@@ -445,6 +445,7 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
+ bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
@@ -1535,21 +1535,37 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
+again:
from->nofault = true;
dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
- /*
- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
- * iocb, and that needs to lock the inode. So unlock it before calling
- * iomap_dio_complete() to avoid a deadlock.
- */
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
- if (IS_ERR_OR_NULL(dio))
+ if (IS_ERR_OR_NULL(dio)) {
err = PTR_ERR_OR_ZERO(dio);
- else
+ } else {
+ struct btrfs_file_private stack_private = { 0 };
+ struct btrfs_file_private *private;
+ const bool have_private = (file->private_data != NULL);
+
+ if (!have_private)
+ file->private_data = &stack_private;
+
+ /*
+ * If we have a synchoronous write, we must make sure the fsync
+ * triggered by the iomap_dio_complete() call below doesn't
+ * deadlock on the inode lock - we are already holding it and we
+ * can't call it after unlocking because we may need to complete
+ * partial writes due to the input buffer (or parts of it) not
+ * being already faulted in.
+ */
+ private = file->private_data;
+ private->fsync_skip_inode_lock = true;
err = iomap_dio_complete(dio);
+ private->fsync_skip_inode_lock = false;
+
+ if (!have_private)
+ file->private_data = NULL;
+ }
/* No increment (+=) because iomap returns a cumulative value. */
if (err > 0)
@@ -1576,10 +1592,12 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
- goto relock;
+ goto again;
}
}
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
/*
* If 'err' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
@@ -1778,6 +1796,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1787,6 +1806,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
+ const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
trace_btrfs_sync_file(file, datasync);
@@ -1814,7 +1834,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1838,7 +1861,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1941,7 +1967,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);