diff mbox series

DIO & page cache coherency locking + deadlock prevention

Message ID 20201111191011.GE3365678@moria.home.lan (mailing list archive)
State New
Headers show
Series DIO & page cache coherency locking + deadlock prevention | expand

Commit Message

Kent Overstreet Nov. 11, 2020, 7:10 p.m. UTC
Background: for bcachefs I've been trying to solve the dio page cache coherency
problem. In short, existing filesystems to varying degrees have a problem with
operations that modify file data while bypassing the page cache, because while
we can shoot down/invalidate regions of the page cache just fine, and the inode
lock will prevent reads and writes from syscalls from pulling those pages back
into the page cache - page faults are a different story.

And where this gets really tricky is that direct IO via gup() invokes the page
fault handler, meaning that if the page fault handler is taking the same lock
that the dio write path is using to prevent the pages it invalidated from being
faulted back in - and it kinda has to - oops, deadlock.

The standard approach to these kinds of deadlocks is to check for a lock
ordering violation when taking the 2nd/additional locks, and then if there is a
lock ordering violation and trylock fails - drop and retake locks in the correct
order, and unwind and retry from the top because whatever state we had locked
may have changed.

This means on lock ordering violation the fault handler has to tell the dio
write code that's calling gup() that it has to re-shoot down the page cache and

I finally got around to implementing all this for bcachefs, and it turned out to
be a lot less nasty than I expected - and I've got it passing a torture test
that tries to hit this deadlock, where I've got 3 different processes all doing
dio writes to one file from a buffer that's mapped from the next processes's

So, I'm posting the code to see if there's any interest in having this locking
for other filesystems. In my bcachefs tree I've got this all in fs/bcachefs/,
minus adding one entry to task_struct so we can pass info from the dio write
path to the fault handler. But if this is of interest to other filesystems it
needs to be in generic code in filemap.c, for obvious reasons.

The code below is on top of the existing bcachefs code that implements
ei_pagecache_lock, all it adds is the deadlock detection/handling but it should
be enough to show the general approach:

commit 2872719261668fa9f0cffe04f4895686c8207778
Author: Kent Overstreet <kent.overstreet@gmail.com>
Date:   Wed Nov 11 12:33:12 2020 -0500

    bcachefs: Deadlock prevention for ei_pagecache_lock
diff mbox series


diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 1eb69ed38b..0278e7c156 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,6 +35,22 @@ 
 #include <trace/events/bcachefs.h>
 #include <trace/events/writeback.h>
+static inline struct address_space *faults_disabled_mapping(void)
+	return (void *) (((unsigned long) current->faults_disabled_mapping) & (~0UL << 1));
+static inline void set_fdm_dropped_locks(void)
+	current->faults_disabled_mapping =
+		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
+static inline bool fdm_dropped_locks(void)
+	return ((unsigned long) current->faults_disabled_mapping) & 1;
 struct quota_res {
 	u64				sectors;
@@ -493,10 +509,35 @@  static void bch2_set_page_dirty(struct bch_fs *c,
 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
 	struct file *file = vmf->vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct address_space *fdm = faults_disabled_mapping();
 	struct bch_inode_info *inode = file_bch_inode(file);
 	int ret;
+	if (fdm == mapping)
+		return VM_FAULT_SIGBUS;
+	/* Lock ordering: */
+	if (fdm > mapping) {
+		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+		if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+			goto got_lock;
+		bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+		bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+		bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+		/* Signal that lock has been dropped: */
+		set_fdm_dropped_locks();
+		return VM_FAULT_SIGBUS;
+	}
 	ret = filemap_fault(vmf);
@@ -1742,14 +1783,16 @@  static long bch2_dio_write_loop(struct dio_write *dio)
 	struct bio *bio = &dio->op.wbio.bio;
 	struct bvec_iter_all iter;
 	struct bio_vec *bv;
-	unsigned unaligned;
-	bool sync = dio->sync;
+	unsigned unaligned, iter_count;
+	bool sync = dio->sync, dropped_locks;
 	long ret;
 	if (dio->loop)
 		goto loop;
 	while (1) {
+		iter_count = dio->iter.count;
 		if (kthread)
@@ -1757,13 +1800,34 @@  static long bch2_dio_write_loop(struct dio_write *dio)
 		ret = bio_iov_iter_get_pages(bio, &dio->iter);
+		dropped_locks = fdm_dropped_locks();
 		current->faults_disabled_mapping = NULL;
 		if (kthread)
+		/*
+		 * If the fault handler returned an error but also signalled
+		 * that it dropped & retook ei_pagecache_lock, we just need to
+		 * re-shoot down the page cache and retry:
+		 */
+		if (dropped_locks && ret)
+			ret = 0;
 		if (unlikely(ret < 0))
 			goto err;
+		if (unlikely(dropped_locks)) {
+			ret = write_invalidate_inode_pages_range(mapping,
+					req->ki_pos,
+					req->ki_pos + iter_count - 1);
+			if (unlikely(ret))
+				goto err;
+			if (!bio->bi_iter.bi_size)
+				continue;
+		}
 		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
 		bio->bi_iter.bi_size -= unaligned;
 		iov_iter_revert(&dio->iter, unaligned);