@@ -350,6 +350,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
+ pagecache_lock_init(&mapping->add_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
@@ -418,6 +418,28 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like the read side of an rwsem, but each state conflicts with the other:
+ */
+struct pagecache_lock {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+void pagecache_add_put(struct pagecache_lock *);
+void pagecache_add_get(struct pagecache_lock *);
+void __pagecache_block_put(struct pagecache_lock *);
+void __pagecache_block_get(struct pagecache_lock *);
+void pagecache_block_put(struct pagecache_lock *);
+void pagecache_block_get(struct pagecache_lock *);
+
/**
* struct address_space - Contents of a cacheable, mappable object.
* @host: Owner, either the inode or the block_device.
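A note on why two flavours of the block-side helpers are declared above: pagecache_block_get()/put() record the lock in current->pagecache_lock (used later in mm/filemap.c so the holder can keep adding pages), while the double-underscore variants leave that field alone. My reading, which the patch itself does not state, is that the plain variants are for the one lock a task records on itself and the __ variants are for any additional mapping it must block at the same time. A sketch under that assumption, with an invented function name:

	/* Hypothetical: exclude page-cache adds on two mappings at once. */
	static void example_block_both(struct address_space *a,
				       struct address_space *b)
	{
		pagecache_block_get(&a->add_lock);	/* recorded in current->pagecache_lock */
		__pagecache_block_get(&b->add_lock);	/* second lock cannot be recorded */

		/* ... work that must not race with adds to either mapping ... */

		__pagecache_block_put(&b->add_lock);
		pagecache_block_put(&a->add_lock);
	}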
@@ -452,6 +474,8 @@ struct address_space {
spinlock_t private_lock;
struct list_head private_list;
void *private_data;
+ struct pagecache_lock add_lock
+ ____cacheline_aligned_in_smp; /* protects adding new pages */
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
@@ -43,6 +43,7 @@ struct io_context;
struct mempolicy;
struct nameidata;
struct nsproxy;
+struct pagecache_lock;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
@@ -935,6 +936,9 @@ struct task_struct {
unsigned int in_ubsan;
#endif
+ /* currently held lock, to avoid recursing in the fault path: */
+ struct pagecache_lock *pagecache_lock;
+
/* Journalling filesystem info: */
void *journal_info;
@@ -115,6 +115,7 @@ struct task_struct init_task
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
+ .pagecache_lock = NULL,
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
@@ -113,6 +113,73 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+ BUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void pagecache_add_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, 1);
+}
+EXPORT_SYMBOL(pagecache_add_put);
+
+void pagecache_add_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, 1);
+}
+EXPORT_SYMBOL(pagecache_add_get);
+
+void __pagecache_block_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, -1);
+}
+EXPORT_SYMBOL(__pagecache_block_put);
+
+void __pagecache_block_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, -1);
+}
+EXPORT_SYMBOL(__pagecache_block_get);
+
+void pagecache_block_put(struct pagecache_lock *lock)
+{
+ BUG_ON(current->pagecache_lock != lock);
+ current->pagecache_lock = NULL;
+ __pagecache_lock_put(lock, -1);
+}
+EXPORT_SYMBOL(pagecache_block_put);
+
+void pagecache_block_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, -1);
+ BUG_ON(current->pagecache_lock);
+ current->pagecache_lock = lock;
+}
+EXPORT_SYMBOL(pagecache_block_get);
+
static void page_cache_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
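For anyone reading __pagecache_lock_tryget() cold: positive values of lock->v count add-side holders, negative values count block-side holders, and zero means unlocked, so the test (i > 0 ? v < 0 : v > 0) fails an attempt only while the opposite state is held. The user-space model below is mine (plain C11 atomics, with the wait/wake side and the acquire/release ordering omitted), not kernel code, but it follows the same cmpxchg loop:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the two-state counter: +1 per add holder, -1 per block holder. */
	static atomic_long v = 0;

	static bool model_tryget(long i)
	{
		long old = atomic_load(&v);

		do {
			/* An add (i > 0) fails while blockers hold it (v < 0), and vice versa. */
			if (i > 0 ? old < 0 : old > 0)
				return false;
			/* On CAS failure, old is reloaded with the current value. */
		} while (!atomic_compare_exchange_weak(&v, &old, old + i));

		return true;
	}

	int main(void)
	{
		printf("add:   %d\n", model_tryget(+1));	/* 1: v goes 0 -> 1 */
		printf("add:   %d\n", model_tryget(+1));	/* 1: add is shared, v -> 2 */
		printf("block: %d\n", model_tryget(-1));	/* 0: conflicts while adds are held */
		return 0;
	}

The real implementation additionally sleeps in wait_event() until the tryget succeeds, and wakes all waiters once the counter drops back to zero in __pagecache_lock_put().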
@@ -829,11 +896,14 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_get(&mapping->add_lock);
+
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
gfp_mask, &memcg, false);
if (error)
- return error;
+ goto out;
}
get_page(page);
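The current->pagecache_lock comparison added above deserves a remark: a task holding the block side of this mapping would deadlock here if it unconditionally took the add side, yet such a task may legitimately need to populate pages itself (an INSERT_RANGE implementation writing new data, for instance). A hedged sketch with an invented function name, not part of this patch:

	/* Hypothetical caller: holds the block side, then adds a page itself. */
	static int example_insert_range(struct address_space *mapping, pgoff_t index)
	{
		struct page *page;
		int ret = -ENOMEM;

		pagecache_block_get(&mapping->add_lock);	/* sets current->pagecache_lock */

		page = alloc_page(GFP_KERNEL);
		if (!page)
			goto out;

		/*
		 * __add_to_page_cache_locked() sees current->pagecache_lock ==
		 * &mapping->add_lock and skips pagecache_add_get(), so this cannot
		 * deadlock against the block side we already hold.
		 */
		ret = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
		put_page(page);
	out:
		pagecache_block_put(&mapping->add_lock);
		return ret;
	}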
@@ -869,14 +939,19 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
- return 0;
+ error = 0;
+out:
+ if (current->pagecache_lock != &mapping->add_lock)
+ pagecache_add_put(&mapping->add_lock);
+ return error;
error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return xas_error(&xas);
+ error = xas_error(&xas);
+ goto out;
}
/**
Add a per address space lock around adding pages to the pagecache - making
it possible for fallocate INSERT_RANGE/COLLAPSE_RANGE to work correctly, and
also hopefully making truncate and dio a bit saner.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 fs/inode.c            |  1 +
 include/linux/fs.h    | 24 +++++++++++++
 include/linux/sched.h |  4 +++
 init/init_task.c      |  1 +
 mm/filemap.c          | 81 +++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 108 insertions(+), 3 deletions(-)
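To make the intended fallocate usage concrete, a minimal sketch of a COLLAPSE_RANGE-style path taking the block side around the page-cache work; the function name and the choice of invalidate_inode_pages2_range() as a stand-in for the real body are mine, not taken from this patch:

	/* Illustrative only: a hypothetical range operation excluding page-cache adds. */
	static int example_collapse_range(struct address_space *mapping,
					  loff_t offset, loff_t len)
	{
		int ret;

		pagecache_block_get(&mapping->add_lock);

		/* ... remove and shift pages, update the on-disk mapping ... */
		ret = invalidate_inode_pages2_range(mapping, offset >> PAGE_SHIFT,
						    (offset + len - 1) >> PAGE_SHIFT);

		pagecache_block_put(&mapping->add_lock);
		return ret;
	}

While the block side is held, any concurrent __add_to_page_cache_locked() caller waits in pagecache_add_get(), so no new pages can appear in the range being operated on.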