@@ -167,6 +167,7 @@ struct dax_device {
#if IS_ENABLED(CONFIG_FS_DAX)
static void generic_dax_pagefree(struct page *page, void *data)
{
+ wake_up_devmap_idle(&page->_refcount);
}
struct dax_device *fs_dax_claim_bdev(struct block_device *bdev, void *owner)
@@ -346,6 +346,19 @@ static void dax_disassociate_entry(void *entry, struct inode *inode, bool trunc)
}
}
+static struct page *dma_busy_page(void *entry)
+{
+ unsigned long pfn, end_pfn;
+
+ for_each_entry_pfn(entry, pfn, end_pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (page_ref_count(page) > 1)
+ return page;
+ }
+ return NULL;
+}
+
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -487,6 +500,97 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
return entry;
}
+static int wait_page(atomic_t *_refcount)
+{
+ struct page *page = container_of(_refcount, struct page, _refcount);
+ struct inode *inode = page->inode;
+
+ if (page_ref_count(page) == 1)
+ return 0;
+
+ i_daxdma_unlock_shared(inode);
+ schedule();
+ i_daxdma_lock_shared(inode);
+
+ /*
+ * if we bounced the daxdma_lock then we need to rescan the
+ * truncate area.
+ */
+ return 1;
+}
+
+void dax_wait_dma(struct address_space *mapping, loff_t lstart, loff_t len)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t start, end, index;
+ struct pagevec pvec;
+ unsigned i;
+
+ lockdep_assert_held(&inode->i_dax_dmasem);
+
+ if (lstart < 0 || len < -1)
+ return;
+
+ /* in the limited case get_user_pages for dax is disabled */
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return;
+
+ if (!dax_mapping(mapping))
+ return;
+
+ if (mapping->nrexceptional == 0)
+ return;
+
+ if (len == -1)
+ end = -1;
+ else
+ end = (lstart + len) >> PAGE_SHIFT;
+ start = lstart >> PAGE_SHIFT;
+
+retry:
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *pvec_ent = pvec.pages[i];
+ struct page *page = NULL;
+ void *entry;
+
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (!radix_tree_exceptional_entry(pvec_ent))
+ continue;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (entry)
+ page = dma_busy_page(entry);
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (page && wait_on_devmap_idle(&page->_refcount,
+ wait_page,
+ TASK_UNINTERRUPTIBLE) != 0) {
+ /*
+ * We dropped the dma lock, so we need
+ * to revalidate that previously seen
+ * idle pages are still idle.
+ */
+ goto retry;
+ }
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ index++;
+ }
+}
+EXPORT_SYMBOL_GPL(dax_wait_dma);
+
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -509,8 +613,10 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
out:
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+
return ret;
}
+
/*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it.
@@ -192,6 +192,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+ i_daxdma_init(inode);
this_cpu_inc(nr_inodes);
return 0;
@@ -100,10 +100,15 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
#ifdef CONFIG_FS_DAX
+void dax_wait_dma(struct address_space *mapping, loff_t lstart, loff_t len);
int __dax_zero_page_range(struct block_device *bdev,
struct dax_device *dax_dev, sector_t sector,
unsigned int offset, unsigned int length);
#else
+static inline void dax_wait_dma(struct address_space *mapping, loff_t lstart,
+ loff_t len)
+{
+}
static inline int __dax_zero_page_range(struct block_device *bdev,
struct dax_device *dax_dev, sector_t sector,
unsigned int offset, unsigned int length)
@@ -645,6 +645,9 @@ struct inode {
#ifdef CONFIG_IMA
atomic_t i_readcount; /* struct files open RO */
#endif
+#ifdef CONFIG_FS_DAX
+ struct rw_semaphore i_dax_dmasem;
+#endif
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock_context *i_flctx;
struct address_space i_data;
@@ -747,6 +750,59 @@ static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
down_write_nested(&inode->i_rwsem, subclass);
}
+#ifdef CONFIG_FS_DAX
+static inline void i_daxdma_init(struct inode *inode)
+{
+ init_rwsem(&inode->i_dax_dmasem);
+}
+
+static inline void i_daxdma_lock(struct inode *inode)
+{
+ down_write(&inode->i_dax_dmasem);
+}
+
+static inline void i_daxdma_unlock(struct inode *inode)
+{
+ up_write(&inode->i_dax_dmasem);
+}
+
+static inline void i_daxdma_lock_shared(struct inode *inode)
+{
+ /*
+ * The write lock is taken under mmap_sem in the
+ * get_user_pages() path; the read lock nests in the
+ * truncate path.
+ */
+#define DAXDMA_TRUNCATE_CLASS 1
+ down_read_nested(&inode->i_dax_dmasem, DAXDMA_TRUNCATE_CLASS);
+}
+
+static inline void i_daxdma_unlock_shared(struct inode *inode)
+{
+ up_read(&inode->i_dax_dmasem);
+}
+#else /* CONFIG_FS_DAX */
+static inline void i_daxdma_init(struct inode *inode)
+{
+}
+
+static inline void i_daxdma_lock(struct inode *inode)
+{
+}
+
+static inline void i_daxdma_unlock(struct inode *inode)
+{
+}
+
+static inline void i_daxdma_lock_shared(struct inode *inode)
+{
+}
+
+static inline void i_daxdma_unlock_shared(struct inode *inode)
+{
+}
+#endif /* CONFIG_FS_DAX */
+
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);
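
To make the nesting described in the i_daxdma_lock_shared() comment above concrete, here is an illustrative sketch (annotation only, not part of the patch) of the two lock orders; the names follow the surrounding code in the mm/gup.c and fs/dax.c hunks:

	/* get_user_pages() side: exclusive (write) lock, taken under mmap_sem */
	down_read(&mm->mmap_sem);
	i_daxdma_lock(inode);			/* down_write(&inode->i_dax_dmasem) */
	page = follow_page_mask(vma, start, foll_flags, &page_mask);
	i_daxdma_unlock(inode);
	up_read(&mm->mmap_sem);

	/* truncate / hole-punch side: shared (read) lock, nested subclass */
	i_daxdma_lock_shared(inode);		/* down_read_nested(..., DAXDMA_TRUNCATE_CLASS) */
	dax_wait_dma(mapping, lstart, len);	/* block until DMA-busy pages go idle */
	/* ... unmap and truncate the range ... */
	i_daxdma_unlock_shared(inode);
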
@@ -30,10 +30,12 @@ int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(void *word, int bit);
void wake_up_atomic_t(atomic_t *p);
+void wake_up_devmap_idle(atomic_t *p);
int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
+int out_of_line_wait_on_devmap_idle(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
struct wait_queue_head *bit_waitqueue(void *word, int bit);
extern void __init wait_bit_init(void);
@@ -258,4 +260,12 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
return out_of_line_wait_on_atomic_t(val, action, mode);
}
+static inline
+int wait_on_devmap_idle(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+{
+ might_sleep();
+ if (atomic_read(val) == 1)
+ return 0;
+ return out_of_line_wait_on_devmap_idle(val, action, mode);
+}
#endif /* _LINUX_WAIT_BIT_H */
@@ -162,11 +162,17 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
return bit_waitqueue(p, 0);
}
-static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
- void *arg)
+static inline struct wait_bit_queue_entry *to_wait_bit_q(
+ struct wait_queue_entry *wq_entry)
+{
+ return container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+}
+
+static int wake_atomic_t_function(struct wait_queue_entry *wq_entry,
+ unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
- struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+ struct wait_bit_queue_entry *wait_bit = to_wait_bit_q(wq_entry);
atomic_t *val = key->flags;
if (wait_bit->key.flags != key->flags ||
@@ -176,14 +182,29 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
return autoremove_wake_function(wq_entry, mode, sync, key);
}
+static int wake_devmap_idle_function(struct wait_queue_entry *wq_entry,
+ unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wait_bit = to_wait_bit_q(wq_entry);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 1)
+ return 0;
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
/*
* To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
* the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
* return codes halt waiting and return.
*/
static __sched
-int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
- int (*action)(atomic_t *), unsigned mode)
+int __wait_on_atomic_t(struct wait_queue_head *wq_head,
+ struct wait_bit_queue_entry *wbq_entry,
+ int (*action)(atomic_t *), unsigned mode, int target)
{
atomic_t *val;
int ret = 0;
@@ -191,10 +212,10 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
do {
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
val = wbq_entry->key.flags;
- if (atomic_read(val) == 0)
+ if (atomic_read(val) == target)
break;
ret = (*action)(val);
- } while (!ret && atomic_read(val) != 0);
+ } while (!ret && atomic_read(val) != target);
finish_wait(wq_head, &wbq_entry->wq_entry);
return ret;
}
@@ -210,16 +231,37 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
}, \
}
+#define DEFINE_WAIT_DEVMAP_IDLE(name, p) \
+ struct wait_bit_queue_entry name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wq_entry = { \
+ .private = current, \
+ .func = wake_devmap_idle_function, \
+ .entry = \
+ LIST_HEAD_INIT((name).wq_entry.entry), \
+ }, \
+ }
+
__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
unsigned mode)
{
struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
DEFINE_WAIT_ATOMIC_T(wq_entry, p);
- return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
+ return __wait_on_atomic_t(wq_head, &wq_entry, action, mode, 0);
}
EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+__sched int out_of_line_wait_on_devmap_idle(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
+ DEFINE_WAIT_DEVMAP_IDLE(wq_entry, p);
+
+ return __wait_on_atomic_t(wq_head, &wq_entry, action, mode, 1);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_devmap_idle);
+
/**
* wake_up_atomic_t - Wake up a waiter on a atomic_t
* @p: The atomic_t being waited on, a kernel virtual address
@@ -235,6 +277,12 @@ void wake_up_atomic_t(atomic_t *p)
}
EXPORT_SYMBOL(wake_up_atomic_t);
+void wake_up_devmap_idle(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_devmap_idle);
+
__sched int bit_wait(struct wait_bit_key *word, int mode)
{
schedule();
@@ -579,6 +579,41 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return 0;
}
+static struct inode *do_dax_lock(struct vm_area_struct *vma,
+ unsigned int foll_flags)
+{
+ struct file *file;
+ struct inode *inode;
+
+ if (!(foll_flags & FOLL_GET))
+ return NULL;
+ if (!vma_is_dax(vma))
+ return NULL;
+ file = vma->vm_file;
+ inode = file_inode(file);
+ if (S_ISCHR(inode->i_mode))
+ return NULL;
+ return inode;
+}
+
+static struct inode *dax_truncate_lock(struct vm_area_struct *vma,
+ unsigned int foll_flags)
+{
+ struct inode *inode = do_dax_lock(vma, foll_flags);
+
+ if (!inode)
+ return NULL;
+ i_daxdma_lock(inode);
+ return inode;
+}
+
+static void dax_truncate_unlock(struct inode *inode)
+{
+ if (!inode)
+ return;
+ i_daxdma_unlock(inode);
+}
+
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
@@ -659,6 +694,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
do {
struct page *page;
+ struct inode *inode;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
@@ -693,7 +729,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
cond_resched();
+ inode = dax_truncate_lock(vma, foll_flags);
page = follow_page_mask(vma, start, foll_flags, &page_mask);
+ dax_truncate_unlock(inode);
if (!page) {
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
commit 67d952314e9989b3b1945c50488f4a0f760264c3
Author: Dan Williams <dan.j.williams@intel.com>
Date: Tue Oct 24 13:41:22 2017 -0700
xfs: wire up dax dma waiting
The dax-dma vs truncate collision avoidance involves acquiring the new
i_dax_dmasem and validating that no ranges that are about to be mapped
out of the file are active for DMA. If any are found we wait for the
pages to go idle and retry the scan. The locations where we implement
this wait line up with where we currently wait for pNFS layout leases
to expire.

Since we need both DMA to be idle and leases to be broken, and since
xfs_break_layouts() drops locks, we need to retry the DMA-busy scan
until we can complete a pass that finds no busy pages.
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
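
For review purposes, a condensed sketch (not part of the patch) of the retry pattern that the hunks below wire into xfs_file_fallocate() and xfs_ioc_space(); it assumes the new xfs_break_layouts() return convention introduced at the end of this patch (negative errno on failure, positive when locks had to be dropped to break a lease):

	uint iolock = XFS_DAXDMA_LOCK_SHARED;

retry:
	xfs_ilock(ip, iolock);				/* i_daxdma_lock_shared() */
	dax_wait_dma(inode->i_mapping, offset, len);	/* wait for DMA-busy pages */

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	iolock |= XFS_IOLOCK_EXCL;
	error = xfs_break_layouts(inode, &iolock);
	if (error < 0)
		goto out_unlock;			/* hard error */
	else if (error > 0 && IS_ENABLED(CONFIG_FS_DAX)) {
		/*
		 * Breaking leases dropped our locks; pages that were idle
		 * may be DMA-busy again, so restart the scan.
		 */
		xfs_iunlock(ip, iolock);
		iolock = XFS_DAXDMA_LOCK_SHARED;
		goto retry;
	}
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	iolock |= XFS_MMAPLOCK_EXCL;
	/* ... perform the space manipulation ... */
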
@@ -347,7 +347,7 @@ xfs_file_aio_write_checks(
return error;
error = xfs_break_layouts(inode, iolock);
- if (error)
+ if (error < 0)
return error;
/*
@@ -762,7 +762,7 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
enum xfs_prealloc_flags flags = 0;
- uint iolock = XFS_IOLOCK_EXCL;
+ uint iolock = XFS_DAXDMA_LOCK_SHARED;
loff_t new_size = 0;
bool do_file_insert = 0;
@@ -771,10 +771,20 @@ xfs_file_fallocate(
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+retry:
xfs_ilock(ip, iolock);
+ dax_wait_dma(inode->i_mapping, offset, len);
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ iolock |= XFS_IOLOCK_EXCL;
error = xfs_break_layouts(inode, &iolock);
- if (error)
+ if (error < 0)
goto out_unlock;
+ else if (error > 0 && IS_ENABLED(CONFIG_FS_DAX)) {
+ xfs_iunlock(ip, iolock);
+ iolock = XFS_DAXDMA_LOCK_SHARED;
+ goto retry;
+ }
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;
@@ -171,7 +171,14 @@ xfs_ilock_attr_map_shared(
* taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
- */
+ *
+ * The XFS_DAXDMA_LOCK_SHARED lock is a CONFIG_FS_DAX special case lock
+ * for synchronizing truncate vs ongoing DMA. The get_user_pages() path
+ * will hold this lock exclusively when incrementing page reference
+ * counts for DMA. Before an extent can be truncated we need to complete
+ * a validate-idle sweep of all pages in the range while holding this
+ * lock in shared mode.
+ */
void
xfs_ilock(
xfs_inode_t *ip,
@@ -192,6 +199,9 @@ xfs_ilock(
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ if (lock_flags & XFS_DAXDMA_LOCK_SHARED)
+ i_daxdma_lock_shared(VFS_I(ip));
+
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
XFS_IOLOCK_DEP(lock_flags));
@@ -328,6 +338,9 @@ xfs_iunlock(
else if (lock_flags & XFS_ILOCK_SHARED)
mrunlock_shared(&ip->i_lock);
+ if (lock_flags & XFS_DAXDMA_LOCK_SHARED)
+ i_daxdma_unlock_shared(VFS_I(ip));
+
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -283,10 +283,12 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
#define XFS_ILOCK_SHARED (1<<3)
#define XFS_MMAPLOCK_EXCL (1<<4)
#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_DAXDMA_LOCK_SHARED (1<<6)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
- | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
+ | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED \
+ | XFS_DAXDMA_LOCK_SHARED)
#define XFS_LOCK_FLAGS \
{ XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
@@ -294,7 +296,8 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
{ XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
{ XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
{ XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
- { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
+ { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }, \
+ { XFS_DAXDMA_LOCK_SHARED, "DAXDMA_LOCK_SHARED" }
/*
@@ -612,7 +612,7 @@ xfs_ioc_space(
struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
- uint iolock = XFS_IOLOCK_EXCL;
+ uint iolock = XFS_DAXDMA_LOCK_SHARED;
int error;
/*
@@ -637,18 +637,6 @@ xfs_ioc_space(
if (filp->f_mode & FMODE_NOCMTIME)
flags |= XFS_PREALLOC_INVISIBLE;
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
- if (error)
- goto out_unlock;
-
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- iolock |= XFS_MMAPLOCK_EXCL;
-
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -659,10 +647,31 @@ xfs_ioc_space(
bf->l_start += XFS_ISIZE(ip);
break;
default:
- error = -EINVAL;
+ return -EINVAL;
+ }
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
+retry:
+ xfs_ilock(ip, iolock);
+ dax_wait_dma(inode->i_mapping, bf->l_start, bf->l_len);
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ iolock |= XFS_IOLOCK_EXCL;
+ error = xfs_break_layouts(inode, &iolock);
+ if (error < 0)
goto out_unlock;
+ else if (error > 0 && IS_ENABLED(CONFIG_FS_DAX)) {
+ xfs_iunlock(ip, iolock);
+ iolock = XFS_DAXDMA_LOCK_SHARED;
+ goto retry;
}
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock |= XFS_MMAPLOCK_EXCL;
+
/*
* length of <= 0 for resv/unresv/zero is invalid. length for
* alloc/free is ignored completely and we have no idea what userspace
@@ -35,18 +35,19 @@ xfs_break_layouts(
uint *iolock)
{
struct xfs_inode *ip = XFS_I(inode);
- int error;
+ int error, did_unlock = 0;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
+ did_unlock = 1;
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, *iolock);
}
- return error;
+ return error < 0 ? error : did_unlock;
}
/*