diff mbox

[v2,1/2] Revert "mm: take i_mmap_lock in unmap_mapping_range() for DAX"

Message ID 1443830494-8748-2-git-send-email-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ross Zwisler Oct. 3, 2015, 12:01 a.m. UTC
This reverts commits 46c043ede4711e8d598b9d63c5616c1fedb0605e
and 8346c416d17bf5b4ea1508662959bb62e73fd6a5.

The following two locking commits in the DAX code:

commit 843172978bb9 ("dax: fix race between simultaneous faults")
commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX")

introduced a number of deadlocks and other issues, and need to be
reverted for the v4.3 kernel. The list of issues in DAX after these
commits (some newly introduced by the commits, some preexisting) can be
found here:

https://lkml.org/lkml/2015/9/25/602

This revert keeps the PMEM API changes to the zeroing code in
__dax_pmd_fault(), which were added by this commit:

commit d77e92e270ed ("dax: update PMD fault handler with PMEM API")

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c    | 50 ++++++++++++++++++--------------------------------
 mm/memory.c | 11 +++++++++--
 2 files changed, 27 insertions(+), 34 deletions(-)

Comments

Ross Zwisler Oct. 4, 2015, 5:24 a.m. UTC | #1
On Fri, Oct 02, 2015 at 06:01:32PM -0600, Ross Zwisler wrote:
> This series reverts some recent changes to the locking scheme in DAX introduced
> by these two commits:
> 
> commit 843172978bb9 ("dax: fix race between simultaneous faults")
> commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX")
> 
> Changes from v1:
>  -  Squashed patches 1 and 2 from the first series into a single patch to avoid
>     adding another spot in the git history where we could end up referencing an
>     uninitialized pointer.
> 
> Ross Zwisler (2):
>   Revert "mm: take i_mmap_lock in unmap_mapping_range() for DAX"
>   Revert "dax: fix race between simultaneous faults"
> 
>  fs/dax.c    | 83 +++++++++++++++++++++++++------------------------------------
>  mm/memory.c |  2 ++
>  2 files changed, 36 insertions(+), 49 deletions(-)
> 
> -- 
> 2.1.0

*sigh* - even after these reverts we can deadlock on in the DAX PMD code with
its original locking scheme.  I can hit them 100% of the time with either
generic/074 or generic/198 using either XFS or ext4.  I'll debug exactly
what's going on on Monday.

The quick and easy workaround for this is to do a "return VM_FAULT_FALLBACK;"
at the beginning of __dax_pmd_fault() to just turn off PMD faults while we
rework the locking for v4.4.  This saves us reverting and re-adding all the
PMD code, and will let us ship v4.3 without known deadlocks.

Other better ideas?

- Ross
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/dax.c b/fs/dax.c
index bcfb14b..de3f53e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -569,38 +569,6 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
 		goto fallback;
 
-	sector = bh.b_blocknr << (blkbits - 9);
-
-	if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-		int i;
-
-		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-						bh.b_size);
-		if (length < 0) {
-			result = VM_FAULT_SIGBUS;
-			goto out;
-		}
-		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
-			goto fallback;
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		result |= VM_FAULT_MAJOR;
-	}
-
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
-		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
-	}
-
 	/*
 	 * If a truncate happened while we were allocating blocks, we may
 	 * leave blocks allocated to the file that are beyond EOF.  We can't
@@ -615,6 +583,13 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) >= size)
 		goto fallback;
 
+	/*
+	 * If we allocated new storage, make sure no process has any
+	 * zero pages covering this hole
+	 */
+	if (buffer_new(&bh))
+		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+
 	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
@@ -635,6 +610,7 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
+		sector = bh.b_blocknr << (blkbits - 9);
 		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
 						bh.b_size);
 		if (length < 0) {
@@ -644,6 +620,16 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
 			goto fallback;
 
+		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+			int i;
+			for (i = 0; i < PTRS_PER_PMD; i++)
+				clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+			wmb_pmem();
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result |= VM_FAULT_MAJOR;
+		}
+
 		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
 	}
 
diff --git a/mm/memory.c b/mm/memory.c
index 9cb2747..5ec066f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,10 +2426,17 @@  void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
-	i_mmap_lock_write(mapping);
+
+	/*
+	 * DAX already holds i_mmap_lock to serialise file truncate vs
+	 * page fault and page fault vs page fault.
+	 */
+	if (!IS_DAX(mapping->host))
+		i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-	i_mmap_unlock_write(mapping);
+	if (!IS_DAX(mapping->host))
+		i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);