@@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
}
/*
+ * If mmap_sem temporarily dropped, revalidate vma
+ * before taking mmap_sem.
+ * Return 0 if succeeds, otherwise return none-zero
+ * value (scan code).
+ */
+
+static int hugepage_vma_revalidate(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ unsigned long hstart, hend;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ return SCAN_ANY_PROCESS;
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ return SCAN_VMA_NULL;
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ return SCAN_ADDRESS_RANGE;
+ if (!hugepage_vma_check(vma))
+ return SCAN_VMA_CHECK;
+ return 0;
+}
+
+/*
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
@@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
* but with mmap_sem held to protect against vma changes.
*/
-static void __collapse_huge_page_swapin(struct mm_struct *mm,
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd)
{
@@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
continue;
swapped_in++;
ret = do_swap_page(mm, vma, _address, pte, pmd,
- FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ FAULT_FLAG_ALLOW_RETRY,
pteval);
+ /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
+ if (ret & VM_FAULT_RETRY) {
+ down_read(&mm->mmap_sem);
+ /* vma is no longer available, don't continue to swapin */
+ if (hugepage_vma_revalidate(mm, vma, address))
+ return false;
+ }
if (ret & VM_FAULT_ERROR) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
- return;
+ return false;
}
/* pte is unmapped now, we need to map it */
pte = pte_offset_map(pmd, _address);
@@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
pte--;
pte_unmap(pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+ return true;
}
static void collapse_huge_page(struct mm_struct *mm,
@@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
- unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- /*
- * Prevent all access to pagetables with the exception of
- * gup_fast later hanlded by the ptep_clear_flush and the VM
- * handled by the anon_vma lock + PG_lock.
- */
- down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm))) {
- result = SCAN_ANY_PROCESS;
+ down_read(&mm->mmap_sem);
+ result = hugepage_vma_revalidate(mm, vma, address);
+ if (result)
goto out;
- }
- vma = find_vma(mm, address);
- if (!vma) {
- result = SCAN_VMA_NULL;
- goto out;
- }
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
- result = SCAN_ADDRESS_RANGE;
- goto out;
- }
- if (!hugepage_vma_check(vma)) {
- result = SCAN_VMA_CHECK;
- goto out;
- }
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
goto out;
}
- __collapse_huge_page_swapin(mm, vma, address, pmd);
+ /*
+ * __collapse_huge_page_swapin always returns with mmap_sem
+ * locked. If it fails, release mmap_sem and jump directly
+ * label out. Continuing to collapse causes inconsistency.
+ */
+ if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
+ up_read(&mm->mmap_sem);
+ goto out;
+ }
+
+ up_read(&mm->mmap_sem);
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ result = hugepage_vma_revalidate(mm, vma, address);
+ if (result)
+ goto out;
anon_vma_lock_write(vma->anon_vma);