Message ID | 20220624173656.2033256-21-jthoughton@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | hugetlb: Introduce HugeTLB high-granularity mapping | expand |
On Fri, Jun 24, 2022 at 05:36:50PM +0000, James Houghton wrote: > The changes here are very similar to the changes made to > hugetlb_no_page, where we do a high-granularity page table walk and > do accounting slightly differently because we are mapping only a piece > of a page. > > Signed-off-by: James Houghton <jthoughton@google.com> > --- > fs/userfaultfd.c | 3 +++ > include/linux/hugetlb.h | 6 +++-- > mm/hugetlb.c | 54 +++++++++++++++++++++----------------- > mm/userfaultfd.c | 57 +++++++++++++++++++++++++++++++---------- > 4 files changed, 82 insertions(+), 38 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index e943370107d0..77c1b8a7d0b9 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -245,6 +245,9 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, > if (!ptep) > goto out; > > + if (hugetlb_hgm_enabled(vma)) > + goto out; > + This is weird. It means we'll never wait for sub-page mapping enabled vmas. Why? Not to mention hugetlb_hgm_enabled() currently is simply VM_SHARED, so it means we'll stop waiting for all shared hugetlbfs uffd page faults.. I'd expect in the in-house postcopy tests you should see vcpu threads spinning on the page faults until it's serviced. IMO we still need to properly wait when the pgtable doesn't have the faulted address covered. For sub-page mapping it'll probably need to walk into sub-page levels. > ret = false; > pte = huge_ptep_get(ptep); > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index ac4ac8fbd901..c207b1ac6195 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -221,13 +221,15 @@ unsigned long hugetlb_total_pages(void); > vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > unsigned long address, unsigned int flags); > #ifdef CONFIG_USERFAULTFD > -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, > +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > + struct hugetlb_pte *dst_hpte, > struct vm_area_struct *dst_vma, > unsigned long dst_addr, > unsigned long src_addr, > enum mcopy_atomic_mode mode, > struct page **pagep, > - bool wp_copy); > + bool wp_copy, > + bool new_mapping); > #endif /* CONFIG_USERFAULTFD */ > bool hugetlb_reserve_pages(struct inode *inode, long from, long to, > struct vm_area_struct *vma, > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 0ec2f231524e..09fa57599233 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5808,6 +5808,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > vma_end_reservation(h, vma, haddr); > } > > + /* This lock will get pretty expensive at 4K. */ > ptl = hugetlb_pte_lock(mm, hpte); > ret = 0; > /* If pte changed from under us, retry */ > @@ -6098,24 +6099,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > * modifications for huge pages. > */ > int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > - pte_t *dst_pte, > + struct hugetlb_pte *dst_hpte, > struct vm_area_struct *dst_vma, > unsigned long dst_addr, > unsigned long src_addr, > enum mcopy_atomic_mode mode, > struct page **pagep, > - bool wp_copy) > + bool wp_copy, > + bool new_mapping) > { > bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); > struct hstate *h = hstate_vma(dst_vma); > struct address_space *mapping = dst_vma->vm_file->f_mapping; > + unsigned long haddr = dst_addr & huge_page_mask(h); > pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); > unsigned long size; > int vm_shared = dst_vma->vm_flags & VM_SHARED; > pte_t _dst_pte; > spinlock_t *ptl; > int ret = -ENOMEM; > - struct page *page; > + struct page *page, *subpage; > int writable; > bool page_in_pagecache = false; > > @@ -6130,12 +6133,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > * a non-missing case. Return -EEXIST. > */ > if (vm_shared && > - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { > + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { > ret = -EEXIST; > goto out; > } > > - page = alloc_huge_page(dst_vma, dst_addr, 0); > + page = alloc_huge_page(dst_vma, haddr, 0); > if (IS_ERR(page)) { > ret = -ENOMEM; > goto out; > @@ -6151,13 +6154,13 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > /* Free the allocated page which may have > * consumed a reservation. > */ > - restore_reserve_on_error(h, dst_vma, dst_addr, page); > + restore_reserve_on_error(h, dst_vma, haddr, page); > put_page(page); > > /* Allocate a temporary page to hold the copied > * contents. > */ > - page = alloc_huge_page_vma(h, dst_vma, dst_addr); > + page = alloc_huge_page_vma(h, dst_vma, haddr); > if (!page) { > ret = -ENOMEM; > goto out; > @@ -6171,14 +6174,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > } > } else { > if (vm_shared && > - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { > + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { > put_page(*pagep); > ret = -EEXIST; > *pagep = NULL; > goto out; > } > > - page = alloc_huge_page(dst_vma, dst_addr, 0); > + page = alloc_huge_page(dst_vma, haddr, 0); > if (IS_ERR(page)) { > ret = -ENOMEM; > *pagep = NULL; > @@ -6216,8 +6219,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > page_in_pagecache = true; > } > > - ptl = huge_pte_lockptr(huge_page_shift(h), dst_mm, dst_pte); > - spin_lock(ptl); > + ptl = hugetlb_pte_lock(dst_mm, dst_hpte); > > /* > * Recheck the i_size after holding PT lock to make sure not > @@ -6239,14 +6241,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > * registered, we firstly wr-protect a none pte which has no page cache > * page backing it, then access the page. > */ > - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) > + if (!hugetlb_pte_none_mostly(dst_hpte)) > goto out_release_unlock; > > - if (vm_shared) { > - page_dup_file_rmap(page, true); > - } else { > - ClearHPageRestoreReserve(page); > - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); > + if (new_mapping) { IIUC you wanted to avoid the mapcount accountings when it's the sub-page that was going to be mapped. Is it a must we get this only from the caller? Can we know we're doing sub-page mapping already here and make a decision with e.g. dst_hpte? It looks weird to me to pass this explicitly from the caller, especially that's when we don't really have the pgtable lock so I'm wondering about possible race conditions too on having stale new_mapping values. > + if (vm_shared) { > + page_dup_file_rmap(page, true); > + } else { > + ClearHPageRestoreReserve(page); > + hugepage_add_new_anon_rmap(page, dst_vma, haddr); > + } > } > > /* > @@ -6258,7 +6262,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > else > writable = dst_vma->vm_flags & VM_WRITE; > > - _dst_pte = make_huge_pte(dst_vma, page, writable); > + subpage = hugetlb_find_subpage(h, page, dst_addr); > + if (subpage != page) > + BUG_ON(!hugetlb_hgm_enabled(dst_vma)); > + > + _dst_pte = make_huge_pte(dst_vma, subpage, writable); > /* > * Always mark UFFDIO_COPY page dirty; note that this may not be > * extremely important for hugetlbfs for now since swapping is not > @@ -6271,14 +6279,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > if (wp_copy) > _dst_pte = huge_pte_mkuffd_wp(_dst_pte); > > - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); > + set_huge_pte_at(dst_mm, dst_addr, dst_hpte->ptep, _dst_pte); > > - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, > - dst_vma->vm_flags & VM_WRITE); > - hugetlb_count_add(pages_per_huge_page(h), dst_mm); > + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_hpte->ptep, > + _dst_pte, dst_vma->vm_flags & VM_WRITE); > + hugetlb_count_add(hugetlb_pte_size(dst_hpte) / PAGE_SIZE, dst_mm); > > /* No need to invalidate - it was non-present before */ > - update_mmu_cache(dst_vma, dst_addr, dst_pte); > + update_mmu_cache(dst_vma, dst_addr, dst_hpte->ptep); > > spin_unlock(ptl); > if (!is_continue) > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 4f4892a5f767..ee40d98068bf 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -310,14 +310,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > { > int vm_shared = dst_vma->vm_flags & VM_SHARED; > ssize_t err; > - pte_t *dst_pte; > unsigned long src_addr, dst_addr; > long copied; > struct page *page; > - unsigned long vma_hpagesize; > + unsigned long vma_hpagesize, vma_altpagesize; > pgoff_t idx; > u32 hash; > struct address_space *mapping; > + bool use_hgm = hugetlb_hgm_enabled(dst_vma) && > + mode == MCOPY_ATOMIC_CONTINUE; > + struct hstate *h = hstate_vma(dst_vma); > > /* > * There is no default zero huge page for all huge page sizes as > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > copied = 0; > page = NULL; > vma_hpagesize = vma_kernel_pagesize(dst_vma); > + if (use_hgm) > + vma_altpagesize = PAGE_SIZE; Do we need to check the "len" to know whether we should use sub-page mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will still want the old behavior I think. > + else > + vma_altpagesize = vma_hpagesize; > > /* > * Validate alignment based on huge page size > */ > err = -EINVAL; > - if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) > + if (dst_start & (vma_altpagesize - 1) || len & (vma_altpagesize - 1)) > goto out_unlock; > > retry: > @@ -361,6 +367,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > vm_shared = dst_vma->vm_flags & VM_SHARED; > } > > + BUG_ON(!vm_shared && use_hgm); > + > /* > * If not shared, ensure the dst_vma has a anon_vma. > */ > @@ -371,11 +379,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > } > > while (src_addr < src_start + len) { > + struct hugetlb_pte hpte; > + bool new_mapping; > BUG_ON(dst_addr >= dst_start + len); > > /* > * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. > - * i_mmap_rwsem ensures the dst_pte remains valid even > + * i_mmap_rwsem ensures the hpte.ptep remains valid even > * in the case of shared pmds. fault mutex prevents > * races with other faulting threads. > */ > @@ -383,27 +393,47 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > i_mmap_lock_read(mapping); > idx = linear_page_index(dst_vma, dst_addr); > hash = hugetlb_fault_mutex_hash(mapping, idx); > + /* This lock will get expensive at 4K. */ > mutex_lock(&hugetlb_fault_mutex_table[hash]); > > - err = -ENOMEM; > - dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); > - if (!dst_pte) { > + err = 0; > + > + pte_t *ptep = huge_pte_alloc(dst_mm, dst_vma, dst_addr, > + vma_hpagesize); > + if (!ptep) > + err = -ENOMEM; > + else { > + hugetlb_pte_populate(&hpte, ptep, > + huge_page_shift(h)); > + /* > + * If the hstate-level PTE is not none, then a mapping > + * was previously established. > + * The per-hpage mutex prevents double-counting. > + */ > + new_mapping = hugetlb_pte_none(&hpte); > + if (use_hgm) > + err = hugetlb_alloc_largest_pte(&hpte, dst_mm, dst_vma, > + dst_addr, > + dst_start + len); > + } > + > + if (err) { > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > i_mmap_unlock_read(mapping); > goto out_unlock; > } > > if (mode != MCOPY_ATOMIC_CONTINUE && > - !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { > + !hugetlb_pte_none_mostly(&hpte)) { > err = -EEXIST; > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > i_mmap_unlock_read(mapping); > goto out_unlock; > } > > - err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, > + err = hugetlb_mcopy_atomic_pte(dst_mm, &hpte, dst_vma, > dst_addr, src_addr, mode, &page, > - wp_copy); > + wp_copy, new_mapping); > > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > i_mmap_unlock_read(mapping); > @@ -413,6 +443,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > if (unlikely(err == -ENOENT)) { > mmap_read_unlock(dst_mm); > BUG_ON(!page); > + BUG_ON(hpte.shift != huge_page_shift(h)); > > err = copy_huge_page_from_user(page, > (const void __user *)src_addr, > @@ -430,9 +461,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > BUG_ON(page); > > if (!err) { > - dst_addr += vma_hpagesize; > - src_addr += vma_hpagesize; > - copied += vma_hpagesize; > + dst_addr += hugetlb_pte_size(&hpte); > + src_addr += hugetlb_pte_size(&hpte); > + copied += hugetlb_pte_size(&hpte); > > if (fatal_signal_pending(current)) > err = -EINTR; > -- > 2.37.0.rc0.161.g10f37bed90-goog >
On Fri, Jul 15, 2022 at 9:21 AM Peter Xu <peterx@redhat.com> wrote: > > On Fri, Jun 24, 2022 at 05:36:50PM +0000, James Houghton wrote: > > The changes here are very similar to the changes made to > > hugetlb_no_page, where we do a high-granularity page table walk and > > do accounting slightly differently because we are mapping only a piece > > of a page. > > > > Signed-off-by: James Houghton <jthoughton@google.com> > > --- > > fs/userfaultfd.c | 3 +++ > > include/linux/hugetlb.h | 6 +++-- > > mm/hugetlb.c | 54 +++++++++++++++++++++----------------- > > mm/userfaultfd.c | 57 +++++++++++++++++++++++++++++++---------- > > 4 files changed, 82 insertions(+), 38 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index e943370107d0..77c1b8a7d0b9 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -245,6 +245,9 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, > > if (!ptep) > > goto out; > > > > + if (hugetlb_hgm_enabled(vma)) > > + goto out; > > + > > This is weird. It means we'll never wait for sub-page mapping enabled > vmas. Why? > `ret` is true in this case, so we're actually *always* waiting. > Not to mention hugetlb_hgm_enabled() currently is simply VM_SHARED, so it > means we'll stop waiting for all shared hugetlbfs uffd page faults.. > > I'd expect in the in-house postcopy tests you should see vcpu threads > spinning on the page faults until it's serviced. > > IMO we still need to properly wait when the pgtable doesn't have the > faulted address covered. For sub-page mapping it'll probably need to walk > into sub-page levels. Ok, SGTM. I'll do that for the next version. I'm not sure of the consequences of returning `true` here when we should be returning `false`. > > > ret = false; > > pte = huge_ptep_get(ptep); > > > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > > index ac4ac8fbd901..c207b1ac6195 100644 > > --- a/include/linux/hugetlb.h > > +++ b/include/linux/hugetlb.h > > @@ -221,13 +221,15 @@ unsigned long hugetlb_total_pages(void); > > vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > > unsigned long address, unsigned int flags); > > #ifdef CONFIG_USERFAULTFD > > -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, > > +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > + struct hugetlb_pte *dst_hpte, > > struct vm_area_struct *dst_vma, > > unsigned long dst_addr, > > unsigned long src_addr, > > enum mcopy_atomic_mode mode, > > struct page **pagep, > > - bool wp_copy); > > + bool wp_copy, > > + bool new_mapping); > > #endif /* CONFIG_USERFAULTFD */ > > bool hugetlb_reserve_pages(struct inode *inode, long from, long to, > > struct vm_area_struct *vma, > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > > index 0ec2f231524e..09fa57599233 100644 > > --- a/mm/hugetlb.c > > +++ b/mm/hugetlb.c > > @@ -5808,6 +5808,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > > vma_end_reservation(h, vma, haddr); > > } > > > > + /* This lock will get pretty expensive at 4K. */ > > ptl = hugetlb_pte_lock(mm, hpte); > > ret = 0; > > /* If pte changed from under us, retry */ > > @@ -6098,24 +6099,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > > * modifications for huge pages. > > */ > > int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > - pte_t *dst_pte, > > + struct hugetlb_pte *dst_hpte, > > struct vm_area_struct *dst_vma, > > unsigned long dst_addr, > > unsigned long src_addr, > > enum mcopy_atomic_mode mode, > > struct page **pagep, > > - bool wp_copy) > > + bool wp_copy, > > + bool new_mapping) > > { > > bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); > > struct hstate *h = hstate_vma(dst_vma); > > struct address_space *mapping = dst_vma->vm_file->f_mapping; > > + unsigned long haddr = dst_addr & huge_page_mask(h); > > pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); > > unsigned long size; > > int vm_shared = dst_vma->vm_flags & VM_SHARED; > > pte_t _dst_pte; > > spinlock_t *ptl; > > int ret = -ENOMEM; > > - struct page *page; > > + struct page *page, *subpage; > > int writable; > > bool page_in_pagecache = false; > > > > @@ -6130,12 +6133,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > * a non-missing case. Return -EEXIST. > > */ > > if (vm_shared && > > - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { > > + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { > > ret = -EEXIST; > > goto out; > > } > > > > - page = alloc_huge_page(dst_vma, dst_addr, 0); > > + page = alloc_huge_page(dst_vma, haddr, 0); > > if (IS_ERR(page)) { > > ret = -ENOMEM; > > goto out; > > @@ -6151,13 +6154,13 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > /* Free the allocated page which may have > > * consumed a reservation. > > */ > > - restore_reserve_on_error(h, dst_vma, dst_addr, page); > > + restore_reserve_on_error(h, dst_vma, haddr, page); > > put_page(page); > > > > /* Allocate a temporary page to hold the copied > > * contents. > > */ > > - page = alloc_huge_page_vma(h, dst_vma, dst_addr); > > + page = alloc_huge_page_vma(h, dst_vma, haddr); > > if (!page) { > > ret = -ENOMEM; > > goto out; > > @@ -6171,14 +6174,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > } > > } else { > > if (vm_shared && > > - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { > > + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { > > put_page(*pagep); > > ret = -EEXIST; > > *pagep = NULL; > > goto out; > > } > > > > - page = alloc_huge_page(dst_vma, dst_addr, 0); > > + page = alloc_huge_page(dst_vma, haddr, 0); > > if (IS_ERR(page)) { > > ret = -ENOMEM; > > *pagep = NULL; > > @@ -6216,8 +6219,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > page_in_pagecache = true; > > } > > > > - ptl = huge_pte_lockptr(huge_page_shift(h), dst_mm, dst_pte); > > - spin_lock(ptl); > > + ptl = hugetlb_pte_lock(dst_mm, dst_hpte); > > > > /* > > * Recheck the i_size after holding PT lock to make sure not > > @@ -6239,14 +6241,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > * registered, we firstly wr-protect a none pte which has no page cache > > * page backing it, then access the page. > > */ > > - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) > > + if (!hugetlb_pte_none_mostly(dst_hpte)) > > goto out_release_unlock; > > > > - if (vm_shared) { > > - page_dup_file_rmap(page, true); > > - } else { > > - ClearHPageRestoreReserve(page); > > - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); > > + if (new_mapping) { > > IIUC you wanted to avoid the mapcount accountings when it's the sub-page > that was going to be mapped. > > Is it a must we get this only from the caller? Can we know we're doing > sub-page mapping already here and make a decision with e.g. dst_hpte? > > It looks weird to me to pass this explicitly from the caller, especially > that's when we don't really have the pgtable lock so I'm wondering about > possible race conditions too on having stale new_mapping values. The only way to know what the correct value for `new_mapping` should be is to know if we had to change the hstate-level P*D to non-none to service this UFFDIO_CONTINUE request. I'll see if there is a nice way to do that check in `hugetlb_mcopy_atomic_pte`. Right now there is no race, because we synchronize on the per-hpage mutex. > > > + if (vm_shared) { > > + page_dup_file_rmap(page, true); > > + } else { > > + ClearHPageRestoreReserve(page); > > + hugepage_add_new_anon_rmap(page, dst_vma, haddr); > > + } > > } > > > > /* > > @@ -6258,7 +6262,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > else > > writable = dst_vma->vm_flags & VM_WRITE; > > > > - _dst_pte = make_huge_pte(dst_vma, page, writable); > > + subpage = hugetlb_find_subpage(h, page, dst_addr); > > + if (subpage != page) > > + BUG_ON(!hugetlb_hgm_enabled(dst_vma)); > > + > > + _dst_pte = make_huge_pte(dst_vma, subpage, writable); > > /* > > * Always mark UFFDIO_COPY page dirty; note that this may not be > > * extremely important for hugetlbfs for now since swapping is not > > @@ -6271,14 +6279,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > if (wp_copy) > > _dst_pte = huge_pte_mkuffd_wp(_dst_pte); > > > > - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); > > + set_huge_pte_at(dst_mm, dst_addr, dst_hpte->ptep, _dst_pte); > > > > - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, > > - dst_vma->vm_flags & VM_WRITE); > > - hugetlb_count_add(pages_per_huge_page(h), dst_mm); > > + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_hpte->ptep, > > + _dst_pte, dst_vma->vm_flags & VM_WRITE); > > + hugetlb_count_add(hugetlb_pte_size(dst_hpte) / PAGE_SIZE, dst_mm); > > > > /* No need to invalidate - it was non-present before */ > > - update_mmu_cache(dst_vma, dst_addr, dst_pte); > > + update_mmu_cache(dst_vma, dst_addr, dst_hpte->ptep); > > > > spin_unlock(ptl); > > if (!is_continue) > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index 4f4892a5f767..ee40d98068bf 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -310,14 +310,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > { > > int vm_shared = dst_vma->vm_flags & VM_SHARED; > > ssize_t err; > > - pte_t *dst_pte; > > unsigned long src_addr, dst_addr; > > long copied; > > struct page *page; > > - unsigned long vma_hpagesize; > > + unsigned long vma_hpagesize, vma_altpagesize; > > pgoff_t idx; > > u32 hash; > > struct address_space *mapping; > > + bool use_hgm = hugetlb_hgm_enabled(dst_vma) && > > + mode == MCOPY_ATOMIC_CONTINUE; > > + struct hstate *h = hstate_vma(dst_vma); > > > > /* > > * There is no default zero huge page for all huge page sizes as > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > copied = 0; > > page = NULL; > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > + if (use_hgm) > > + vma_altpagesize = PAGE_SIZE; > > Do we need to check the "len" to know whether we should use sub-page > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > still want the old behavior I think. I think that's a fair point; however, if we enable HGM and the address and len happen to be hstate-aligned, we basically do the same thing as if HGM wasn't enabled. It could be a minor performance optimization to do `vma_altpagesize=vma_hpagesize` in that case, but in terms of how the page tables are set up, the end result would be the same. > > > + else > > + vma_altpagesize = vma_hpagesize; > > > > /* > > * Validate alignment based on huge page size > > */ > > err = -EINVAL; > > - if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) > > + if (dst_start & (vma_altpagesize - 1) || len & (vma_altpagesize - 1)) > > goto out_unlock; > > > > retry: > > @@ -361,6 +367,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > vm_shared = dst_vma->vm_flags & VM_SHARED; > > } > > > > + BUG_ON(!vm_shared && use_hgm); > > + > > /* > > * If not shared, ensure the dst_vma has a anon_vma. > > */ > > @@ -371,11 +379,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > } > > > > while (src_addr < src_start + len) { > > + struct hugetlb_pte hpte; > > + bool new_mapping; > > BUG_ON(dst_addr >= dst_start + len); > > > > /* > > * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. > > - * i_mmap_rwsem ensures the dst_pte remains valid even > > + * i_mmap_rwsem ensures the hpte.ptep remains valid even > > * in the case of shared pmds. fault mutex prevents > > * races with other faulting threads. > > */ > > @@ -383,27 +393,47 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > i_mmap_lock_read(mapping); > > idx = linear_page_index(dst_vma, dst_addr); > > hash = hugetlb_fault_mutex_hash(mapping, idx); > > + /* This lock will get expensive at 4K. */ > > mutex_lock(&hugetlb_fault_mutex_table[hash]); > > > > - err = -ENOMEM; > > - dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); > > - if (!dst_pte) { > > + err = 0; > > + > > + pte_t *ptep = huge_pte_alloc(dst_mm, dst_vma, dst_addr, > > + vma_hpagesize); > > + if (!ptep) > > + err = -ENOMEM; > > + else { > > + hugetlb_pte_populate(&hpte, ptep, > > + huge_page_shift(h)); > > + /* > > + * If the hstate-level PTE is not none, then a mapping > > + * was previously established. > > + * The per-hpage mutex prevents double-counting. > > + */ > > + new_mapping = hugetlb_pte_none(&hpte); > > + if (use_hgm) > > + err = hugetlb_alloc_largest_pte(&hpte, dst_mm, dst_vma, > > + dst_addr, > > + dst_start + len); > > + } > > + > > + if (err) { > > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > > i_mmap_unlock_read(mapping); > > goto out_unlock; > > } > > > > if (mode != MCOPY_ATOMIC_CONTINUE && > > - !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { > > + !hugetlb_pte_none_mostly(&hpte)) { > > err = -EEXIST; > > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > > i_mmap_unlock_read(mapping); > > goto out_unlock; > > } > > > > - err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, > > + err = hugetlb_mcopy_atomic_pte(dst_mm, &hpte, dst_vma, > > dst_addr, src_addr, mode, &page, > > - wp_copy); > > + wp_copy, new_mapping); > > > > mutex_unlock(&hugetlb_fault_mutex_table[hash]); > > i_mmap_unlock_read(mapping); > > @@ -413,6 +443,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > if (unlikely(err == -ENOENT)) { > > mmap_read_unlock(dst_mm); > > BUG_ON(!page); > > + BUG_ON(hpte.shift != huge_page_shift(h)); > > > > err = copy_huge_page_from_user(page, > > (const void __user *)src_addr, > > @@ -430,9 +461,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > BUG_ON(page); > > > > if (!err) { > > - dst_addr += vma_hpagesize; > > - src_addr += vma_hpagesize; > > - copied += vma_hpagesize; > > + dst_addr += hugetlb_pte_size(&hpte); > > + src_addr += hugetlb_pte_size(&hpte); > > + copied += hugetlb_pte_size(&hpte); > > > > if (fatal_signal_pending(current)) > > err = -EINTR; > > -- > > 2.37.0.rc0.161.g10f37bed90-goog > > > > -- > Peter Xu > Thanks, Peter! :) - James
On Fri, Jul 15, 2022 at 09:58:10AM -0700, James Houghton wrote: > On Fri, Jul 15, 2022 at 9:21 AM Peter Xu <peterx@redhat.com> wrote: > > > > On Fri, Jun 24, 2022 at 05:36:50PM +0000, James Houghton wrote: > > > The changes here are very similar to the changes made to > > > hugetlb_no_page, where we do a high-granularity page table walk and > > > do accounting slightly differently because we are mapping only a piece > > > of a page. > > > > > > Signed-off-by: James Houghton <jthoughton@google.com> > > > --- > > > fs/userfaultfd.c | 3 +++ > > > include/linux/hugetlb.h | 6 +++-- > > > mm/hugetlb.c | 54 +++++++++++++++++++++----------------- > > > mm/userfaultfd.c | 57 +++++++++++++++++++++++++++++++---------- > > > 4 files changed, 82 insertions(+), 38 deletions(-) > > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > > index e943370107d0..77c1b8a7d0b9 100644 > > > --- a/fs/userfaultfd.c > > > +++ b/fs/userfaultfd.c > > > @@ -245,6 +245,9 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, > > > if (!ptep) > > > goto out; > > > > > > + if (hugetlb_hgm_enabled(vma)) > > > + goto out; > > > + > > > > This is weird. It means we'll never wait for sub-page mapping enabled > > vmas. Why? > > > > `ret` is true in this case, so we're actually *always* waiting. Aha! Then I think that's another problem, sorry. :) See Below. > > > Not to mention hugetlb_hgm_enabled() currently is simply VM_SHARED, so it > > means we'll stop waiting for all shared hugetlbfs uffd page faults.. > > > > I'd expect in the in-house postcopy tests you should see vcpu threads > > spinning on the page faults until it's serviced. > > > > IMO we still need to properly wait when the pgtable doesn't have the > > faulted address covered. For sub-page mapping it'll probably need to walk > > into sub-page levels. > > Ok, SGTM. I'll do that for the next version. I'm not sure of the > consequences of returning `true` here when we should be returning > `false`. We've put ourselves onto the wait queue, if another concurrent UFFDIO_CONTINUE happened and pte is already installed, I think this thread could be waiting forever on the next schedule(). The solution should be the same - walking the sub-page pgtable would work, afaict. [...] > > > @@ -6239,14 +6241,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > > * registered, we firstly wr-protect a none pte which has no page cache > > > * page backing it, then access the page. > > > */ > > > - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) > > > + if (!hugetlb_pte_none_mostly(dst_hpte)) > > > goto out_release_unlock; > > > > > > - if (vm_shared) { > > > - page_dup_file_rmap(page, true); > > > - } else { > > > - ClearHPageRestoreReserve(page); > > > - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); > > > + if (new_mapping) { > > > > IIUC you wanted to avoid the mapcount accountings when it's the sub-page > > that was going to be mapped. > > > > Is it a must we get this only from the caller? Can we know we're doing > > sub-page mapping already here and make a decision with e.g. dst_hpte? > > > > It looks weird to me to pass this explicitly from the caller, especially > > that's when we don't really have the pgtable lock so I'm wondering about > > possible race conditions too on having stale new_mapping values. > > The only way to know what the correct value for `new_mapping` should > be is to know if we had to change the hstate-level P*D to non-none to > service this UFFDIO_CONTINUE request. I'll see if there is a nice way > to do that check in `hugetlb_mcopy_atomic_pte`. > Right now there is no Would "new_mapping = dest_hpte->shift != huge_page_shift(hstate)" work (or something alike)? > race, because we synchronize on the per-hpage mutex. Yeah not familiar with that mutex enough to tell, as long as that mutex guarantees no pgtable update (hmm, then why we need the pgtable lock here???) then it looks fine. [...] > > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > > copied = 0; > > > page = NULL; > > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > > + if (use_hgm) > > > + vma_altpagesize = PAGE_SIZE; > > > > Do we need to check the "len" to know whether we should use sub-page > > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > > still want the old behavior I think. > > I think that's a fair point; however, if we enable HGM and the address > and len happen to be hstate-aligned The address can, but len (note! not "end" here) cannot? > , we basically do the same thing as > if HGM wasn't enabled. It could be a minor performance optimization to > do `vma_altpagesize=vma_hpagesize` in that case, but in terms of how > the page tables are set up, the end result would be the same. Thanks,
On Fri, Jul 15, 2022 at 10:20 AM Peter Xu <peterx@redhat.com> wrote: > > On Fri, Jul 15, 2022 at 09:58:10AM -0700, James Houghton wrote: > > On Fri, Jul 15, 2022 at 9:21 AM Peter Xu <peterx@redhat.com> wrote: > > > > > > On Fri, Jun 24, 2022 at 05:36:50PM +0000, James Houghton wrote: > > > > The changes here are very similar to the changes made to > > > > hugetlb_no_page, where we do a high-granularity page table walk and > > > > do accounting slightly differently because we are mapping only a piece > > > > of a page. > > > > > > > > Signed-off-by: James Houghton <jthoughton@google.com> > > > > --- > > > > fs/userfaultfd.c | 3 +++ > > > > include/linux/hugetlb.h | 6 +++-- > > > > mm/hugetlb.c | 54 +++++++++++++++++++++----------------- > > > > mm/userfaultfd.c | 57 +++++++++++++++++++++++++++++++---------- > > > > 4 files changed, 82 insertions(+), 38 deletions(-) > > > > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > > > index e943370107d0..77c1b8a7d0b9 100644 > > > > --- a/fs/userfaultfd.c > > > > +++ b/fs/userfaultfd.c > > > > @@ -245,6 +245,9 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, > > > > if (!ptep) > > > > goto out; > > > > > > > > + if (hugetlb_hgm_enabled(vma)) > > > > + goto out; > > > > + > > > > > > This is weird. It means we'll never wait for sub-page mapping enabled > > > vmas. Why? > > > > > > > `ret` is true in this case, so we're actually *always* waiting. > > Aha! Then I think that's another problem, sorry. :) See Below. > > > > > > Not to mention hugetlb_hgm_enabled() currently is simply VM_SHARED, so it > > > means we'll stop waiting for all shared hugetlbfs uffd page faults.. > > > > > > I'd expect in the in-house postcopy tests you should see vcpu threads > > > spinning on the page faults until it's serviced. > > > > > > IMO we still need to properly wait when the pgtable doesn't have the > > > faulted address covered. For sub-page mapping it'll probably need to walk > > > into sub-page levels. > > > > Ok, SGTM. I'll do that for the next version. I'm not sure of the > > consequences of returning `true` here when we should be returning > > `false`. > > We've put ourselves onto the wait queue, if another concurrent > UFFDIO_CONTINUE happened and pte is already installed, I think this thread > could be waiting forever on the next schedule(). > > The solution should be the same - walking the sub-page pgtable would work, > afaict. > > [...] > > > > > @@ -6239,14 +6241,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > > > * registered, we firstly wr-protect a none pte which has no page cache > > > > * page backing it, then access the page. > > > > */ > > > > - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) > > > > + if (!hugetlb_pte_none_mostly(dst_hpte)) > > > > goto out_release_unlock; > > > > > > > > - if (vm_shared) { > > > > - page_dup_file_rmap(page, true); > > > > - } else { > > > > - ClearHPageRestoreReserve(page); > > > > - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); > > > > + if (new_mapping) { > > > > > > IIUC you wanted to avoid the mapcount accountings when it's the sub-page > > > that was going to be mapped. > > > > > > Is it a must we get this only from the caller? Can we know we're doing > > > sub-page mapping already here and make a decision with e.g. dst_hpte? > > > > > > It looks weird to me to pass this explicitly from the caller, especially > > > that's when we don't really have the pgtable lock so I'm wondering about > > > possible race conditions too on having stale new_mapping values. > > > > The only way to know what the correct value for `new_mapping` should > > be is to know if we had to change the hstate-level P*D to non-none to > > service this UFFDIO_CONTINUE request. I'll see if there is a nice way > > to do that check in `hugetlb_mcopy_atomic_pte`. > > Right now there is no > > Would "new_mapping = dest_hpte->shift != huge_page_shift(hstate)" work (or > something alike)? This works in the hugetlb_fault case, because in the hugetlb_fault case, we install the largest PTE possible. If we are mapping a page for the first time, we will use an hstate-sized PTE. But for UFFDIO_CONTINUE, we may be installing a 4K PTE as the first PTE for the whole hpage. > > > race, because we synchronize on the per-hpage mutex. > > Yeah not familiar with that mutex enough to tell, as long as that mutex > guarantees no pgtable update (hmm, then why we need the pgtable lock > here???) then it looks fine. Let me take a closer look at this. I'll have a more detailed explanation for the next version of the RFC. > > [...] > > > > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > > > copied = 0; > > > > page = NULL; > > > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > > > + if (use_hgm) > > > > + vma_altpagesize = PAGE_SIZE; > > > > > > Do we need to check the "len" to know whether we should use sub-page > > > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > > > still want the old behavior I think. > > > > I think that's a fair point; however, if we enable HGM and the address > > and len happen to be hstate-aligned > > The address can, but len (note! not "end" here) cannot? They both (dst_start and len) need to be hpage-aligned, otherwise we won't be able to install hstate-sized PTEs. Like if we're installing 4K at the beginning of a 1G hpage, we can't install a PUD, because we only want to install that 4K. > > > , we basically do the same thing as > > if HGM wasn't enabled. It could be a minor performance optimization to > > do `vma_altpagesize=vma_hpagesize` in that case, but in terms of how > > the page tables are set up, the end result would be the same. > > Thanks, Thanks! > > -- > Peter Xu >
On Wed, Jul 20, 2022 at 01:58:06PM -0700, James Houghton wrote: > > > > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > > > > copied = 0; > > > > > page = NULL; > > > > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > > > > + if (use_hgm) > > > > > + vma_altpagesize = PAGE_SIZE; > > > > > > > > Do we need to check the "len" to know whether we should use sub-page > > > > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > > > > still want the old behavior I think. > > > > > > I think that's a fair point; however, if we enable HGM and the address > > > and len happen to be hstate-aligned > > > > The address can, but len (note! not "end" here) cannot? > > They both (dst_start and len) need to be hpage-aligned, otherwise we > won't be able to install hstate-sized PTEs. Like if we're installing > 4K at the beginning of a 1G hpage, we can't install a PUD, because we > only want to install that 4K. I'm still confused... Shouldn't one of the major goals of sub-page mapping is to grant user the capability to do UFFDIO_CONTINUE with len<hpagesize (so we install pages in sub-page level)? If so, why len needs to be always hpagesize aligned?
On Thu, Jul 21, 2022 at 12:09 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Jul 20, 2022 at 01:58:06PM -0700, James Houghton wrote: > > > > > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > > > > > copied = 0; > > > > > > page = NULL; > > > > > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > > > > > + if (use_hgm) > > > > > > + vma_altpagesize = PAGE_SIZE; > > > > > > > > > > Do we need to check the "len" to know whether we should use sub-page > > > > > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > > > > > still want the old behavior I think. > > > > > > > > I think that's a fair point; however, if we enable HGM and the address > > > > and len happen to be hstate-aligned > > > > > > The address can, but len (note! not "end" here) cannot? > > > > They both (dst_start and len) need to be hpage-aligned, otherwise we > > won't be able to install hstate-sized PTEs. Like if we're installing > > 4K at the beginning of a 1G hpage, we can't install a PUD, because we > > only want to install that 4K. > > I'm still confused... > > Shouldn't one of the major goals of sub-page mapping is to grant user the > capability to do UFFDIO_CONTINUE with len<hpagesize (so we install pages in > sub-page level)? If so, why len needs to be always hpagesize aligned? Sorry I misunderstood what you were asking. We allow both to be PAGE_SIZE-aligned. :) That is indeed the goal of HGM. If dst_start and len were both hpage-aligned, then we *could* set `use_hgm = false`, and everything would still work. That's what I thought you were asking about. I don't see any reason to do this though, as `use_hgm = true` will only grant additional functionality, and `use_hgm = false` would only -- at best -- be a minor performance optimization in this case. - James > > -- > Peter Xu >
On Thu, Jul 21, 2022 at 12:44:58PM -0700, James Houghton wrote: > On Thu, Jul 21, 2022 at 12:09 PM Peter Xu <peterx@redhat.com> wrote: > > > > On Wed, Jul 20, 2022 at 01:58:06PM -0700, James Houghton wrote: > > > > > > > @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, > > > > > > > copied = 0; > > > > > > > page = NULL; > > > > > > > vma_hpagesize = vma_kernel_pagesize(dst_vma); > > > > > > > + if (use_hgm) > > > > > > > + vma_altpagesize = PAGE_SIZE; > > > > > > > > > > > > Do we need to check the "len" to know whether we should use sub-page > > > > > > mapping or original hpage size? E.g. any old UFFDIO_CONTINUE code will > > > > > > still want the old behavior I think. > > > > > > > > > > I think that's a fair point; however, if we enable HGM and the address > > > > > and len happen to be hstate-aligned > > > > > > > > The address can, but len (note! not "end" here) cannot? > > > > > > They both (dst_start and len) need to be hpage-aligned, otherwise we > > > won't be able to install hstate-sized PTEs. Like if we're installing > > > 4K at the beginning of a 1G hpage, we can't install a PUD, because we > > > only want to install that 4K. > > > > I'm still confused... > > > > Shouldn't one of the major goals of sub-page mapping is to grant user the > > capability to do UFFDIO_CONTINUE with len<hpagesize (so we install pages in > > sub-page level)? If so, why len needs to be always hpagesize aligned? > > Sorry I misunderstood what you were asking. We allow both to be > PAGE_SIZE-aligned. :) That is indeed the goal of HGM. Ah OK. :) > > If dst_start and len were both hpage-aligned, then we *could* set > `use_hgm = false`, and everything would still work. That's what I > thought you were asking about. I don't see any reason to do this > though, as `use_hgm = true` will only grant additional functionality, > and `use_hgm = false` would only -- at best -- be a minor performance > optimization in this case. I just want to make sure this patch won't break existing uffd-minor users, or it'll be an kernel abi breakage. We'd still want to have e.g. existing compiled apps run like before, which iiuc means we should only use sub-page mapping when len!=hpagesize here. I'm not sure it's only about perf - the app may not even be prepared to receive yet another page faults within the same huge page range.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..77c1b8a7d0b9 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -245,6 +245,9 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, if (!ptep) goto out; + if (hugetlb_hgm_enabled(vma)) + goto out; + ret = false; pte = huge_ptep_get(ptep); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac4ac8fbd901..c207b1ac6195 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -221,13 +221,15 @@ unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + struct hugetlb_pte *dst_hpte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, struct page **pagep, - bool wp_copy); + bool wp_copy, + bool new_mapping); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ec2f231524e..09fa57599233 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5808,6 +5808,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vma_end_reservation(h, vma, haddr); } + /* This lock will get pretty expensive at 4K. */ ptl = hugetlb_pte_lock(mm, hpte); ret = 0; /* If pte changed from under us, retry */ @@ -6098,24 +6099,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * modifications for huge pages. */ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, + struct hugetlb_pte *dst_hpte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, struct page **pagep, - bool wp_copy) + bool wp_copy, + bool new_mapping) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; + unsigned long haddr = dst_addr & huge_page_mask(h); pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size; int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct page *page, *subpage; int writable; bool page_in_pagecache = false; @@ -6130,12 +6133,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * a non-missing case. Return -EEXIST. */ if (vm_shared && - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { ret = -EEXIST; goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); + page = alloc_huge_page(dst_vma, haddr, 0); if (IS_ERR(page)) { ret = -ENOMEM; goto out; @@ -6151,13 +6154,13 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* Free the allocated page which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); + restore_reserve_on_error(h, dst_vma, haddr, page); put_page(page); /* Allocate a temporary page to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); + page = alloc_huge_page_vma(h, dst_vma, haddr); if (!page) { ret = -ENOMEM; goto out; @@ -6171,14 +6174,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, } } else { if (vm_shared && - hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + hugetlbfs_pagecache_present(h, dst_vma, haddr)) { put_page(*pagep); ret = -EEXIST; *pagep = NULL; goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); + page = alloc_huge_page(dst_vma, haddr, 0); if (IS_ERR(page)) { ret = -ENOMEM; *pagep = NULL; @@ -6216,8 +6219,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, page_in_pagecache = true; } - ptl = huge_pte_lockptr(huge_page_shift(h), dst_mm, dst_pte); - spin_lock(ptl); + ptl = hugetlb_pte_lock(dst_mm, dst_hpte); /* * Recheck the i_size after holding PT lock to make sure not @@ -6239,14 +6241,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) + if (!hugetlb_pte_none_mostly(dst_hpte)) goto out_release_unlock; - if (vm_shared) { - page_dup_file_rmap(page, true); - } else { - ClearHPageRestoreReserve(page); - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + if (new_mapping) { + if (vm_shared) { + page_dup_file_rmap(page, true); + } else { + ClearHPageRestoreReserve(page); + hugepage_add_new_anon_rmap(page, dst_vma, haddr); + } } /* @@ -6258,7 +6262,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + subpage = hugetlb_find_subpage(h, page, dst_addr); + if (subpage != page) + BUG_ON(!hugetlb_hgm_enabled(dst_vma)); + + _dst_pte = make_huge_pte(dst_vma, subpage, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6271,14 +6279,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (wp_copy) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_huge_pte_at(dst_mm, dst_addr, dst_hpte->ptep, _dst_pte); - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, - dst_vma->vm_flags & VM_WRITE); - hugetlb_count_add(pages_per_huge_page(h), dst_mm); + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_hpte->ptep, + _dst_pte, dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(hugetlb_pte_size(dst_hpte) / PAGE_SIZE, dst_mm); /* No need to invalidate - it was non-present before */ - update_mmu_cache(dst_vma, dst_addr, dst_pte); + update_mmu_cache(dst_vma, dst_addr, dst_hpte->ptep); spin_unlock(ptl); if (!is_continue) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4f4892a5f767..ee40d98068bf 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -310,14 +310,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, { int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; - pte_t *dst_pte; unsigned long src_addr, dst_addr; long copied; struct page *page; - unsigned long vma_hpagesize; + unsigned long vma_hpagesize, vma_altpagesize; pgoff_t idx; u32 hash; struct address_space *mapping; + bool use_hgm = hugetlb_hgm_enabled(dst_vma) && + mode == MCOPY_ATOMIC_CONTINUE; + struct hstate *h = hstate_vma(dst_vma); /* * There is no default zero huge page for all huge page sizes as @@ -335,12 +337,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, copied = 0; page = NULL; vma_hpagesize = vma_kernel_pagesize(dst_vma); + if (use_hgm) + vma_altpagesize = PAGE_SIZE; + else + vma_altpagesize = vma_hpagesize; /* * Validate alignment based on huge page size */ err = -EINVAL; - if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + if (dst_start & (vma_altpagesize - 1) || len & (vma_altpagesize - 1)) goto out_unlock; retry: @@ -361,6 +367,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, vm_shared = dst_vma->vm_flags & VM_SHARED; } + BUG_ON(!vm_shared && use_hgm); + /* * If not shared, ensure the dst_vma has a anon_vma. */ @@ -371,11 +379,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, } while (src_addr < src_start + len) { + struct hugetlb_pte hpte; + bool new_mapping; BUG_ON(dst_addr >= dst_start + len); /* * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even + * i_mmap_rwsem ensures the hpte.ptep remains valid even * in the case of shared pmds. fault mutex prevents * races with other faulting threads. */ @@ -383,27 +393,47 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); hash = hugetlb_fault_mutex_hash(mapping, idx); + /* This lock will get expensive at 4K. */ mutex_lock(&hugetlb_fault_mutex_table[hash]); - err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); - if (!dst_pte) { + err = 0; + + pte_t *ptep = huge_pte_alloc(dst_mm, dst_vma, dst_addr, + vma_hpagesize); + if (!ptep) + err = -ENOMEM; + else { + hugetlb_pte_populate(&hpte, ptep, + huge_page_shift(h)); + /* + * If the hstate-level PTE is not none, then a mapping + * was previously established. + * The per-hpage mutex prevents double-counting. + */ + new_mapping = hugetlb_pte_none(&hpte); + if (use_hgm) + err = hugetlb_alloc_largest_pte(&hpte, dst_mm, dst_vma, + dst_addr, + dst_start + len); + } + + if (err) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } if (mode != MCOPY_ATOMIC_CONTINUE && - !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { + !hugetlb_pte_none_mostly(&hpte)) { err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } - err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + err = hugetlb_mcopy_atomic_pte(dst_mm, &hpte, dst_vma, dst_addr, src_addr, mode, &page, - wp_copy); + wp_copy, new_mapping); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -413,6 +443,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, if (unlikely(err == -ENOENT)) { mmap_read_unlock(dst_mm); BUG_ON(!page); + BUG_ON(hpte.shift != huge_page_shift(h)); err = copy_huge_page_from_user(page, (const void __user *)src_addr, @@ -430,9 +461,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, BUG_ON(page); if (!err) { - dst_addr += vma_hpagesize; - src_addr += vma_hpagesize; - copied += vma_hpagesize; + dst_addr += hugetlb_pte_size(&hpte); + src_addr += hugetlb_pte_size(&hpte); + copied += hugetlb_pte_size(&hpte); if (fatal_signal_pending(current)) err = -EINTR;
The changes here are very similar to the changes made to hugetlb_no_page, where we do a high-granularity page table walk and do accounting slightly differently because we are mapping only a piece of a page. Signed-off-by: James Houghton <jthoughton@google.com> --- fs/userfaultfd.c | 3 +++ include/linux/hugetlb.h | 6 +++-- mm/hugetlb.c | 54 +++++++++++++++++++++----------------- mm/userfaultfd.c | 57 +++++++++++++++++++++++++++++++---------- 4 files changed, 82 insertions(+), 38 deletions(-)