Message ID: 20230201081737.2330141-5-fengwei.yin@intel.com (mailing list archive)
State: New
Series: folio based filemap_map_pages()
On 01.02.23 09:17, Yin Fengwei wrote:
> do_set_pte_range() allows to setup page table entries for a
> specific range. It calls page_add_file_rmap_range() to take
> advantage of batched rmap update for large folio.
>
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>  include/linux/mm.h |  2 ++
>  mm/filemap.c       |  1 -
>  mm/memory.c        | 60 ++++++++++++++++++++++++++++++++++++----------
>  3 files changed, 49 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d6f8f41514cc..96e08fcdce24 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>
>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long start, unsigned long addr, unsigned int nr);
>
>  vm_fault_t finish_fault(struct vm_fault *vmf);
>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 9cc5edd8f998..95f634d11581 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>
>  		ref_count++;
>  		do_set_pte(vmf, page, addr);
> -		update_mmu_cache(vma, addr, vmf->pte);
>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>
>  	/*
> diff --git a/mm/memory.c b/mm/memory.c
> index 51c04bb60724..7e41142e1e4f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4257,7 +4257,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  }
>  #endif
>
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +static void do_set_pte_entry(struct vm_fault *vmf, struct page *page,
> +		unsigned long addr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
> @@ -4277,16 +4278,52 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  	if (unlikely(uffd_wp))
>  		entry = pte_mkuffd_wp(entry);
> -	/* copy-on-write page */
> -	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -		page_add_new_anon_rmap(page, vma, addr);
> -		lru_cache_add_inactive_or_unevictable(page, vma);
> -	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
> -	}
>  	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
> +
> +	/* no need to invalidate: a not-present page won't be cached */
> +	update_mmu_cache(vma, addr, vmf->pte);
> +}
> +
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long start, unsigned long addr, unsigned int nr)
> +{
> +	unsigned int i = 0;
> +	struct page *page = folio_page(folio, start);
> +	struct vm_area_struct *vma = vmf->vma;
> +	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
> +			!(vma->vm_flags & VM_SHARED);
> +
> +	/*
> +	 * file page: batched update rmap, mm counter.
> +	 * copy-on-write page: batched update mm counter.
> +	 */
> +	if (!cow) {
> +		page_add_file_rmap_range(folio, start, nr, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +	} else
> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
> +
> +	do {
> +		if (cow) {
> +			page_add_new_anon_rmap(page, vma, addr);

This doesn't work with anon pages the way you intended in this patch.

Please leave anon pages out of the picture for now and make
do_set_pte_range() only deal with !cow.
On 2/1/2023 5:09 PM, David Hildenbrand wrote:
> On 01.02.23 09:17, Yin Fengwei wrote:
>> do_set_pte_range() allows to setup page table entries for a
>> specific range. It calls page_add_file_rmap_range() to take
>> advantage of batched rmap update for large folio.
>>
>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>> ---
>>  include/linux/mm.h |  2 ++
>>  mm/filemap.c       |  1 -
>>  mm/memory.c        | 60 ++++++++++++++++++++++++++++++++++++----------
>>  3 files changed, 49 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index d6f8f41514cc..96e08fcdce24 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +		unsigned long start, unsigned long addr, unsigned int nr);
>>  vm_fault_t finish_fault(struct vm_fault *vmf);
>>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index 9cc5edd8f998..95f634d11581 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>  		ref_count++;
>>  		do_set_pte(vmf, page, addr);
>> -		update_mmu_cache(vma, addr, vmf->pte);
>>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>  	/*
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 51c04bb60724..7e41142e1e4f 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4257,7 +4257,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>  }
>>  #endif
>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +static void do_set_pte_entry(struct vm_fault *vmf, struct page *page,
>> +		unsigned long addr)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>> @@ -4277,16 +4278,52 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>  	if (unlikely(uffd_wp))
>>  		entry = pte_mkuffd_wp(entry);
>> -	/* copy-on-write page */
>> -	if (write && !(vma->vm_flags & VM_SHARED)) {
>> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> -		page_add_new_anon_rmap(page, vma, addr);
>> -		lru_cache_add_inactive_or_unevictable(page, vma);
>> -	} else {
>> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
>> -		page_add_file_rmap(page, vma, false);
>> -	}
>>  	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
>> +
>> +	/* no need to invalidate: a not-present page won't be cached */
>> +	update_mmu_cache(vma, addr, vmf->pte);
>> +}
>> +
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +		unsigned long start, unsigned long addr, unsigned int nr)
>> +{
>> +	unsigned int i = 0;
>> +	struct page *page = folio_page(folio, start);
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>> +			!(vma->vm_flags & VM_SHARED);
>> +
>> +	/*
>> +	 * file page: batched update rmap, mm counter.
>> +	 * copy-on-write page: batched update mm counter.
>> +	 */
>> +	if (!cow) {
>> +		page_add_file_rmap_range(folio, start, nr, vma, false);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +	} else
>> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
>> +
>> +	do {
>> +		if (cow) {
>> +			page_add_new_anon_rmap(page, vma, addr);
>
> This doesn't work with anon pages the way you intended in this patch.
>
> Please leave anon pages out of the picture for now and make
> do_set_pte_range() only deal with !cow.

OK. I will move the check to do_set_pte() and only call
do_set_pte_range() when it's !cow. Thanks.


Regards
Yin, Fengwei
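For reference, one possible shape of that rework, purely a sketch (the next revision of the series may do this differently): do_set_pte() keeps the existing single-page anon/COW path and routes only file-backed (!cow) pages through the batched do_set_pte_range(). The do_set_pte_entry() and do_set_pte_range() names are the ones introduced by this patch; nothing here is the actual follow-up code.

```c
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
	struct vm_area_struct *vma = vmf->vma;
	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
			!(vma->vm_flags & VM_SHARED);

	if (cow) {
		/* anon/COW page: keep the existing single-page path */
		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, addr);
		lru_cache_add_inactive_or_unevictable(page, vma);
		do_set_pte_entry(vmf, page, addr);
	} else {
		/* file page: one call into the batched range path */
		struct folio *folio = page_folio(page);

		do_set_pte_range(vmf, folio, folio_page_idx(folio, page),
				 addr, 1);
		/*
		 * do_set_pte_range() advances vmf->pte; existing callers
		 * of do_set_pte() don't expect that, so restore it.
		 */
		vmf->pte--;
	}
}
```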
On Wed, Feb 01, 2023 at 04:17:36PM +0800, Yin Fengwei wrote:
> do_set_pte_range() allows to setup page table entries for a
> specific range. It calls page_add_file_rmap_range() to take
> advantage of batched rmap update for large folio.

How about something more like this?  Yes, we need to define
flush_icache_pages() and PTE_STRIDE.

(we could also do for (i = 0; i < nr; i++) flush_icache_page(...) but
given that some architectures already implement flush_icache_range(),
I think they may appreciate being given one large range to flush)

+++ b/mm/memory.c
@@ -4277,15 +4277,19 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned int start, unsigned int nr,
+		unsigned long addr)
 {
+	struct page *page = folio_page(page, start);
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool prefault = vmf->address != addr;
 	pte_t entry;
+	unsigned int i;
 
-	flush_icache_page(vma, page);
+	flush_icache_pages(vma, page, nr);
 	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (prefault && arch_wants_old_prefaulted_pte())
@@ -4299,14 +4303,23 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = pte_mkuffd_wp(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, addr);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+		for (i = 0; i < nr; i++) {
+			page_add_new_anon_rmap(page + i, vma, addr);
+			lru_cache_add_inactive_or_unevictable(page + i, vma);
+		}
 	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		folio_add_file_rmap(folio, start, n, vma);
+	}
+
+	for (i = 0; i < nr; i++) {
+		set_pte_at(vma->vm_mm, addr, vmf->pte + i, entry);
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, addr, vmf->pte + i);
+		addr += PAGE_SIZE;
+		entry += PTE_STRIDE;
 	}
-	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
On Wed, Feb 01, 2023 at 05:38:39PM +0000, Matthew Wilcox wrote:
> On Wed, Feb 01, 2023 at 04:17:36PM +0800, Yin Fengwei wrote:
> > do_set_pte_range() allows to setup page table entries for a
> > specific range. It calls page_add_file_rmap_range() to take
> > advantage of batched rmap update for large folio.
>
> How about something more like this?  Yes, we need to define
> flush_icache_pages() and PTE_STRIDE.

Never mind about PTE_STRIDE.  I forgot that pte_t isn't an integer type.
Instead, we'll want each architecture to define

/* This should be right for x86 */
static inline pte_t pte_next(pte_t pte)
{
	return __pte(pte_val(pte) + PAGE_SIZE);
}

> +	for (i = 0; i < nr; i++) {
> +		set_pte_at(vma->vm_mm, addr, vmf->pte + i, entry);
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, vmf->pte + i);
> +		addr += PAGE_SIZE;
> +		entry += PTE_STRIDE;

		entry = pte_next(entry);
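Purely for readability, the two fragments above can be combined as follows; this is just the loop from the earlier proposal with the PTE_STRIDE placeholder replaced by the suggested pte_next() helper, which itself remains hypothetical and would need a per-architecture definition:

```c
	/* Sketch: set nr consecutive PTEs, stepping the PFN with pte_next() */
	for (i = 0; i < nr; i++) {
		set_pte_at(vma->vm_mm, addr, vmf->pte + i, entry);
		/* no need to invalidate: a not-present page won't be cached */
		update_mmu_cache(vma, addr, vmf->pte + i);
		addr += PAGE_SIZE;
		entry = pte_next(entry);	/* hypothetical per-arch helper */
	}
```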
On 2/2/2023 1:38 AM, Matthew Wilcox wrote:
> On Wed, Feb 01, 2023 at 04:17:36PM +0800, Yin Fengwei wrote:
>> do_set_pte_range() allows to setup page table entries for a
>> specific range. It calls page_add_file_rmap_range() to take
>> advantage of batched rmap update for large folio.
>
> How about something more like this?  Yes, we need to define
> flush_icache_pages() and PTE_STRIDE.
Yes. This could remove more duplicated operations here. Let me try to
add flush_icache_pages() in the next version.

But pte_next() needs to be added for all architectures, so I suppose
that will be a long journey. What about a follow-up patch for
pte_next()?


Regards
Yin, Fengwei

>
> (we could also do for (i = 0; i < nr; i++) flush_icache_page(...) but
> given that some architectures already implement flush_icache_range(),
> I think they may appreciate being given one large range to flush)
>
> +++ b/mm/memory.c
> @@ -4277,15 +4277,19 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page
> *page)
>  }
>  #endif
>
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned int start, unsigned int nr,
> +		unsigned long addr)
>  {
> +	struct page *page = folio_page(page, start);
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	bool prefault = vmf->address != addr;
>  	pte_t entry;
> +	unsigned int i;
>
> -	flush_icache_page(vma, page);
> +	flush_icache_pages(vma, page, nr);
>  	entry = mk_pte(page, vma->vm_page_prot);
>
>  	if (prefault && arch_wants_old_prefaulted_pte())
> @@ -4299,14 +4303,23 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  		entry = pte_mkuffd_wp(pte_wrprotect(entry));
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -		page_add_new_anon_rmap(page, vma, addr);
> -		lru_cache_add_inactive_or_unevictable(page, vma);
> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
> +		for (i = 0; i < nr; i++) {
> +			page_add_new_anon_rmap(page + i, vma, addr);
> +			lru_cache_add_inactive_or_unevictable(page + i, vma);
> +		}
>  	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +		folio_add_file_rmap(folio, start, n, vma);
> +	}
> +
> +	for (i = 0; i < nr; i++) {
> +		set_pte_at(vma->vm_mm, addr, vmf->pte + i, entry);
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, vmf->pte + i);
> +		addr += PAGE_SIZE;
> +		entry += PTE_STRIDE;
>  	}
> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
>  }
>
>  static bool vmf_pte_changed(struct vm_fault *vmf)
>
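For illustration, a minimal generic fallback for the flush_icache_pages() helper discussed above could simply loop over the existing per-page flush, leaving architectures with an efficient ranged flush free to override it. This is only a sketch of the idea raised in this thread, assuming such a fallback would sit alongside flush_icache_page(); it is not the eventual patch:

```c
/*
 * Hypothetical generic fallback: flush the instruction cache for nr
 * consecutive pages by reusing the existing per-page helper.
 * Architectures that prefer one large ranged flush could provide
 * their own flush_icache_pages() instead.
 */
#ifndef flush_icache_pages
static inline void flush_icache_pages(struct vm_area_struct *vma,
				      struct page *page, unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		flush_icache_page(vma, page + i);
}
#endif
```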
On 2/2/23 01:38, Matthew Wilcox wrote:
> On Wed, Feb 01, 2023 at 04:17:36PM +0800, Yin Fengwei wrote:
>> do_set_pte_range() allows to setup page table entries for a
>> specific range. It calls page_add_file_rmap_range() to take
>> advantage of batched rmap update for large folio.
>
> How about something more like this?  Yes, we need to define
> flush_icache_pages() and PTE_STRIDE.
>
> (we could also do for (i = 0; i < nr; i++) flush_icache_page(...) but
> given that some architectures already implement flush_icache_range(),
> I think they may appreciate being given one large range to flush)
For flush_icache_range() and flush_icache_page(), the riscv
implementation could be an exception. According to
arch/riscv/include/asm/cacheflush.h:

  #define flush_icache_range(start, end) flush_icache_all()

There is no definition of flush_icache_page(), so I suppose
flush_icache_page() does nothing on riscv. Using flush_icache_range()
may therefore not be a good choice for riscv.


Regards
Yin, Fengwei

>
> +++ b/mm/memory.c
> @@ -4277,15 +4277,19 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page
> *page)
>  }
>  #endif
>
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned int start, unsigned int nr,
> +		unsigned long addr)
>  {
> +	struct page *page = folio_page(page, start);
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	bool prefault = vmf->address != addr;
>  	pte_t entry;
> +	unsigned int i;
>
> -	flush_icache_page(vma, page);
> +	flush_icache_pages(vma, page, nr);
>  	entry = mk_pte(page, vma->vm_page_prot);
>
>  	if (prefault && arch_wants_old_prefaulted_pte())
> @@ -4299,14 +4303,23 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  		entry = pte_mkuffd_wp(pte_wrprotect(entry));
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -		page_add_new_anon_rmap(page, vma, addr);
> -		lru_cache_add_inactive_or_unevictable(page, vma);
> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
> +		for (i = 0; i < nr; i++) {
> +			page_add_new_anon_rmap(page + i, vma, addr);
> +			lru_cache_add_inactive_or_unevictable(page + i, vma);
> +		}
>  	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +		folio_add_file_rmap(folio, start, n, vma);
> +	}
> +
> +	for (i = 0; i < nr; i++) {
> +		set_pte_at(vma->vm_mm, addr, vmf->pte + i, entry);
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, vmf->pte + i);
> +		addr += PAGE_SIZE;
> +		entry += PTE_STRIDE;
>  	}
> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
>  }
>
>  static bool vmf_pte_changed(struct vm_fault *vmf)
>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6f8f41514cc..96e08fcdce24 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1162,6 +1162,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long start, unsigned long addr, unsigned int nr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9cc5edd8f998..95f634d11581 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 
 		ref_count++;
 		do_set_pte(vmf, page, addr);
-		update_mmu_cache(vma, addr, vmf->pte);
 	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index 51c04bb60724..7e41142e1e4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4257,7 +4257,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+static void do_set_pte_entry(struct vm_fault *vmf, struct page *page,
+		unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
@@ -4277,16 +4278,52 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	if (unlikely(uffd_wp))
 		entry = pte_mkuffd_wp(entry);
-	/* copy-on-write page */
-	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, addr);
-		lru_cache_add_inactive_or_unevictable(page, vma);
-	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
-	}
 	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+
+	/* no need to invalidate: a not-present page won't be cached */
+	update_mmu_cache(vma, addr, vmf->pte);
+}
+
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long start, unsigned long addr, unsigned int nr)
+{
+	unsigned int i = 0;
+	struct page *page = folio_page(folio, start);
+	struct vm_area_struct *vma = vmf->vma;
+	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vma->vm_flags & VM_SHARED);
+
+	/*
+	 * file page: batched update rmap, mm counter.
+	 * copy-on-write page: batched update mm counter.
+	 */
+	if (!cow) {
+		page_add_file_rmap_range(folio, start, nr, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+	} else
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+
+	do {
+		if (cow) {
+			page_add_new_anon_rmap(page, vma, addr);
+			lru_cache_add_inactive_or_unevictable(page, vma);
+		}
+
+		do_set_pte_entry(vmf, page, addr);
+	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++i < nr);
+}
+
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+{
+	struct folio *folio = page_folio(page);
+
+	do_set_pte_range(vmf, folio, folio_page_idx(folio, page), addr, 1);
+
+	/*
+	 * do_set_pte_range changes vmf->pte. Restore it back as
+	 * do_set_pte doesn't expect the change of vmf->pte.
+	 */
+	vmf->pte--;
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4361,9 +4398,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	if (likely(!vmf_pte_changed(vmf))) {
 		do_set_pte(vmf, page, vmf->address);
 
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
-
 		ret = 0;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
do_set_pte_range() allows to setup page table entries for a
specific range. It calls page_add_file_rmap_range() to take
advantage of batched rmap update for large folio.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 include/linux/mm.h |  2 ++
 mm/filemap.c       |  1 -
 mm/memory.c        | 60 ++++++++++++++++++++++++++++++++++++----------
 3 files changed, 49 insertions(+), 14 deletions(-)
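To make the intended usage concrete, here is an illustrative caller, not part of the series: it assumes vmf->pte already points at the page-table slot for addr, the PTE lock is held, and the folio has at least 16 page-cache pages to map. The point is only that one do_set_pte_range() call replaces a loop of per-page do_set_pte() calls, so the rmap and mm counters are updated once per run instead of once per page.

```c
/*
 * Illustrative only: map a 16-page run of a large page-cache folio.
 * With do_set_pte() the caller would loop 16 times and each iteration
 * would touch the rmap and the mm counter; do_set_pte_range() batches
 * that bookkeeping and keeps only the per-PTE work in its inner loop.
 */
static void example_map_run(struct vm_fault *vmf, struct folio *folio,
			    unsigned long addr)
{
	/* start at page 0 of the folio, map 16 consecutive PTEs */
	do_set_pte_range(vmf, folio, 0, addr, 16);

	/* vmf->pte now points just past the last entry that was set */
}
```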