Message ID | 20230203131636.1648662-4-fengwei.yin@intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | folio based filemap_map_pages() | expand |
On 03.02.23 14:16, Yin Fengwei wrote: > do_set_pte_range() allows to setup page table entries for a > specific range. It calls folio_add_file_rmap_range() to take > advantage of batched rmap update for large folio. > > Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> > --- > include/linux/mm.h | 3 +++ > mm/filemap.c | 1 - > mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- > 3 files changed, 42 insertions(+), 21 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index d6f8f41514cc..93192f04b276 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) > > vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); > void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); > +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > + unsigned long addr, pte_t *pte, > + unsigned long start, unsigned int nr); > > vm_fault_t finish_fault(struct vm_fault *vmf); > vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); > diff --git a/mm/filemap.c b/mm/filemap.c > index f444684db9f2..74046a3a0ff5 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, > > ref_count++; > do_set_pte(vmf, page, addr); > - update_mmu_cache(vma, addr, vmf->pte); > } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); > > /* Restore the vmf->pte */ > diff --git a/mm/memory.c b/mm/memory.c > index 7a04a1130ec1..3754b2ef166a 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) > } > #endif > > -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) > +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > + unsigned long addr, pte_t *pte, > + unsigned long start, unsigned int nr) > { > struct vm_area_struct *vma = 
vmf->vma; > bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); > bool write = vmf->flags & FAULT_FLAG_WRITE; > + bool cow = write && !(vma->vm_flags & VM_SHARED); > bool prefault = vmf->address != addr; > + struct page *page = folio_page(folio, start); > pte_t entry; > > - flush_icache_page(vma, page); > - entry = mk_pte(page, vma->vm_page_prot); > + if (!cow) { > + folio_add_file_rmap_range(folio, start, nr, vma, false); > + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); > + } > > - if (prefault && arch_wants_old_prefaulted_pte()) > - entry = pte_mkold(entry); > - else > - entry = pte_sw_mkyoung(entry); > + do { > + flush_icache_page(vma, page); > + entry = mk_pte(page, vma->vm_page_prot); > > - if (write) > - entry = maybe_mkwrite(pte_mkdirty(entry), vma); > - if (unlikely(uffd_wp)) > - entry = pte_mkuffd_wp(entry); > - /* copy-on-write page */ > - if (write && !(vma->vm_flags & VM_SHARED)) { > + if (prefault && arch_wants_old_prefaulted_pte()) > + entry = pte_mkold(entry); > + else > + entry = pte_sw_mkyoung(entry); > + > + if (write) > + entry = maybe_mkwrite(pte_mkdirty(entry), vma); > + if (unlikely(uffd_wp)) > + entry = pte_mkuffd_wp(entry); > + set_pte_at(vma->vm_mm, addr, pte, entry); > + > + /* no need to invalidate: a not-present page won't be cached */ > + update_mmu_cache(vma, addr, pte); > + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); > +} > + > +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) > +{ > + struct folio *folio = page_folio(page); > + struct vm_area_struct *vma = vmf->vma; > + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && > + !(vma->vm_flags & VM_SHARED); > + > + if (cow) { > inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > page_add_new_anon_rmap(page, vma, addr); As raised, we cannot PTE-map a multi-page folio that way. This function only supports single-page anon folios. page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). 
As that documents: "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page. So this really doesn't work for multi-page folios, and if the function were used for that, we'd be in trouble. We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess. update_mmu_tlb(vma, vmf->address, vmf->pte); Right now the function looks like it might just handle that.
On 2/3/2023 9:25 PM, David Hildenbrand wrote: > On 03.02.23 14:16, Yin Fengwei wrote: >> do_set_pte_range() allows to setup page table entries for a >> specific range. It calls folio_add_file_rmap_range() to take >> advantage of batched rmap update for large folio. >> >> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> >> --- >> include/linux/mm.h | 3 +++ >> mm/filemap.c | 1 - >> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- >> 3 files changed, 42 insertions(+), 21 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index d6f8f41514cc..93192f04b276 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) >> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); >> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >> + unsigned long addr, pte_t *pte, >> + unsigned long start, unsigned int nr); >> vm_fault_t finish_fault(struct vm_fault *vmf); >> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); >> diff --git a/mm/filemap.c b/mm/filemap.c >> index f444684db9f2..74046a3a0ff5 100644 >> --- a/mm/filemap.c >> +++ b/mm/filemap.c >> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, >> ref_count++; >> do_set_pte(vmf, page, addr); >> - update_mmu_cache(vma, addr, vmf->pte); >> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); >> /* Restore the vmf->pte */ >> diff --git a/mm/memory.c b/mm/memory.c >> index 7a04a1130ec1..3754b2ef166a 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) >> } >> #endif >> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >> + unsigned long addr, pte_t *pte, 
>> + unsigned long start, unsigned int nr) >> { >> struct vm_area_struct *vma = vmf->vma; >> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); >> bool write = vmf->flags & FAULT_FLAG_WRITE; >> + bool cow = write && !(vma->vm_flags & VM_SHARED); >> bool prefault = vmf->address != addr; >> + struct page *page = folio_page(folio, start); >> pte_t entry; >> - flush_icache_page(vma, page); >> - entry = mk_pte(page, vma->vm_page_prot); >> + if (!cow) { >> + folio_add_file_rmap_range(folio, start, nr, vma, false); >> + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); >> + } >> - if (prefault && arch_wants_old_prefaulted_pte()) >> - entry = pte_mkold(entry); >> - else >> - entry = pte_sw_mkyoung(entry); >> + do { >> + flush_icache_page(vma, page); >> + entry = mk_pte(page, vma->vm_page_prot); >> - if (write) >> - entry = maybe_mkwrite(pte_mkdirty(entry), vma); >> - if (unlikely(uffd_wp)) >> - entry = pte_mkuffd_wp(entry); >> - /* copy-on-write page */ >> - if (write && !(vma->vm_flags & VM_SHARED)) { >> + if (prefault && arch_wants_old_prefaulted_pte()) >> + entry = pte_mkold(entry); >> + else >> + entry = pte_sw_mkyoung(entry); >> + >> + if (write) >> + entry = maybe_mkwrite(pte_mkdirty(entry), vma); >> + if (unlikely(uffd_wp)) >> + entry = pte_mkuffd_wp(entry); >> + set_pte_at(vma->vm_mm, addr, pte, entry); >> + >> + /* no need to invalidate: a not-present page won't be cached */ >> + update_mmu_cache(vma, addr, pte); >> + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); >> +} >> + >> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >> +{ >> + struct folio *folio = page_folio(page); >> + struct vm_area_struct *vma = vmf->vma; >> + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && >> + !(vma->vm_flags & VM_SHARED); >> + >> + if (cow) { >> inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >> page_add_new_anon_rmap(page, vma, addr); > > As raised, we cannot PTE-map a multi-page folio that way. 
> > This function only supports single-page anon folios. > > page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents: > > "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page. > > So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble. > > We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess. > update_mmu_tlb(vma, vmf->address, vmf->pte); > > Right now the function looks like it might just handle that. You are right. I thought moving the cow case out of it could make it explicit, but it looks like it doesn't. I will add WARN_ON_ONCE(). Thanks. Regards Yin, Fengwei >
On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote: > do_set_pte_range() allows to setup page table entries for a > specific range. It calls folio_add_file_rmap_range() to take > advantage of batched rmap update for large folio. > > Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> > --- > include/linux/mm.h | 3 +++ > mm/filemap.c | 1 - > mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- > 3 files changed, 42 insertions(+), 21 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index d6f8f41514cc..93192f04b276 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) > > vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); > void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); > +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > + unsigned long addr, pte_t *pte, > + unsigned long start, unsigned int nr); > > vm_fault_t finish_fault(struct vm_fault *vmf); > vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); > diff --git a/mm/filemap.c b/mm/filemap.c > index f444684db9f2..74046a3a0ff5 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, > > ref_count++; > do_set_pte(vmf, page, addr); > - update_mmu_cache(vma, addr, vmf->pte); > } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); > > /* Restore the vmf->pte */ > diff --git a/mm/memory.c b/mm/memory.c > index 7a04a1130ec1..3754b2ef166a 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) > } > #endif > > -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) > +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > + unsigned long addr, pte_t *pte, > + unsigned long start, unsigned int nr) > { > struct 
vm_area_struct *vma = vmf->vma; > bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); > bool write = vmf->flags & FAULT_FLAG_WRITE; > + bool cow = write && !(vma->vm_flags & VM_SHARED); Why don't use is_cow_mapping()? Is there anything messed up with VM_MAYWRITE? > bool prefault = vmf->address != addr; > + struct page *page = folio_page(folio, start); > pte_t entry; > > - flush_icache_page(vma, page); > - entry = mk_pte(page, vma->vm_page_prot); > + if (!cow) { > + folio_add_file_rmap_range(folio, start, nr, vma, false); > + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); > + } > > - if (prefault && arch_wants_old_prefaulted_pte()) > - entry = pte_mkold(entry); > - else > - entry = pte_sw_mkyoung(entry); > + do { > + flush_icache_page(vma, page); > + entry = mk_pte(page, vma->vm_page_prot); > > - if (write) > - entry = maybe_mkwrite(pte_mkdirty(entry), vma); > - if (unlikely(uffd_wp)) > - entry = pte_mkuffd_wp(entry); > - /* copy-on-write page */ > - if (write && !(vma->vm_flags & VM_SHARED)) { > + if (prefault && arch_wants_old_prefaulted_pte()) > + entry = pte_mkold(entry); > + else > + entry = pte_sw_mkyoung(entry); > + > + if (write) > + entry = maybe_mkwrite(pte_mkdirty(entry), vma); > + if (unlikely(uffd_wp)) > + entry = pte_mkuffd_wp(entry); > + set_pte_at(vma->vm_mm, addr, pte, entry); > + > + /* no need to invalidate: a not-present page won't be cached */ > + update_mmu_cache(vma, addr, pte); > + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); > +} > + > +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) > +{ > + struct folio *folio = page_folio(page); > + struct vm_area_struct *vma = vmf->vma; > + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && > + !(vma->vm_flags & VM_SHARED); Here too. 
> + > + if (cow) { > inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > page_add_new_anon_rmap(page, vma, addr); > lru_cache_add_inactive_or_unevictable(page, vma); > - } else { > - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); > - page_add_file_rmap(page, vma, false); > } > - set_pte_at(vma->vm_mm, addr, vmf->pte, entry); > + > + do_set_pte_range(vmf, folio, addr, vmf->pte, > + folio_page_idx(folio, page), 1); > } > > static bool vmf_pte_changed(struct vm_fault *vmf) > @@ -4361,9 +4383,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf) > if (likely(!vmf_pte_changed(vmf))) { > do_set_pte(vmf, page, vmf->address); > > - /* no need to invalidate: a not-present page won't be cached */ > - update_mmu_cache(vma, vmf->address, vmf->pte); > - > ret = 0; > } else { > update_mmu_tlb(vma, vmf->address, vmf->pte); > -- > 2.30.2 > > Thanks, Chih-En Lin
On 03.02.23 14:30, Yin, Fengwei wrote: > > > On 2/3/2023 9:25 PM, David Hildenbrand wrote: >> On 03.02.23 14:16, Yin Fengwei wrote: >>> do_set_pte_range() allows to setup page table entries for a >>> specific range. It calls folio_add_file_rmap_range() to take >>> advantage of batched rmap update for large folio. >>> >>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> >>> --- >>> include/linux/mm.h | 3 +++ >>> mm/filemap.c | 1 - >>> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- >>> 3 files changed, 42 insertions(+), 21 deletions(-) >>> >>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>> index d6f8f41514cc..93192f04b276 100644 >>> --- a/include/linux/mm.h >>> +++ b/include/linux/mm.h >>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) >>> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); >>> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); >>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>> + unsigned long addr, pte_t *pte, >>> + unsigned long start, unsigned int nr); >>> vm_fault_t finish_fault(struct vm_fault *vmf); >>> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); >>> diff --git a/mm/filemap.c b/mm/filemap.c >>> index f444684db9f2..74046a3a0ff5 100644 >>> --- a/mm/filemap.c >>> +++ b/mm/filemap.c >>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, >>> ref_count++; >>> do_set_pte(vmf, page, addr); >>> - update_mmu_cache(vma, addr, vmf->pte); >>> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); >>> /* Restore the vmf->pte */ >>> diff --git a/mm/memory.c b/mm/memory.c >>> index 7a04a1130ec1..3754b2ef166a 100644 >>> --- a/mm/memory.c >>> +++ b/mm/memory.c >>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) >>> } >>> #endif >>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >>> +void 
do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>> + unsigned long addr, pte_t *pte, >>> + unsigned long start, unsigned int nr) >>> { >>> struct vm_area_struct *vma = vmf->vma; >>> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); >>> bool write = vmf->flags & FAULT_FLAG_WRITE; >>> + bool cow = write && !(vma->vm_flags & VM_SHARED); >>> bool prefault = vmf->address != addr; >>> + struct page *page = folio_page(folio, start); >>> pte_t entry; >>> - flush_icache_page(vma, page); >>> - entry = mk_pte(page, vma->vm_page_prot); >>> + if (!cow) { >>> + folio_add_file_rmap_range(folio, start, nr, vma, false); >>> + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); >>> + } >>> - if (prefault && arch_wants_old_prefaulted_pte()) >>> - entry = pte_mkold(entry); >>> - else >>> - entry = pte_sw_mkyoung(entry); >>> + do { >>> + flush_icache_page(vma, page); >>> + entry = mk_pte(page, vma->vm_page_prot); >>> - if (write) >>> - entry = maybe_mkwrite(pte_mkdirty(entry), vma); >>> - if (unlikely(uffd_wp)) >>> - entry = pte_mkuffd_wp(entry); >>> - /* copy-on-write page */ >>> - if (write && !(vma->vm_flags & VM_SHARED)) { >>> + if (prefault && arch_wants_old_prefaulted_pte()) >>> + entry = pte_mkold(entry); >>> + else >>> + entry = pte_sw_mkyoung(entry); >>> + >>> + if (write) >>> + entry = maybe_mkwrite(pte_mkdirty(entry), vma); >>> + if (unlikely(uffd_wp)) >>> + entry = pte_mkuffd_wp(entry); >>> + set_pte_at(vma->vm_mm, addr, pte, entry); >>> + >>> + /* no need to invalidate: a not-present page won't be cached */ >>> + update_mmu_cache(vma, addr, pte); >>> + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); >>> +} >>> + >>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >>> +{ >>> + struct folio *folio = page_folio(page); >>> + struct vm_area_struct *vma = vmf->vma; >>> + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && >>> + !(vma->vm_flags & VM_SHARED); >>> + >>> + if (cow) { >>> inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >>> 
page_add_new_anon_rmap(page, vma, addr); >> >> As raised, we cannot PTE-map a multi-page folio that way. >> >> This function only supports single-page anon folios. >> >> page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents: >> >> "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page. >> >> So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble. >> >> We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess. >> update_mmu_tlb(vma, vmf->address, vmf->pte); >> >> Right now the function looks like it might just handle that. > You are right. I thought moving cow case out of it can make it explicit. > But looks like it doesn't. I will add WARN_ON_ONCE(). Thanks. I guess I would move the cow check into do_set_pte_range() as well, and verify in there that we are really only dealing with a single-page folio, commenting that rmap code would need serious adjustment to make it work and that current code never passes a multi-page folio. Thanks!
On 2/3/2023 9:32 PM, Chih-En Lin wrote: > On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote: >> do_set_pte_range() allows to setup page table entries for a >> specific range. It calls folio_add_file_rmap_range() to take >> advantage of batched rmap update for large folio. >> >> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> >> --- >> include/linux/mm.h | 3 +++ >> mm/filemap.c | 1 - >> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- >> 3 files changed, 42 insertions(+), 21 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index d6f8f41514cc..93192f04b276 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) >> >> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); >> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >> + unsigned long addr, pte_t *pte, >> + unsigned long start, unsigned int nr); >> >> vm_fault_t finish_fault(struct vm_fault *vmf); >> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); >> diff --git a/mm/filemap.c b/mm/filemap.c >> index f444684db9f2..74046a3a0ff5 100644 >> --- a/mm/filemap.c >> +++ b/mm/filemap.c >> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, >> >> ref_count++; >> do_set_pte(vmf, page, addr); >> - update_mmu_cache(vma, addr, vmf->pte); >> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); >> >> /* Restore the vmf->pte */ >> diff --git a/mm/memory.c b/mm/memory.c >> index 7a04a1130ec1..3754b2ef166a 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) >> } >> #endif >> >> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >> + 
unsigned long addr, pte_t *pte, >> + unsigned long start, unsigned int nr) >> { >> struct vm_area_struct *vma = vmf->vma; >> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); >> bool write = vmf->flags & FAULT_FLAG_WRITE; >> + bool cow = write && !(vma->vm_flags & VM_SHARED); > > Why don't use is_cow_mapping()? > Is there anything messed up with VM_MAYWRITE? My understanding is it's not related with the mapping. It's related with what operation triggers fault here. Say, if it's a writable mapping, and if the read operation triggers fault here, no cow or maybe_mkwrite() needed here. Thanks. Regards Yin, Fengwei > >> bool prefault = vmf->address != addr; >> + struct page *page = folio_page(folio, start); >> pte_t entry; >> >> - flush_icache_page(vma, page); >> - entry = mk_pte(page, vma->vm_page_prot); >> + if (!cow) { >> + folio_add_file_rmap_range(folio, start, nr, vma, false); >> + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); >> + } >> >> - if (prefault && arch_wants_old_prefaulted_pte()) >> - entry = pte_mkold(entry); >> - else >> - entry = pte_sw_mkyoung(entry); >> + do { >> + flush_icache_page(vma, page); >> + entry = mk_pte(page, vma->vm_page_prot); >> >> - if (write) >> - entry = maybe_mkwrite(pte_mkdirty(entry), vma); >> - if (unlikely(uffd_wp)) >> - entry = pte_mkuffd_wp(entry); >> - /* copy-on-write page */ >> - if (write && !(vma->vm_flags & VM_SHARED)) { >> + if (prefault && arch_wants_old_prefaulted_pte()) >> + entry = pte_mkold(entry); >> + else >> + entry = pte_sw_mkyoung(entry); >> + >> + if (write) >> + entry = maybe_mkwrite(pte_mkdirty(entry), vma); >> + if (unlikely(uffd_wp)) >> + entry = pte_mkuffd_wp(entry); >> + set_pte_at(vma->vm_mm, addr, pte, entry); >> + >> + /* no need to invalidate: a not-present page won't be cached */ >> + update_mmu_cache(vma, addr, pte); >> + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); >> +} >> + >> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >> +{ >> + struct 
folio *folio = page_folio(page); >> + struct vm_area_struct *vma = vmf->vma; >> + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && >> + !(vma->vm_flags & VM_SHARED); > > Here too. > >> + >> + if (cow) { >> inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >> page_add_new_anon_rmap(page, vma, addr); >> lru_cache_add_inactive_or_unevictable(page, vma); >> - } else { >> - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); >> - page_add_file_rmap(page, vma, false); >> } >> - set_pte_at(vma->vm_mm, addr, vmf->pte, entry); >> + >> + do_set_pte_range(vmf, folio, addr, vmf->pte, >> + folio_page_idx(folio, page), 1); >> } >> >> static bool vmf_pte_changed(struct vm_fault *vmf) >> @@ -4361,9 +4383,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf) >> if (likely(!vmf_pte_changed(vmf))) { >> do_set_pte(vmf, page, vmf->address); >> >> - /* no need to invalidate: a not-present page won't be cached */ >> - update_mmu_cache(vma, vmf->address, vmf->pte); >> - >> ret = 0; >> } else { >> update_mmu_tlb(vma, vmf->address, vmf->pte); >> -- >> 2.30.2 >> >> > > Thanks, > Chih-En Lin >
On 2/3/2023 9:34 PM, David Hildenbrand wrote: > On 03.02.23 14:30, Yin, Fengwei wrote: >> >> >> On 2/3/2023 9:25 PM, David Hildenbrand wrote: >>> On 03.02.23 14:16, Yin Fengwei wrote: >>>> do_set_pte_range() allows to setup page table entries for a >>>> specific range. It calls folio_add_file_rmap_range() to take >>>> advantage of batched rmap update for large folio. >>>> >>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> >>>> --- >>>> include/linux/mm.h | 3 +++ >>>> mm/filemap.c | 1 - >>>> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- >>>> 3 files changed, 42 insertions(+), 21 deletions(-) >>>> >>>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>>> index d6f8f41514cc..93192f04b276 100644 >>>> --- a/include/linux/mm.h >>>> +++ b/include/linux/mm.h >>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) >>>> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); >>>> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); >>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>>> + unsigned long addr, pte_t *pte, >>>> + unsigned long start, unsigned int nr); >>>> vm_fault_t finish_fault(struct vm_fault *vmf); >>>> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); >>>> diff --git a/mm/filemap.c b/mm/filemap.c >>>> index f444684db9f2..74046a3a0ff5 100644 >>>> --- a/mm/filemap.c >>>> +++ b/mm/filemap.c >>>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, >>>> ref_count++; >>>> do_set_pte(vmf, page, addr); >>>> - update_mmu_cache(vma, addr, vmf->pte); >>>> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); >>>> /* Restore the vmf->pte */ >>>> diff --git a/mm/memory.c b/mm/memory.c >>>> index 7a04a1130ec1..3754b2ef166a 100644 >>>> --- a/mm/memory.c >>>> +++ b/mm/memory.c >>>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) >>>> } >>>> #endif >>>> -void 
do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>>> + unsigned long addr, pte_t *pte, >>>> + unsigned long start, unsigned int nr) >>>> { >>>> struct vm_area_struct *vma = vmf->vma; >>>> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); >>>> bool write = vmf->flags & FAULT_FLAG_WRITE; >>>> + bool cow = write && !(vma->vm_flags & VM_SHARED); >>>> bool prefault = vmf->address != addr; >>>> + struct page *page = folio_page(folio, start); >>>> pte_t entry; >>>> - flush_icache_page(vma, page); >>>> - entry = mk_pte(page, vma->vm_page_prot); >>>> + if (!cow) { >>>> + folio_add_file_rmap_range(folio, start, nr, vma, false); >>>> + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); >>>> + } >>>> - if (prefault && arch_wants_old_prefaulted_pte()) >>>> - entry = pte_mkold(entry); >>>> - else >>>> - entry = pte_sw_mkyoung(entry); >>>> + do { >>>> + flush_icache_page(vma, page); >>>> + entry = mk_pte(page, vma->vm_page_prot); >>>> - if (write) >>>> - entry = maybe_mkwrite(pte_mkdirty(entry), vma); >>>> - if (unlikely(uffd_wp)) >>>> - entry = pte_mkuffd_wp(entry); >>>> - /* copy-on-write page */ >>>> - if (write && !(vma->vm_flags & VM_SHARED)) { >>>> + if (prefault && arch_wants_old_prefaulted_pte()) >>>> + entry = pte_mkold(entry); >>>> + else >>>> + entry = pte_sw_mkyoung(entry); >>>> + >>>> + if (write) >>>> + entry = maybe_mkwrite(pte_mkdirty(entry), vma); >>>> + if (unlikely(uffd_wp)) >>>> + entry = pte_mkuffd_wp(entry); >>>> + set_pte_at(vma->vm_mm, addr, pte, entry); >>>> + >>>> + /* no need to invalidate: a not-present page won't be cached */ >>>> + update_mmu_cache(vma, addr, pte); >>>> + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); >>>> +} >>>> + >>>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >>>> +{ >>>> + struct folio *folio = page_folio(page); >>>> + struct vm_area_struct *vma = vmf->vma; >>>> + bool cow = (vmf->flags & 
FAULT_FLAG_WRITE) && >>>> + !(vma->vm_flags & VM_SHARED); >>>> + >>>> + if (cow) { >>>> inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >>>> page_add_new_anon_rmap(page, vma, addr); >>> >>> As raised, we cannot PTE-map a multi-page folio that way. >>> >>> This function only supports single-page anon folios. >>> >>> page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents: >>> >>> "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page. >>> >>> So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble. >>> >>> We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess. >>> update_mmu_tlb(vma, vmf->address, vmf->pte); >>> >>> Right now the function looks like it might just handle that. >> You are right. I thought moving cow case out of it can make it explicit. >> But looks like it doesn't. I will add WARN_ON_ONCE(). Thanks. > > I guess I would move the cow check into do_set_pte_range() as well, and verify in there that we are really only dealing with a single-page folio, commenting that rmap code would need serious adjustment to make it work and that current code never passes a multi-page folio. Yes. There is !cow in do_set_pte_range() already. I planned to add WARN_ON_ONCE() on else case there. Thanks. Regards Yin, Fengwei > > Thanks! >
On Fri, Feb 03, 2023 at 09:38:15PM +0800, Yin, Fengwei wrote: > > > On 2/3/2023 9:32 PM, Chih-En Lin wrote: > > On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote: > >> do_set_pte_range() allows to setup page table entries for a > >> specific range. It calls folio_add_file_rmap_range() to take > >> advantage of batched rmap update for large folio. > >> > >> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> > >> --- > >> include/linux/mm.h | 3 +++ > >> mm/filemap.c | 1 - > >> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- > >> 3 files changed, 42 insertions(+), 21 deletions(-) > >> > >> diff --git a/include/linux/mm.h b/include/linux/mm.h > >> index d6f8f41514cc..93192f04b276 100644 > >> --- a/include/linux/mm.h > >> +++ b/include/linux/mm.h > >> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) > >> > >> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); > >> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); > >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > >> + unsigned long addr, pte_t *pte, > >> + unsigned long start, unsigned int nr); > >> > >> vm_fault_t finish_fault(struct vm_fault *vmf); > >> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); > >> diff --git a/mm/filemap.c b/mm/filemap.c > >> index f444684db9f2..74046a3a0ff5 100644 > >> --- a/mm/filemap.c > >> +++ b/mm/filemap.c > >> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, > >> > >> ref_count++; > >> do_set_pte(vmf, page, addr); > >> - update_mmu_cache(vma, addr, vmf->pte); > >> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); > >> > >> /* Restore the vmf->pte */ > >> diff --git a/mm/memory.c b/mm/memory.c > >> index 7a04a1130ec1..3754b2ef166a 100644 > >> --- a/mm/memory.c > >> +++ b/mm/memory.c > >> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) > >> } > >> #endif > 
>> > >> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) > >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, > >> + unsigned long addr, pte_t *pte, > >> + unsigned long start, unsigned int nr) > >> { > >> struct vm_area_struct *vma = vmf->vma; > >> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); > >> bool write = vmf->flags & FAULT_FLAG_WRITE; > >> + bool cow = write && !(vma->vm_flags & VM_SHARED); > > > > Why don't use is_cow_mapping()? > > Is there anything messed up with VM_MAYWRITE? > My understanding is it's not related with the mapping. It's related with > what operation triggers fault here. Say, if it's a writable mapping, and > if the read operation triggers fault here, no cow or maybe_mkwrite() needed > here. Thanks. Sorry, I didn't describe the thing properly. It makes sense for the relationship with the write/read fault. I'm just wondering if "!(vma->vm_flags & VM_SHARED)" is enough to determine the COW page? And, I also found it in do_fault(). Like, copy_present_pte() use is_cow_mapping() for COW page and "vm_flags & VM_SHARED" for shared mapping. So, I looked up the commit that introduced the is_cow_mapping(), 67121172f9753 ("Allow arbitrary read-only shared pfn-remapping too"). Here is the commit message: " The VM layer (for historical reasons) turns a read-only shared mmap into a private-like mapping with the VM_MAYWRITE bit clear. Thus checking just VM_SHARED isn't actually sufficient. So use a trivial helper function for the cases where we wanted to inquire if a mapping was COW-like or not. " hmm, but it is v2.6.15. So is "vm_flags & VM_SHARED" enough to check the COW page now? Thanks, Chih-En Lin
On 2/3/2023 10:30 PM, Chih-En Lin wrote: > On Fri, Feb 03, 2023 at 09:38:15PM +0800, Yin, Fengwei wrote: >> >> >> On 2/3/2023 9:32 PM, Chih-En Lin wrote: >>> On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote: >>>> do_set_pte_range() allows to setup page table entries for a >>>> specific range. It calls folio_add_file_rmap_range() to take >>>> advantage of batched rmap update for large folio. >>>> >>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> >>>> --- >>>> include/linux/mm.h | 3 +++ >>>> mm/filemap.c | 1 - >>>> mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- >>>> 3 files changed, 42 insertions(+), 21 deletions(-) >>>> >>>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>>> index d6f8f41514cc..93192f04b276 100644 >>>> --- a/include/linux/mm.h >>>> +++ b/include/linux/mm.h >>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) >>>> >>>> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); >>>> void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); >>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>>> + unsigned long addr, pte_t *pte, >>>> + unsigned long start, unsigned int nr); >>>> >>>> vm_fault_t finish_fault(struct vm_fault *vmf); >>>> vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); >>>> diff --git a/mm/filemap.c b/mm/filemap.c >>>> index f444684db9f2..74046a3a0ff5 100644 >>>> --- a/mm/filemap.c >>>> +++ b/mm/filemap.c >>>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, >>>> >>>> ref_count++; >>>> do_set_pte(vmf, page, addr); >>>> - update_mmu_cache(vma, addr, vmf->pte); >>>> } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); >>>> >>>> /* Restore the vmf->pte */ >>>> diff --git a/mm/memory.c b/mm/memory.c >>>> index 7a04a1130ec1..3754b2ef166a 100644 >>>> --- a/mm/memory.c >>>> +++ b/mm/memory.c >>>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault 
*vmf, struct page *page) >>>> } >>>> #endif >>>> >>>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) >>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, >>>> + unsigned long addr, pte_t *pte, >>>> + unsigned long start, unsigned int nr) >>>> { >>>> struct vm_area_struct *vma = vmf->vma; >>>> bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); >>>> bool write = vmf->flags & FAULT_FLAG_WRITE; >>>> + bool cow = write && !(vma->vm_flags & VM_SHARED); >>> >>> Why don't use is_cow_mapping()? >>> Is there anything messed up with VM_MAYWRITE? >> My understanding is it's not related with the mapping. It's related with >> what operation triggers fault here. Say, if it's a writable mapping, and >> if the read operation triggers fault here, no cow or maybe_mkwrite() needed >> here. Thanks. > > Sorry, I didn't describe the thing properly. > It makes sense for the relationship with the write/read fault. > I'm just wondering if "!(vma->vm_flags & VM_SHARED)" is enough to determine > the COW page? And, I also found it in do_fault(). > > Like, copy_present_pte() use is_cow_mapping() for COW page and > "vm_flags & VM_SHARED" for shared mapping. > > So, I looked up the commit that introduced the is_cow_mapping(), > 67121172f9753 ("Allow arbitrary read-only shared pfn-remapping too"). > > Here is the commit message: > " > The VM layer (for historical reasons) turns a read-only shared mmap into > a private-like mapping with the VM_MAYWRITE bit clear. Thus checking > just VM_SHARED isn't actually sufficient. > > So use a trivial helper function for the cases where we wanted to inquire > if a mapping was COW-like or not. > " > > hmm, but it is v2.6.15. > So is "vm_flags & VM_SHARED" enough to check the COW page now? Thanks for the detail info here. Yes. VM_MAYWRITE bit of vma->vm_flags needs be checked for COW also. 
In the page fault path, the VM_MAYWRITE bit is checked in sanitize_fault_flags(): /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; which bails out early if it's a write fault and VM_MAYWRITE is not set. My understanding is that sanitize_fault_flags() is called before do_set_pte()/do_set_pte_range() is reached. Regards Yin, Fengwei > > Thanks, > Chih-En Lin
diff --git a/include/linux/mm.h b/include/linux/mm.h index d6f8f41514cc..93192f04b276 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, + unsigned long addr, pte_t *pte, + unsigned long start, unsigned int nr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); diff --git a/mm/filemap.c b/mm/filemap.c index f444684db9f2..74046a3a0ff5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, ref_count++; do_set_pte(vmf, page, addr); - update_mmu_cache(vma, addr, vmf->pte); } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); /* Restore the vmf->pte */ diff --git a/mm/memory.c b/mm/memory.c index 7a04a1130ec1..3754b2ef166a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio, + unsigned long addr, pte_t *pte, + unsigned long start, unsigned int nr) { struct vm_area_struct *vma = vmf->vma; bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); bool write = vmf->flags & FAULT_FLAG_WRITE; + bool cow = write && !(vma->vm_flags & VM_SHARED); bool prefault = vmf->address != addr; + struct page *page = folio_page(folio, start); pte_t entry; - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); + if (!cow) { + folio_add_file_rmap_range(folio, start, nr, vma, false); + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + } - if (prefault && arch_wants_old_prefaulted_pte()) - entry = 
pte_mkold(entry); - else - entry = pte_sw_mkyoung(entry); + do { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); - if (write) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (unlikely(uffd_wp)) - entry = pte_mkuffd_wp(entry); - /* copy-on-write page */ - if (write && !(vma->vm_flags & VM_SHARED)) { + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); + + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (unlikely(uffd_wp)) + entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, addr, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, addr, pte); + } while (pte++, page++, addr += PAGE_SIZE, --nr > 0); +} + +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) +{ + struct folio *folio = page_folio(page); + struct vm_area_struct *vma = vmf->vma; + bool cow = (vmf->flags & FAULT_FLAG_WRITE) && + !(vma->vm_flags & VM_SHARED); + + if (cow) { inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); - } else { - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, vma, false); } - set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + + do_set_pte_range(vmf, folio, addr, vmf->pte, + folio_page_idx(folio, page), 1); } static bool vmf_pte_changed(struct vm_fault *vmf) @@ -4361,9 +4383,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (likely(!vmf_pte_changed(vmf))) { do_set_pte(vmf, page, vmf->address); - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - ret = 0; } else { update_mmu_tlb(vma, vmf->address, vmf->pte);
do_set_pte_range() allows setting up page table entries for a specific range. It calls folio_add_file_rmap_range() to take advantage of batched rmap update for large folio. Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> --- include/linux/mm.h | 3 +++ mm/filemap.c | 1 - mm/memory.c | 59 ++++++++++++++++++++++++++++++---------------- 3 files changed, 42 insertions(+), 21 deletions(-)