Message ID | 20240911065600.1002644-2-dev.jain@arm.com (mailing list archive)
---|---
State | New
Series | Do not shatter hugezeropage on wp-fault
On 11.09.24 08:55, Dev Jain wrote: > In preparation for the second patch, abstract away the THP allocation > logic present in the create_huge_pmd() path, which corresponds to the > faulting case when no page is present. > > There should be no functional change as a result of applying > this patch. > Hi, > Signed-off-by: Dev Jain <dev.jain@arm.com> > --- > mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ > 1 file changed, 67 insertions(+), 43 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 67c86a5d64a6..b96a1ff2bf40 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > } > EXPORT_SYMBOL_GPL(thp_get_unmapped_area); > > -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > - struct page *page, gfp_t gfp) > +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, > + unsigned long haddr, unsigned long addr) I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then it's more consistent with vma_alloc_folio(). Also, likely we should just only pass in "addr" and calculate "haddr" ourselves, it's cheap and reduces the number of function parameters. > { > - struct vm_area_struct *vma = vmf->vma; > - struct folio *folio = page_folio(page); > - pgtable_t pgtable; > - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > - vm_fault_t ret = 0; > + const int order = HPAGE_PMD_ORDER; > + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); > > - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > + if (unlikely(!folio)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + goto out; > + } > > + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > folio_put(folio); > count_vm_event(THP_FAULT_FALLBACK); > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > - return VM_FAULT_FALLBACK; > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + goto out; > } > folio_throttle_swaprate(folio, gfp); > > - pgtable = pte_alloc_one(vma->vm_mm); > - if (unlikely(!pgtable)) { > - ret = VM_FAULT_OOM; > - goto release; > - } > - > - folio_zero_user(folio, vmf->address); > + folio_zero_user(folio, addr); > /* > * The memory barrier inside __folio_mark_uptodate makes sure that > * folio_zero_user writes become visible before the set_pmd_at() > * write. > */ > __folio_mark_uptodate(folio); > +out: > + return folio; > +} > + > +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) > +{ > + count_vm_event(THP_FAULT_ALLOC); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > +} Why isn't that moved into map_pmd_thp() Note that in this patch you do: map_pmd_thp(folio, vmf, vma, haddr); spin_unlock(vmf->ptl); __pmd_thp_fault_success_stats(vma); But in patch #2 map_pmd_thp(folio, vmf, vma, haddr); __pmd_thp_fault_success_stats(vma); goto unlock; release: folio_put(folio); unlock: spin_unlock(vmf->ptl); Please make that consistent, meaning: 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to have the separated out. 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in the caller. 
In the caller is likely easiest. Adjusting the counters should be cheap, if not we could revisit this later with real data. > + > +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, > + struct vm_area_struct *vma, unsigned long haddr) > +{ > + pmd_t entry; > + > + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); > + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); > + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); > + folio_add_lru_vma(folio, vma); > + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); > + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); It's quite weird to see a mixture of haddr and vmf->address, and likely this mixture is wrong or not not required. Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how passing in the unaligned address would do the right thing. But maybe arc also doesn't trigger that code path ... who knows :) Staring at some other update_mmu_cache_pmd() users, it's quite inconsistent. Primarily only do_huge_pmd_numa_page() and __do_huge_pmd_anonymous_page() use the unaligned address. The others seem to use the aligned address ... as one would expect when modifying a PMD. I suggest to change this function to *not* pass in the vmf, and rename it to something like: static void folio_map_anon_pmd(struct folio *folio, struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) Then use haddr also to do the update_mmu_cache_pmd(). > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); > + mm_inc_nr_ptes(vma->vm_mm); > +} > + > +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) > +{ > + struct vm_area_struct *vma = vmf->vma; > + struct folio *folio; > + pgtable_t pgtable; > + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > + vm_fault_t ret = 0; > + gfp_t gfp = vma_thp_gfp_mask(vma); Nit: While at it, try to use reverse christmas-tree where possible, makes things more reasible. You could make haddr const. struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; gfp_t gfp = vma_thp_gfp_mask(vma); struct folio *folio; vm_fault_t ret = 0; ... > + > + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); > + if (unlikely(!folio)) { > + ret = VM_FAULT_FALLBACK; > + goto release; > + } > + > + pgtable = pte_alloc_one(vma->vm_mm); > + if (unlikely(!pgtable)) { > + ret = VM_FAULT_OOM; > + goto release; > + } > > vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); > + Nit Unrelated change. > if (unlikely(!pmd_none(*vmf->pmd))) { > goto unlock_release;
On 11.09.24 11:27, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > Hi, > >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) > > I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then > it's more consistent with vma_alloc_folio(). > > Also, likely we should just only pass in "addr" and calculate "haddr" > ourselves, it's cheap and reduces the number of function parameters. > >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); >> >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; >> } >> folio_throttle_swaprate(folio, gfp); >> >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. 
>> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> + >> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >> +{ >> + count_vm_event(THP_FAULT_ALLOC); >> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >> +} > > Why isn't that moved into map_pmd_thp() > > Note that in this patch you do: > > map_pmd_thp(folio, vmf, vma, haddr); > spin_unlock(vmf->ptl); > __pmd_thp_fault_success_stats(vma); > > But in patch #2 > > map_pmd_thp(folio, vmf, vma, haddr); > __pmd_thp_fault_success_stats(vma); > goto unlock; > release: > folio_put(folio); > unlock: > spin_unlock(vmf->ptl); > > Please make that consistent, meaning: > > 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to > have the separated out. > > 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in > the caller. In the caller is likely easiest. Adjusting the counters > should be cheap, if not we could revisit this later with real data. > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and likely > this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how > passing in the unaligned address would do the right thing. But maybe arc > also doesn't trigger that code path ... who knows :) > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying a > PMD. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) ... or better "map_anon_folio_pmd" so it better matches vma_alloc_folio_pmd() suggested above. Could also do vma_map_anon_folio_pmd() :)
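Taken together, the comments above point at a mapping helper of roughly the shape sketched below. This is only an illustration of the suggested direction: the name map_anon_folio_pmd(), the pmd_t * parameter, the consistent use of haddr for update_mmu_cache_pmd() and the folded-in success counters all come from the review remarks, not from the code that was eventually merged.

/* Sketch of the suggested helper; signature follows the review comments. */
static void map_anon_folio_pmd(struct folio *folio, struct vm_area_struct *vma,
			       pmd_t *pmd, unsigned long haddr)
{
	pmd_t entry;

	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
	/* Use the PMD-aligned address here as well, as suggested above. */
	update_mmu_cache_pmd(vma, haddr, pmd);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	mm_inc_nr_ptes(vma->vm_mm);
	/* Success accounting inlined instead of a separate stats helper. */
	count_vm_event(THP_FAULT_ALLOC);
	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

With this shape, the caller only has to take care of pgtable_trans_huge_deposit() and of dropping the PTL after the call, which keeps patch 1 and patch 2 consistent.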
On 2024/9/11 14:55, Dev Jain wrote: > In preparation for the second patch, abstract away the THP allocation > logic present in the create_huge_pmd() path, which corresponds to the > faulting case when no page is present. > > There should be no functional change as a result of applying > this patch. > > Signed-off-by: Dev Jain <dev.jain@arm.com> > --- > mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ > 1 file changed, 67 insertions(+), 43 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 67c86a5d64a6..b96a1ff2bf40 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > } > EXPORT_SYMBOL_GPL(thp_get_unmapped_area); > > -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > - struct page *page, gfp_t gfp) > +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, > + unsigned long haddr, unsigned long addr) > { > - struct vm_area_struct *vma = vmf->vma; > - struct folio *folio = page_folio(page); > - pgtable_t pgtable; > - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > - vm_fault_t ret = 0; > + const int order = HPAGE_PMD_ORDER; Maybe move vma_thp_gfp_mask() into this function too. > + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); > > - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > + if (unlikely(!folio)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + goto out; > + } > > + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > folio_put(folio); > count_vm_event(THP_FAULT_FALLBACK); > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > - return VM_FAULT_FALLBACK; > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + goto out; We need to return NULL here as folio not set to null, > } > folio_throttle_swaprate(folio, gfp); > > - pgtable = pte_alloc_one(vma->vm_mm); > - if (unlikely(!pgtable)) { > - ret = VM_FAULT_OOM; > - goto release; > - } > - > - folio_zero_user(folio, vmf->address); > + folio_zero_user(folio, addr); > /* > * The memory barrier inside __folio_mark_uptodate makes sure that > * folio_zero_user writes become visible before the set_pmd_at() > * write. > */ > __folio_mark_uptodate(folio); > +out: > + return folio; > +} > +
On 9/11/24 14:57, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > Hi, > >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file >> *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >> vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) > > I suggest calling this something like "vma_alloc_anon_folio_pmd()"? > Then it's more consistent with vma_alloc_folio(). > > Also, likely we should just only pass in "addr" and calculate "haddr" > ourselves, it's cheap and reduces the number of function parameters. Makes sense, thanks. > >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >> true); >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; >> } >> folio_throttle_swaprate(folio, gfp); >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. 
>> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> + >> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >> +{ >> + count_vm_event(THP_FAULT_ALLOC); >> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >> +} > > Why isn't that moved into map_pmd_thp() > > Note that in this patch you do: > > map_pmd_thp(folio, vmf, vma, haddr); > spin_unlock(vmf->ptl); > __pmd_thp_fault_success_stats(vma); > > But in patch #2 > > map_pmd_thp(folio, vmf, vma, haddr); > __pmd_thp_fault_success_stats(vma); > goto unlock; > release: > folio_put(folio); > unlock: > spin_unlock(vmf->ptl); Yes, while writing it I knew about this inconsistency, but I wanted to reduce latency by dropping the lock before. But in do_huge_zero_wp_pmd(), I couldn't figure out a way to call the stat function after dropping the lock, without I guess, introducing too many labels and goto jumps and the like. In the current code, the lock gets dropped first. > > Please make that consistent, meaning: > > 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need > to have the separated out. > > 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in > the caller. In the caller is likely easiest. Adjusting the counters > should be cheap, if not we could revisit this later with real data. I will then call it in map_pmd_thp(), that is cleaner...I did find occurrences of these stat computations after taking the lock, for example, in do_swap_page(): count_vm_event(PGMAJFAULT) so I guess it should be alright. > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and > likely this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see > how passing in the unaligned address would do the right thing. But > maybe arc also doesn't trigger that code path ... who knows :) > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying > a PMD. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > Then use haddr also to do the update_mmu_cache_pmd(). The code I changed, already was passing vmf->address to update_mmu_cache_pmd(). I did not change any of the haddr and vmf->address semantics, so really can't comment on this. I agree with the name change; vmf will be required for set_pmd_at(vmf->pmd), so I should just pass pmd? 
> >> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> + mm_inc_nr_ptes(vma->vm_mm); >> +} >> + >> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >> +{ >> + struct vm_area_struct *vma = vmf->vma; >> + struct folio *folio; >> + pgtable_t pgtable; >> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> + vm_fault_t ret = 0; >> + gfp_t gfp = vma_thp_gfp_mask(vma); > > Nit: While at it, try to use reverse christmas-tree where possible, > makes things more reasible. You could make haddr const. > > struct vm_area_struct *vma = vmf->vma; > unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > gfp_t gfp = vma_thp_gfp_mask(vma); > struct folio *folio; > vm_fault_t ret = 0; Okay. > ... > >> + >> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >> + if (unlikely(!folio)) { >> + ret = VM_FAULT_FALLBACK; >> + goto release; >> + } >> + >> + pgtable = pte_alloc_one(vma->vm_mm); >> + if (unlikely(!pgtable)) { >> + ret = VM_FAULT_OOM; >> + goto release; >> + } >> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >> + > > Nit Unrelated change. Are you asking me to align this line with the below line? > >> if (unlikely(!pmd_none(*vmf->pmd))) { >> goto unlock_release; > >
On 9/11/24 14:59, David Hildenbrand wrote: > On 11.09.24 11:27, David Hildenbrand wrote: >> On 11.09.24 08:55, Dev Jain wrote: >>> In preparation for the second patch, abstract away the THP allocation >>> logic present in the create_huge_pmd() path, which corresponds to the >>> faulting case when no page is present. >>> >>> There should be no functional change as a result of applying >>> this patch. >>> >> >> Hi, >> >>> Signed-off-by: Dev Jain <dev.jain@arm.com> >>> --- >>> mm/huge_memory.c | 110 >>> +++++++++++++++++++++++++++++------------------ >>> 1 file changed, 67 insertions(+), 43 deletions(-) >>> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 67c86a5d64a6..b96a1ff2bf40 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct >>> file *filp, unsigned long addr, >>> } >>> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >>> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault >>> *vmf, >>> - struct page *page, gfp_t gfp) >>> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >>> vm_area_struct *vma, >>> + unsigned long haddr, unsigned long addr) >> >> I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then >> it's more consistent with vma_alloc_folio(). >> >> Also, likely we should just only pass in "addr" and calculate "haddr" >> ourselves, it's cheap and reduces the number of function parameters. >> >>> { >>> - struct vm_area_struct *vma = vmf->vma; >>> - struct folio *folio = page_folio(page); >>> - pgtable_t pgtable; >>> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> - vm_fault_t ret = 0; >>> + const int order = HPAGE_PMD_ORDER; >>> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >>> true); >>> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >>> + if (unlikely(!folio)) { >>> + count_vm_event(THP_FAULT_FALLBACK); >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >>> + goto out; >>> + } >>> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >>> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >>> folio_put(folio); >>> count_vm_event(THP_FAULT_FALLBACK); >>> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >>> - count_mthp_stat(HPAGE_PMD_ORDER, >>> MTHP_STAT_ANON_FAULT_FALLBACK); >>> - count_mthp_stat(HPAGE_PMD_ORDER, >>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> - return VM_FAULT_FALLBACK; >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> + goto out; >>> } >>> folio_throttle_swaprate(folio, gfp); >>> - pgtable = pte_alloc_one(vma->vm_mm); >>> - if (unlikely(!pgtable)) { >>> - ret = VM_FAULT_OOM; >>> - goto release; >>> - } >>> - >>> - folio_zero_user(folio, vmf->address); >>> + folio_zero_user(folio, addr); >>> /* >>> * The memory barrier inside __folio_mark_uptodate makes sure >>> that >>> * folio_zero_user writes become visible before the set_pmd_at() >>> * write. 
>>> */ >>> __folio_mark_uptodate(folio); >>> +out: >>> + return folio; >>> +} >>> + >>> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >>> +{ >>> + count_vm_event(THP_FAULT_ALLOC); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >>> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >>> +} >> >> Why isn't that moved into map_pmd_thp() >> >> Note that in this patch you do: >> >> map_pmd_thp(folio, vmf, vma, haddr); >> spin_unlock(vmf->ptl); >> __pmd_thp_fault_success_stats(vma); >> >> But in patch #2 >> >> map_pmd_thp(folio, vmf, vma, haddr); >> __pmd_thp_fault_success_stats(vma); >> goto unlock; >> release: >> folio_put(folio); >> unlock: >> spin_unlock(vmf->ptl); >> >> Please make that consistent, meaning: >> >> 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to >> have the separated out. >> >> 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in >> the caller. In the caller is likely easiest. Adjusting the counters >> should be cheap, if not we could revisit this later with real data. >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and likely >> this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how >> passing in the unaligned address would do the right thing. But maybe arc >> also doesn't trigger that code path ... who knows :) >> >> >> Staring at some other update_mmu_cache_pmd() users, it's quite >> inconsistent. Primarily only do_huge_pmd_numa_page() and >> __do_huge_pmd_anonymous_page() use the unaligned address. The others >> seem to use the aligned address ... as one would expect when modifying a >> PMD. >> >> >> I suggest to change this function to *not* pass in the vmf, and rename >> it to something like: >> >> static void folio_map_anon_pmd(struct folio *folio, struct >> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > ... or better "map_anon_folio_pmd" so it better matches > vma_alloc_folio_pmd() suggested above. I'll vote for this. > >
On 9/11/24 16:22, Kefeng Wang wrote: > > > On 2024/9/11 14:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file >> *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >> vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; > > Maybe move vma_thp_gfp_mask() into this function too. That's better, thanks. > >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >> true); >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; > > We need to return NULL here as folio not set to null, My bad for assuming that folio_put() also sets folio to NULL. I read through the code path for that and I guess it does not. Thanks. > >> } >> folio_throttle_swaprate(folio, gfp); >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. >> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> +
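Combining the feedback acknowledged so far (the rename, passing only addr, moving vma_thp_gfp_mask() into the helper, and returning NULL after the failed memcg charge), the allocation helper could end up looking roughly like the sketch below; it is illustrative rather than the code that was finally applied.

static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
					      unsigned long addr)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	const int order = HPAGE_PMD_ORDER;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;

	folio = vma_alloc_folio(gfp, order, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		/* folio_put() does not NULL the local pointer; return NULL explicitly. */
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);

	folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	return folio;
}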
[...] >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and >> likely this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >> how passing in the unaligned address would do the right thing. But >> maybe arc also doesn't trigger that code path ... who knows :) >> >> >> Staring at some other update_mmu_cache_pmd() users, it's quite >> inconsistent. Primarily only do_huge_pmd_numa_page() and >> __do_huge_pmd_anonymous_page() use the unaligned address. The others >> seem to use the aligned address ... as one would expect when modifying >> a PMD. >> >> >> I suggest to change this function to *not* pass in the vmf, and rename >> it to something like: >> >> static void folio_map_anon_pmd(struct folio *folio, struct >> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) >> >> Then use haddr also to do the update_mmu_cache_pmd(). > > The code I changed, already was passing vmf->address to > update_mmu_cache_pmd(). > I did not change any of the haddr and vmf->address semantics, so really > can't comment > on this. I'm not saying that you changed it. I say, "we touch it we fix it" :). We could do that in a separate cleanup patch upfront. > I agree with the name change; vmf will be required for > set_pmd_at(vmf->pmd), so I should > just pass pmd? >> >>> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >>> + mm_inc_nr_ptes(vma->vm_mm); >>> +} >>> + >>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >>> +{ >>> + struct vm_area_struct *vma = vmf->vma; >>> + struct folio *folio; >>> + pgtable_t pgtable; >>> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> + vm_fault_t ret = 0; >>> + gfp_t gfp = vma_thp_gfp_mask(vma); >> >> Nit: While at it, try to use reverse christmas-tree where possible, >> makes things more reasible. You could make haddr const. >> >> struct vm_area_struct *vma = vmf->vma; >> unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> gfp_t gfp = vma_thp_gfp_mask(vma); >> struct folio *folio; >> vm_fault_t ret = 0; > > Okay. >> ... >> >>> + >>> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >>> + if (unlikely(!folio)) { >>> + ret = VM_FAULT_FALLBACK; >>> + goto release; >>> + } >>> + >>> + pgtable = pte_alloc_one(vma->vm_mm); >>> + if (unlikely(!pgtable)) { >>> + ret = VM_FAULT_OOM; >>> + goto release; >>> + } >>> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >>> + >> >> Nit Unrelated change. > > Are you asking me to align this line with the below line? No, just pointing out that you add a newline in a code section you don't really touch. Sometimes that's deliberate, sometimes not (e.g., going back and forth while reworking the code and accidentally leaving that in). We try to avoid such unrelated changes because it adds noise to the patches. If it was deliberate, feel free to leave it in.
On 9/11/24 14:57, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > [...] > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and > likely this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see > how passing in the unaligned address would do the right thing. But > maybe arc also doesn't trigger that code path ... who knows :) If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() calls update_mmu_cache_range() which is already expecting an unaligned address? But... > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying > a PMD. Looking at riscv: update_mmu_cache_pmd()->update_mmu_cache()->update_mmu_cache_range(). The argument getting passed to local_flush_tlb_page() seems like, should expect an aligned address. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > Then use haddr also to do the update_mmu_cache_pmd(). > >> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> + mm_inc_nr_ptes(vma->vm_mm); >> +} >> + >> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >> +{ >> + struct vm_area_struct *vma = vmf->vma; >> + struct folio *folio; >> + pgtable_t pgtable; >> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> + vm_fault_t ret = 0; >> + gfp_t gfp = vma_thp_gfp_mask(vma); > > Nit: While at it, try to use reverse christmas-tree where possible, > makes things more reasible. You could make haddr const. > > struct vm_area_struct *vma = vmf->vma; > unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > gfp_t gfp = vma_thp_gfp_mask(vma); > struct folio *folio; > vm_fault_t ret = 0; > ... > >> + >> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >> + if (unlikely(!folio)) { >> + ret = VM_FAULT_FALLBACK; >> + goto release; >> + } >> + >> + pgtable = pte_alloc_one(vma->vm_mm); >> + if (unlikely(!pgtable)) { >> + ret = VM_FAULT_OOM; >> + goto release; >> + } >> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >> + > > Nit Unrelated change. > >> if (unlikely(!pmd_none(*vmf->pmd))) { >> goto unlock_release; > >
On 9/11/24 18:05, David Hildenbrand wrote: > [...] > >>> >>>> + >>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>> + struct vm_area_struct *vma, unsigned long haddr) >>>> +{ >>>> + pmd_t entry; >>>> + >>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>> + folio_add_lru_vma(folio, vma); >>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>> >>> It's quite weird to see a mixture of haddr and vmf->address, and >>> likely this mixture is wrong or not not required. >>> >>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>> how passing in the unaligned address would do the right thing. But >>> maybe arc also doesn't trigger that code path ... who knows :) >>> >>> >>> Staring at some other update_mmu_cache_pmd() users, it's quite >>> inconsistent. Primarily only do_huge_pmd_numa_page() and >>> __do_huge_pmd_anonymous_page() use the unaligned address. The others >>> seem to use the aligned address ... as one would expect when modifying >>> a PMD. >>> >>> >>> I suggest to change this function to *not* pass in the vmf, and rename >>> it to something like: >>> >>> static void folio_map_anon_pmd(struct folio *folio, struct >>> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) >>> >>> Then use haddr also to do the update_mmu_cache_pmd(). >> >> The code I changed, already was passing vmf->address to >> update_mmu_cache_pmd(). >> I did not change any of the haddr and vmf->address semantics, so really >> can't comment >> on this. > > I'm not saying that you changed it. I say, "we touch it we fix it" :). > We could do that in a separate cleanup patch upfront. Sure. > >> I agree with the name change; vmf will be required for >> set_pmd_at(vmf->pmd), so I should >> just pass pmd? >>> >>>> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >>>> + mm_inc_nr_ptes(vma->vm_mm); >>>> +} >>>> + >>>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >>>> +{ >>>> + struct vm_area_struct *vma = vmf->vma; >>>> + struct folio *folio; >>>> + pgtable_t pgtable; >>>> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>>> + vm_fault_t ret = 0; >>>> + gfp_t gfp = vma_thp_gfp_mask(vma); >>> >>> Nit: While at it, try to use reverse christmas-tree where possible, >>> makes things more reasible. You could make haddr const. >>> >>> struct vm_area_struct *vma = vmf->vma; >>> unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> gfp_t gfp = vma_thp_gfp_mask(vma); >>> struct folio *folio; >>> vm_fault_t ret = 0; >> >> Okay. >>> ... >>> >>>> + >>>> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >>>> + if (unlikely(!folio)) { >>>> + ret = VM_FAULT_FALLBACK; >>>> + goto release; >>>> + } >>>> + >>>> + pgtable = pte_alloc_one(vma->vm_mm); >>>> + if (unlikely(!pgtable)) { >>>> + ret = VM_FAULT_OOM; >>>> + goto release; >>>> + } >>>> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >>>> + >>> >>> Nit Unrelated change. >> >> Are you asking me to align this line with the below line? > > No, just pointing out that you add a newline in a code section you > don't really touch. Sometimes that's deliberate, sometimes not (e.g., > going back and forth while reworking the code and accidentally leaving > that in). We try to avoid such unrelated changes because it adds noise > to the patches. > > If it was deliberate, feel free to leave it in. 
Ah, I accidentally inserted that new line. Will revert that.
On 11.09.24 14:53, Dev Jain wrote: > > On 9/11/24 14:57, David Hildenbrand wrote: >> On 11.09.24 08:55, Dev Jain wrote: >>> In preparation for the second patch, abstract away the THP allocation >>> logic present in the create_huge_pmd() path, which corresponds to the >>> faulting case when no page is present. >>> >>> There should be no functional change as a result of applying >>> this patch. >>> >> >> [...] >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and >> likely this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >> how passing in the unaligned address would do the right thing. But >> maybe arc also doesn't trigger that code path ... who knows :) > > If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() > calls update_mmu_cache_range() which is already expecting an unaligned > address? But... So update_mmu_cache_pmd() calls update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); But update_mmu_cache_range() only aligns it *to page boundary*: unsigned long vaddr = vaddr_unaligned & PAGE_MASK; We obtain the correct hugepage-aligned physical address from the PTE phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; Then, we look at the offset in our folio unsigned long offset = offset_in_folio(folio, paddr); And adjust both vaddr and paddr paddr -= offset; vaddr -= offset; To then use that combination with __inv_icache_pages(paddr, vaddr, nr); If I am not wrong, getting a non-hugepage aligned vaddr messes up things here. But only regarding the icache I think.
On 9/11/24 18:30, David Hildenbrand wrote: > On 11.09.24 14:53, Dev Jain wrote: >> >> On 9/11/24 14:57, David Hildenbrand wrote: >>> On 11.09.24 08:55, Dev Jain wrote: >>>> In preparation for the second patch, abstract away the THP allocation >>>> logic present in the create_huge_pmd() path, which corresponds to the >>>> faulting case when no page is present. >>>> >>>> There should be no functional change as a result of applying >>>> this patch. >>>> >>> >>> [...] >>> >>>> + >>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>> + struct vm_area_struct *vma, unsigned long haddr) >>>> +{ >>>> + pmd_t entry; >>>> + >>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>> + folio_add_lru_vma(folio, vma); >>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>> >>> It's quite weird to see a mixture of haddr and vmf->address, and >>> likely this mixture is wrong or not not required. >>> >>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>> how passing in the unaligned address would do the right thing. But >>> maybe arc also doesn't trigger that code path ... who knows :) >> >> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >> calls update_mmu_cache_range() which is already expecting an unaligned >> address? But... > > So update_mmu_cache_pmd() calls > > update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); > > But update_mmu_cache_range() only aligns it *to page boundary*: > > unsigned long vaddr = vaddr_unaligned & PAGE_MASK; Ah, totally missed that it was PAGE_MASK. Thanks. > > We obtain the correct hugepage-aligned physical address from the PTE > > phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; > > Then, we look at the offset in our folio > > unsigned long offset = offset_in_folio(folio, paddr); > > And adjust both vaddr and paddr > > paddr -= offset; > vaddr -= offset; > > To then use that combination with > > __inv_icache_pages(paddr, vaddr, nr); > > If I am not wrong, getting a non-hugepage aligned vaddr messes up > things here. But only regarding the icache I think. Looks like it... > >
On 11.09.24 15:05, Dev Jain wrote: > > On 9/11/24 18:30, David Hildenbrand wrote: >> On 11.09.24 14:53, Dev Jain wrote: >>> >>> On 9/11/24 14:57, David Hildenbrand wrote: >>>> On 11.09.24 08:55, Dev Jain wrote: >>>>> In preparation for the second patch, abstract away the THP allocation >>>>> logic present in the create_huge_pmd() path, which corresponds to the >>>>> faulting case when no page is present. >>>>> >>>>> There should be no functional change as a result of applying >>>>> this patch. >>>>> >>>> >>>> [...] >>>> >>>>> + >>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>>> + struct vm_area_struct *vma, unsigned long haddr) >>>>> +{ >>>>> + pmd_t entry; >>>>> + >>>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>>> + folio_add_lru_vma(folio, vma); >>>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>>> >>>> It's quite weird to see a mixture of haddr and vmf->address, and >>>> likely this mixture is wrong or not not required. >>>> >>>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>>> how passing in the unaligned address would do the right thing. But >>>> maybe arc also doesn't trigger that code path ... who knows :) >>> >>> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >>> calls update_mmu_cache_range() which is already expecting an unaligned >>> address? But... >> >> So update_mmu_cache_pmd() calls >> >> update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); >> >> But update_mmu_cache_range() only aligns it *to page boundary*: >> >> unsigned long vaddr = vaddr_unaligned & PAGE_MASK; > > Ah, totally missed that it was PAGE_MASK. Thanks. >> >> We obtain the correct hugepage-aligned physical address from the PTE >> >> phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; >> >> Then, we look at the offset in our folio >> >> unsigned long offset = offset_in_folio(folio, paddr); >> >> And adjust both vaddr and paddr >> >> paddr -= offset; >> vaddr -= offset; >> >> To then use that combination with >> >> __inv_icache_pages(paddr, vaddr, nr); >> >> If I am not wrong, getting a non-hugepage aligned vaddr messes up >> things here. But only regarding the icache I think. > > Looks like it... As we are adding a fresh page where there previously wasn't anything mapped (no icache invaldiation required?), and because most anon mappings are not executable, maybe that's why nobody notices so far.
On 9/11/24 18:44, David Hildenbrand wrote: > On 11.09.24 15:05, Dev Jain wrote: >> >> On 9/11/24 18:30, David Hildenbrand wrote: >>> On 11.09.24 14:53, Dev Jain wrote: >>>> >>>> On 9/11/24 14:57, David Hildenbrand wrote: >>>>> On 11.09.24 08:55, Dev Jain wrote: >>>>>> In preparation for the second patch, abstract away the THP >>>>>> allocation >>>>>> logic present in the create_huge_pmd() path, which corresponds to >>>>>> the >>>>>> faulting case when no page is present. >>>>>> >>>>>> There should be no functional change as a result of applying >>>>>> this patch. >>>>>> >>>>> >>>>> [...] >>>>> >>>>>> + >>>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>>>> + struct vm_area_struct *vma, unsigned long haddr) >>>>>> +{ >>>>>> + pmd_t entry; >>>>>> + >>>>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>>>> + folio_add_lru_vma(folio, vma); >>>>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>>>> >>>>> It's quite weird to see a mixture of haddr and vmf->address, and >>>>> likely this mixture is wrong or not not required. >>>>> >>>>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>>>> how passing in the unaligned address would do the right thing. But >>>>> maybe arc also doesn't trigger that code path ... who knows :) >>>> >>>> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >>>> calls update_mmu_cache_range() which is already expecting an unaligned >>>> address? But... >>> >>> So update_mmu_cache_pmd() calls >>> >>> update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); >>> >>> But update_mmu_cache_range() only aligns it *to page boundary*: >>> >>> unsigned long vaddr = vaddr_unaligned & PAGE_MASK; >> >> Ah, totally missed that it was PAGE_MASK. Thanks. >>> >>> We obtain the correct hugepage-aligned physical address from the PTE >>> >>> phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; >>> >>> Then, we look at the offset in our folio >>> >>> unsigned long offset = offset_in_folio(folio, paddr); >>> >>> And adjust both vaddr and paddr >>> >>> paddr -= offset; >>> vaddr -= offset; >>> >>> To then use that combination with >>> >>> __inv_icache_pages(paddr, vaddr, nr); >>> >>> If I am not wrong, getting a non-hugepage aligned vaddr messes up >>> things here. But only regarding the icache I think. >> >> Looks like it... > > As we are adding a fresh page where there previously wasn't anything > mapped (no icache invaldiation required?), and because most anon > mappings are not executable, maybe that's why nobody notices so far. Thanks for the observation!
Hi Dev,

kernel test robot noticed the following build warnings:

[auto build test WARNING on v6.11-rc7]
[also build test WARNING on linus/master]
[cannot apply to akpm-mm/mm-everything next-20240912]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:            https://github.com/intel-lab-lkp/linux/commits/Dev-Jain/mm-Abstract-THP-allocation/20240911-145809
base:           v6.11-rc7
patch link:     https://lore.kernel.org/r/20240911065600.1002644-2-dev.jain%40arm.com
patch subject:  [PATCH v3 1/2] mm: Abstract THP allocation
config:         x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20240912/202409122144.jqe4JROY-lkp@intel.com/config)
compiler:       clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240912/202409122144.jqe4JROY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409122144.jqe4JROY-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/huge_memory.c:1012:6: warning: variable 'pgtable' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
    1012 |         if (unlikely(!folio)) {
         |             ^~~~~~~~~~~~~~~~
   include/linux/compiler.h:77:22: note: expanded from macro 'unlikely'
      77 | # define unlikely(x)    __builtin_expect(!!(x), 0)
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~
   mm/huge_memory.c:1051:6: note: uninitialized use occurs here
    1051 |         if (pgtable)
         |             ^~~~~~~
   mm/huge_memory.c:1012:2: note: remove the 'if' if its condition is always false
    1012 |         if (unlikely(!folio)) {
         |         ^~~~~~~~~~~~~~~~~~~~~~~
    1013 |                 ret = VM_FAULT_FALLBACK;
         |                 ~~~~~~~~~~~~~~~~~~~~~~~~
    1014 |                 goto release;
         |                 ~~~~~~~~~~~~~
    1015 |         }
         |         ~
   mm/huge_memory.c:1006:19: note: initialize the variable 'pgtable' to silence this warning
    1006 |         pgtable_t pgtable;
         |                          ^
         |                           = NULL
   1 warning generated.
vim +1012 mm/huge_memory.c

  1001	
  1002	static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  1003	{
  1004		struct vm_area_struct *vma = vmf->vma;
  1005		struct folio *folio;
  1006		pgtable_t pgtable;
  1007		unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1008		vm_fault_t ret = 0;
  1009		gfp_t gfp = vma_thp_gfp_mask(vma);
  1010	
  1011		folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
> 1012		if (unlikely(!folio)) {
  1013			ret = VM_FAULT_FALLBACK;
  1014			goto release;
  1015		}
  1016	
  1017		pgtable = pte_alloc_one(vma->vm_mm);
  1018		if (unlikely(!pgtable)) {
  1019			ret = VM_FAULT_OOM;
  1020			goto release;
  1021		}
  1022	
  1023		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1024	
  1025		if (unlikely(!pmd_none(*vmf->pmd))) {
  1026			goto unlock_release;
  1027		} else {
  1028			ret = check_stable_address_space(vma->vm_mm);
  1029			if (ret)
  1030				goto unlock_release;
  1031	
  1032			/* Deliver the page fault to userland */
  1033			if (userfaultfd_missing(vma)) {
  1034				spin_unlock(vmf->ptl);
  1035				folio_put(folio);
  1036				pte_free(vma->vm_mm, pgtable);
  1037				ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1038				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1039				return ret;
  1040			}
  1041			pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  1042			map_pmd_thp(folio, vmf, vma, haddr);
  1043			spin_unlock(vmf->ptl);
  1044			__pmd_thp_fault_success_stats(vma);
  1045		}
  1046	
  1047		return 0;
  1048	unlock_release:
  1049		spin_unlock(vmf->ptl);
  1050	release:
  1051		if (pgtable)
  1052			pte_free(vma->vm_mm, pgtable);
  1053		if (folio)
  1054			folio_put(folio);
  1055		return ret;
  1056	
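One possible way to address this report (an illustrative sketch against the code as posted, not the actual follow-up fix) is to bail out before pgtable exists when the folio allocation fails, so the shared cleanup labels only ever see fully initialized state; alternatively, initializing pgtable to NULL, as the compiler note itself suggests, would also silence the warning.

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;
	pgtable_t pgtable;
	vm_fault_t ret = 0;

	folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
	if (unlikely(!folio))
		return VM_FAULT_FALLBACK;	/* nothing else allocated yet */

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		folio_put(folio);
		return VM_FAULT_OOM;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		map_pmd_thp(folio, vmf, vma, haddr);
		spin_unlock(vmf->ptl);
		__pmd_thp_fault_success_stats(vma);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
	/* Both folio and pgtable are known to be valid here. */
	pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;
}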
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 67c86a5d64a6..b96a1ff2bf40 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, + unsigned long haddr, unsigned long addr) { - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = page_folio(page); - pgtable_t pgtable; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret = 0; + const int order = HPAGE_PMD_ORDER; + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + goto out; + } + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + goto out; } folio_throttle_swaprate(folio, gfp); - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - folio_zero_user(folio, vmf->address); + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. 
*/ __folio_mark_uptodate(folio); +out: + return folio; +} + +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) +{ + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); +} + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + pgtable_t pgtable; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; + gfp_t gfp = vma_thp_gfp_mask(vma); + + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { - pmd_t entry; - ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; @@ -997,20 +1038,10 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } - - entry = mk_huge_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(vma->vm_mm); + map_pmd_thp(folio, vmf, vma, haddr); spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + __pmd_thp_fault_success_stats(vma); } return 0; @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, release: if (pgtable) pte_free(vma->vm_mm, pgtable); - folio_put(folio); + if (folio) + folio_put(folio); return ret; } @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - gfp_t gfp; - struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); - if (unlikely(!folio)) { - count_vm_event(THP_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + + return __do_huge_pmd_anonymous_page(vmf); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying
this patch.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 43 deletions(-)