[2/5] mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios

Message ID: 20231016200510.7387-3-vishal.moola@gmail.com
State: New
Series: Some khugepaged folio conversions

Commit Message

Vishal Moola Oct. 16, 2023, 8:05 p.m. UTC
Replaces 5 calls to compound_head() and removes 1466 bytes of kernel
text.

Previously, to determine whether any pte was shared, we checked the
mapcount of the exact subpage that pte mapped, which gave us a precise
count of shared ptes. folio_estimated_sharers() instead uses the
mapcount of the head page, so for tail page ptes it only gives an
estimate.

This means that if a tail page's mapcount is greater than its head
page's mapcount, folio_estimated_sharers() underestimates the number of
shared ptes, and vice versa.

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
 mm/khugepaged.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)
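
For reference, a minimal sketch of the two checks being swapped
(folio_estimated_sharers() as defined in include/linux/mm.h around this
series, quoted from memory; pte_is_shared_exact() is a hypothetical
wrapper, for illustration only):

/* Old check: per-subpage mapcount of the exact page the pte maps. */
static inline bool pte_is_shared_exact(struct page *page)
{
	return page_mapcount(page) > 1;
}

/* New check: head-page mapcount only; tail ptes get an estimate. */
static inline int folio_estimated_sharers(struct folio *folio)
{
	return page_mapcount(folio_page(folio, 0));
}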

Comments

Yang Shi Oct. 17, 2023, 8:41 p.m. UTC | #1
On Mon, Oct 16, 2023 at 1:06 PM Vishal Moola (Oracle)
<vishal.moola@gmail.com> wrote:
>
> [...]
> @@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>                 if (pte_write(pteval))
>                         writable = true;
>
> -               page = vm_normal_page(vma, _address, pteval);
> -               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> +               folio = vm_normal_folio(vma, _address, pteval);
> +               if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
>                         result = SCAN_PAGE_NULL;
>                         goto out_unmap;
>                 }
>
> -               if (page_mapcount(page) > 1) {
> +               if (folio_estimated_sharers(folio) > 1) {

This doesn't look correct. max_ptes_shared is used to cap the number of
shared PTEs. IIRC, folio_estimated_sharers() just reads the mapcount of
the head page. If we set max_ptes_shared to 256 and just the head page
is shared, "shared" would still reach 512 and prevent the area from
being collapsed, even though just one PTE is actually shared. This
breaks the semantics of max_ptes_shared.
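
A toy userspace model of the scan loop's "shared" accounting makes the
concern concrete (hypothetical mapcount values; not kernel code):

#include <stdio.h>

#define HPAGE_NR_PAGES 512	/* ptes covered by one PMD on x86-64 */

int main(void)
{
	int mapcount[HPAGE_NR_PAGES];
	int shared_exact = 0, shared_estimated = 0;

	for (int i = 0; i < HPAGE_NR_PAGES; i++)
		mapcount[i] = 1;
	mapcount[0] = 2;			/* only the head pte is shared */

	for (int i = 0; i < HPAGE_NR_PAGES; i++) {
		if (mapcount[i] > 1)		/* old: page_mapcount(page) */
			shared_exact++;
		if (mapcount[0] > 1)		/* new: folio_estimated_sharers() */
			shared_estimated++;
	}

	/* prints: exact=1 estimated=512 -- every pte looks shared */
	printf("exact=%d estimated=%d\n", shared_exact, shared_estimated);
	return 0;
}

With max_ptes_shared set to 256 as in the example above, the estimated
count trips the cap and the collapse is refused even though only one
pte is shared.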

Vishal Moola Oct. 18, 2023, 5:33 p.m. UTC | #2
On Tue, Oct 17, 2023 at 1:41 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Mon, Oct 16, 2023 at 1:06 PM Vishal Moola (Oracle)
> <vishal.moola@gmail.com> wrote:
> > [...]
> > @@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> >                 if (pte_write(pteval))
> >                         writable = true;
> >
> > -               page = vm_normal_page(vma, _address, pteval);
> > -               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> > +               folio = vm_normal_folio(vma, _address, pteval);
> > +               if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
> >                         result = SCAN_PAGE_NULL;
> >                         goto out_unmap;
> >                 }
> >
> > -               if (page_mapcount(page) > 1) {
> > +               if (folio_estimated_sharers(folio) > 1) {
>
> This doesn't look correct. max_ptes_shared is used to cap the number of
> shared PTEs. IIRC, folio_estimated_sharers() just reads the mapcount of
> the head page. If we set max_ptes_shared to 256 and just the head page
> is shared, "shared" would still reach 512 and prevent the area from
> being collapsed, even though just one PTE is actually shared. This
> breaks the semantics of max_ptes_shared.

In my testing this replacement appears to do the opposite (underestimating
instead of overestimating), which admittedly still breaks the semantics of
max_ptes_shared. This happens quite frequently in many regular use cases,
so I'll go back to checking each individual subpage's mapcount in v2.
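
One plausible way to hit the underestimating case (hypothetical, as a
variation on the toy model above): after fork() a PTE-mapped THP is
fully shared, and the child then CoWs only the head page, so the head's
mapcount drops to 1 while the 511 tails stay at 2:

#include <stdio.h>

#define HPAGE_NR_PAGES 512

int main(void)
{
	int mapcount[HPAGE_NR_PAGES];
	int shared_exact = 0, shared_estimated = 0;

	mapcount[0] = 1;			/* head CoW'd away by the child */
	for (int i = 1; i < HPAGE_NR_PAGES; i++)
		mapcount[i] = 2;		/* 511 tail ptes still shared */

	for (int i = 0; i < HPAGE_NR_PAGES; i++) {
		if (mapcount[i] > 1)		/* old: page_mapcount(page) */
			shared_exact++;
		if (mapcount[0] > 1)		/* new: folio_estimated_sharers() */
			shared_estimated++;
	}

	/* prints: exact=511 estimated=0 -- the cap never fires */
	printf("exact=%d estimated=%d\n", shared_exact, shared_estimated);
	return 0;
}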


Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7a552fe16c92..67aac53b31c8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1245,7 +1245,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	pte_t *pte, *_pte;
 	int result = SCAN_FAIL, referenced = 0;
 	int none_or_zero = 0, shared = 0;
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	unsigned long _address;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
@@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		if (pte_write(pteval))
 			writable = true;
 
-		page = vm_normal_page(vma, _address, pteval);
-		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+		folio = vm_normal_folio(vma, _address, pteval);
+		if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
 			result = SCAN_PAGE_NULL;
 			goto out_unmap;
 		}
 
-		if (page_mapcount(page) > 1) {
+		if (folio_estimated_sharers(folio) > 1) {
 			++shared;
 			if (cc->is_khugepaged &&
 			    shared > khugepaged_max_ptes_shared) {
@@ -1332,29 +1332,27 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
-		page = compound_head(page);
-
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
 		 * Khugepaged will allocate hugepage from the node has the max
 		 * hit record.
 		 */
-		node = page_to_nid(page);
+		node = folio_nid(folio);
 		if (hpage_collapse_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			goto out_unmap;
 		}
 		cc->node_load[node]++;
-		if (!PageLRU(page)) {
+		if (!folio_test_lru(folio)) {
 			result = SCAN_PAGE_LRU;
 			goto out_unmap;
 		}
-		if (PageLocked(page)) {
+		if (folio_test_locked(folio)) {
 			result = SCAN_PAGE_LOCK;
 			goto out_unmap;
 		}
-		if (!PageAnon(page)) {
+		if (!folio_test_anon(folio)) {
 			result = SCAN_PAGE_ANON;
 			goto out_unmap;
 		}
@@ -1369,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 * has excessive GUP pins (i.e. 512).  Anyway the same check
 		 * will be done again later the risk seems low.
 		 */
-		if (!is_refcount_suitable(page)) {
+		if (!is_refcount_suitable(&folio->page)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
@@ -1379,8 +1377,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 * enough young pte to justify collapsing the page
 		 */
 		if (cc->is_khugepaged &&
-		    (pte_young(pteval) || page_is_young(page) ||
-		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+		    (pte_young(pteval) || folio_test_young(folio) ||
+		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
 								     address)))
 			referenced++;
 	}
@@ -1402,7 +1400,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		*mmap_locked = false;
 	}
 out:
-	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+	trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
 				     none_or_zero, result, unmapped);
 	return result;
 }