diff mbox series

[RFC,4/6] mm: Add COW PTE fallback function

Message ID 20220519183127.3909598-5-shiyn.lin@gmail.com (mailing list archive)
State New
Headers show
Series Introduce Copy-On-Write to Page Table | expand

Commit Message

Chih-En Lin May 19, 2022, 6:31 p.m. UTC
The lifetime of COW PTE will handle by ownership and a reference count.
When the process wants to write the COW PTE, which reference count is 1,
it will reuse the COW PTE instead of copying then free.

Only the owner will update its RSS state and the record of page table
bytes allocation. So we need to handle when the non-owner process gets
the fallback COW PTE.

This commit prepares for the following implementation of the reference
count for COW PTE.

Signed-off-by: Chih-En Lin <shiyn.lin@gmail.com>
---
 mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

Comments

Christophe Leroy May 20, 2022, 2:21 p.m. UTC | #1
Le 19/05/2022 à 20:31, Chih-En Lin a écrit :
> The lifetime of COW PTE will handle by ownership and a reference count.
> When the process wants to write the COW PTE, which reference count is 1,
> it will reuse the COW PTE instead of copying then free.
> 
> Only the owner will update its RSS state and the record of page table
> bytes allocation. So we need to handle when the non-owner process gets
> the fallback COW PTE.
> 
> This commit prepares for the following implementation of the reference
> count for COW PTE.
> 
> Signed-off-by: Chih-En Lin <shiyn.lin@gmail.com>
> ---
>   mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 66 insertions(+)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 76e3af9639d9..dcb678cbb051 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1000,6 +1000,34 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
>          return new_page;
>   }
> 
> +static inline void cow_pte_rss(struct mm_struct *mm, struct vm_area_struct *vma,
> +       pmd_t *pmdp, unsigned long addr, unsigned long end, bool inc_dec)

Parenthesis alignment is not correct.

You should run 'scripts/checkpatch.pl --strict' on you patch.

> +{
> +       int rss[NR_MM_COUNTERS];
> +       pte_t *orig_ptep, *ptep;
> +       struct page *page;
> +
> +       init_rss_vec(rss);
> +
> +       ptep = pte_offset_map(pmdp, addr);
> +       orig_ptep = ptep;
> +       arch_enter_lazy_mmu_mode();
> +       do {
> +               if (pte_none(*ptep) || pte_special(*ptep))
> +                       continue;
> +
> +               page = vm_normal_page(vma, addr, *ptep);
> +               if (page) {
> +                       if (inc_dec)
> +                               rss[mm_counter(page)]++;
> +                       else
> +                               rss[mm_counter(page)]--;
> +               }
> +       } while (ptep++, addr += PAGE_SIZE, addr != end);
> +       arch_leave_lazy_mmu_mode();
> +       add_mm_rss_vec(mm, rss);
> +}
> +
>   static int
>   copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>                 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
> @@ -4554,6 +4582,44 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
>          return VM_FAULT_FALLBACK;
>   }
> 
> +/* COW PTE fallback to normal PTE:
> + * - two state here
> + *   - After break child :   [parent, rss=1, ref=1, write=NO , owner=parent]
> + *                        to [parent, rss=1, ref=1, write=YES, owner=NULL  ]
> + *   - After break parent:   [child , rss=0, ref=1, write=NO , owner=NULL  ]
> + *                        to [child , rss=1, ref=1, write=YES, owner=NULL  ]
> + */
> +void cow_pte_fallback(struct vm_area_struct *vma, pmd_t *pmd,
> +               unsigned long addr)

There should be a prototype in a header somewhere for a non static function.

You are encouraged to run 'make mm/memory.o C=2' to check sparse reports.

> +{
> +       struct mm_struct *mm = vma->vm_mm;
> +       unsigned long start, end;
> +       pmd_t new;
> +
> +       BUG_ON(pmd_write(*pmd));

You seem to add a lot of BUG_ONs(). Are they really necessary ? See 
https://docs.kernel.org/process/deprecated.html?highlight=bug_on#bug-and-bug-on

You may also use VM_BUG_ON().

> +
> +       start = addr & PMD_MASK;
> +       end = (addr + PMD_SIZE) & PMD_MASK;
> +
> +       /* If pmd is not owner, it needs to increase the rss.
> +        * Since only the owner has the RSS state for the COW PTE.
> +        */
> +       if (!cow_pte_owner_is_same(pmd, pmd)) {
> +               cow_pte_rss(mm, vma, pmd, start, end, true /* inc */);
> +               mm_inc_nr_ptes(mm);
> +               smp_wmb();
> +               pmd_populate(mm, pmd, pmd_page(*pmd));
> +       }
> +
> +       /* Reuse the pte page */
> +       set_cow_pte_owner(pmd, NULL);
> +       new = pmd_mkwrite(*pmd);
> +       set_pmd_at(mm, addr, pmd, new);
> +
> +       BUG_ON(!pmd_write(*pmd));
> +       BUG_ON(pmd_page(*pmd)->cow_pte_owner);
> +}
> +
>   /*
>    * These routines also need to handle stuff like marking pages dirty
>    * and/or accessed for architectures that don't do it in hardware (most
> --
> 2.36.1
>
Chih-En Lin May 21, 2022, 4:15 a.m. UTC | #2
On Fri, May 20, 2022 at 02:21:54PM +0000, Christophe Leroy wrote:
> > +/* COW PTE fallback to normal PTE:
> > + * - two state here
> > + *   - After break child :   [parent, rss=1, ref=1, write=NO , owner=parent]
> > + *                        to [parent, rss=1, ref=1, write=YES, owner=NULL  ]
> > + *   - After break parent:   [child , rss=0, ref=1, write=NO , owner=NULL  ]
> > + *                        to [child , rss=1, ref=1, write=YES, owner=NULL  ]
> > + */
> > +void cow_pte_fallback(struct vm_area_struct *vma, pmd_t *pmd,
> > +               unsigned long addr)
> 
> There should be a prototype in a header somewhere for a non static function.
> 
> You are encouraged to run 'make mm/memory.o C=2' to check sparse reports.
> 

I will do all the above checking before sending the next version.

> > +{
> > +       struct mm_struct *mm = vma->vm_mm;
> > +       unsigned long start, end;
> > +       pmd_t new;
> > +
> > +       BUG_ON(pmd_write(*pmd));
> 
> You seem to add a lot of BUG_ONs(). Are they really necessary ? See 
> https://docs.kernel.org/process/deprecated.html?highlight=bug_on#bug-and-bug-on
> 
> You may also use VM_BUG_ON().
> 

Sure.
I added BUG_ON() when doing the debug.
I will consider again which one is necessary.
And change to use VM_BUG_ON().

Thanks.
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 76e3af9639d9..dcb678cbb051 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1000,6 +1000,34 @@  page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
 	return new_page;
 }
 
+static inline void cow_pte_rss(struct mm_struct *mm, struct vm_area_struct *vma,
+	pmd_t *pmdp, unsigned long addr, unsigned long end, bool inc_dec)
+{
+	int rss[NR_MM_COUNTERS];
+	pte_t *orig_ptep, *ptep;
+	struct page *page;
+
+	init_rss_vec(rss);
+
+	ptep = pte_offset_map(pmdp, addr);
+	orig_ptep = ptep;
+	arch_enter_lazy_mmu_mode();
+	do {
+		if (pte_none(*ptep) || pte_special(*ptep))
+			continue;
+
+		page = vm_normal_page(vma, addr, *ptep);
+		if (page) {
+			if (inc_dec)
+				rss[mm_counter(page)]++;
+			else
+				rss[mm_counter(page)]--;
+		}
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
+	add_mm_rss_vec(mm, rss);
+}
+
 static int
 copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -4554,6 +4582,44 @@  static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 	return VM_FAULT_FALLBACK;
 }
 
+/* COW PTE fallback to normal PTE:
+ * - two state here
+ *   - After break child :   [parent, rss=1, ref=1, write=NO , owner=parent]
+ *                        to [parent, rss=1, ref=1, write=YES, owner=NULL  ]
+ *   - After break parent:   [child , rss=0, ref=1, write=NO , owner=NULL  ]
+ *                        to [child , rss=1, ref=1, write=YES, owner=NULL  ]
+ */
+void cow_pte_fallback(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start, end;
+	pmd_t new;
+
+	BUG_ON(pmd_write(*pmd));
+
+	start = addr & PMD_MASK;
+	end = (addr + PMD_SIZE) & PMD_MASK;
+
+	/* If pmd is not owner, it needs to increase the rss.
+	 * Since only the owner has the RSS state for the COW PTE.
+	 */
+	if (!cow_pte_owner_is_same(pmd, pmd)) {
+		cow_pte_rss(mm, vma, pmd, start, end, true /* inc */);
+		mm_inc_nr_ptes(mm);
+		smp_wmb();
+		pmd_populate(mm, pmd, pmd_page(*pmd));
+	}
+
+	/* Reuse the pte page */
+	set_cow_pte_owner(pmd, NULL);
+	new = pmd_mkwrite(*pmd);
+	set_pmd_at(mm, addr, pmd, new);
+
+	BUG_ON(!pmd_write(*pmd));
+	BUG_ON(pmd_page(*pmd)->cow_pte_owner);
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most