diff mbox

arm64: dirty memory check, enable soft dirty for arm64

Message ID 1512029649-61312-1-git-send-email-bin.lu@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bin Lu Nov. 30, 2017, 8:14 a.m. UTC
arch/arm64/Kconfig: enable CONFIG_HAVE_ARCH_SOFT_DIRTY
    arch/arm64/include/asm/pgtable.h: enable functions for soft dirty

    Checkpoint/Restore In Userspace(CRIU) needs the page's change
    to be soft tracked. This allows to do a pre checkpoint and then dump
    only touched pages.

    This is done by using a PTE bit (PTE_DIRTY) when
    the page is backed in memory, and a new _PAGE_SWP_SOFT_DIRTY bit when
    the page is swapped out.

    The bit of PTE_DIRTY is already defined in pgtable-prot.h.

    The _PAGE_SWP_SOFT_DIRTY bit is set as 59 in the swap pte.

    pmd_dirty and pmd_mkclean for THP page MADV_FREE also were
    supported in this patch.

    This patch has been tested with ZDTM test suite in CRIU.
    Such as: ./zdtm.py run -t zdtm/static/env00 --pre 10
Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 arch/arm64/Kconfig               |  1 +
 arch/arm64/include/asm/pgtable.h | 46 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

Comments

Steve Capper Dec. 5, 2017, 1:24 p.m. UTC | #1
Hi Bin,

On Thu, Nov 30, 2017 at 12:14:09AM -0800, Bin Lu wrote:
>     arch/arm64/Kconfig: enable CONFIG_HAVE_ARCH_SOFT_DIRTY
>     arch/arm64/include/asm/pgtable.h: enable functions for soft dirty
> 
>     Checkpoint/Restore In Userspace(CRIU) needs the page's change
>     to be soft tracked. This allows to do a pre checkpoint and then dump
>     only touched pages.
> 
>     This is done by using a PTE bit (PTE_DIRTY) when
>     the page is backed in memory, and a new _PAGE_SWP_SOFT_DIRTY bit when
>     the page is swapped out.
> 
>     The bit of PTE_DIRTY is already defined in pgtable-prot.h.
> 
>     The _PAGE_SWP_SOFT_DIRTY bit is set as 59 in the swap pte.
> 
>     pmd_dirty and pmd_mkclean for THP page MADV_FREE also were
>     supported in this patch.
> 
>     This patch has been tested with ZDTM test suite in CRIU.
>     Such as: ./zdtm.py run -t zdtm/static/env00 --pre 10
> Signed-off-by: Bin Lu <bin.lu@arm.com>
> ---
>  arch/arm64/Kconfig               |  1 +
>  arch/arm64/include/asm/pgtable.h | 46 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 47 insertions(+)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index dfd9086..933f5be 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -73,6 +73,7 @@ config ARM64
>  	select HAVE_ARCH_MMAP_RND_BITS
>  	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>  	select HAVE_ARCH_SECCOMP_FILTER
> +	select HAVE_ARCH_SOFT_DIRTY
>  	select HAVE_ARCH_TRACEHOOK
>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>  	select HAVE_ARM_SMCCC
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 6eae342..8ab251e 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -306,6 +306,28 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
>  	return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +static inline bool pte_soft_dirty(pte_t pte)
> +{
> +        return pte_sw_dirty(pte);
> +}
> +
> +static inline pte_t pte_mksoft_dirty(pte_t pte)
> +{
> +        return pte_mkdirty(pte);
> +}
> +
> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> +{
> +        return pte_mkclean(pte);
> +}
> +
> +#define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
> +#define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
> +#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
> +
> +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
> +
>  #ifdef CONFIG_NUMA_BALANCING
>  /*
>   * See the comment in include/asm-generic/pgtable.h
> @@ -695,10 +717,12 @@ extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>   *	bits 2-7:	swap type
>   *	bits 8-57:	swap offset
>   *	bit  58:	PTE_PROT_NONE (must be zero)
> + *	bit  59:	swap software dirty tracking
>   */
>  #define __SWP_TYPE_SHIFT	2
>  #define __SWP_TYPE_BITS		6
>  #define __SWP_OFFSET_BITS	50
> +#define __SWP_PROT_NONE_BITS	1
>  #define __SWP_TYPE_MASK		((1 << __SWP_TYPE_BITS) - 1)
>  #define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
>  #define __SWP_OFFSET_MASK	((1UL << __SWP_OFFSET_BITS) - 1)
> @@ -710,6 +734,28 @@ extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>  #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
>  #define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
>  
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +#define _PAGE_SWP_SOFT_DIRTY   (1UL << (__SWP_OFFSET_SHIFT + __SWP_OFFSET_BITS + __SWP_PROT_NONE_BITS))
> +#else
> +#define _PAGE_SWP_SOFT_DIRTY    0UL
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +static inline bool pte_swp_soft_dirty(pte_t pte)
> +{
> +        return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
> +}
> +
> +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> +{
> +        return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
> +}
> +
> +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> +{
> +        return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
> +}
> +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
>  /*
>   * Ensure that there are not more swap files than can be encoded in the kernel
>   * PTEs.
> --

This looks good to me. Soft dirty ptes are created by both pte_mkdirty
and pte_mksoft_dirty. If a soft-dirty bit is cleared by
clear_soft_dirty(.), then the pte is also made read-only. Meaning
hardware DBM won't set the pte as dirty on write, a fault will take place
and then the pte will be made both writable and soft dirty.

Some changes to ptep_set_wrprotect(.) to preserve the dirty bit may
cause more soft dirty ptes to appear:
http://lists.infradead.org/pipermail/linux-arm-kernel/2017-December/545151.html

I *think* that shouldn't affect the use case I understand for the
patch - to identify whether or not a page has been written to since
last cleaned. 

FWIW:
Reviewed-by: Steve Capper <steve.capper@arm.com>

Cheers,
Catalin Marinas Jan. 2, 2018, 6:19 p.m. UTC | #2
Hi Bin,

On Thu, Nov 30, 2017 at 12:14:09AM -0800, Bin Lu wrote:
>     pmd_dirty and pmd_mkclean for THP page MADV_FREE also were
>     supported in this patch.
[...]
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -306,6 +306,28 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
>  	return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +static inline bool pte_soft_dirty(pte_t pte)
> +{
> +        return pte_sw_dirty(pte);
> +}
> +
> +static inline pte_t pte_mksoft_dirty(pte_t pte)
> +{
> +        return pte_mkdirty(pte);
> +}
> +
> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> +{
> +        return pte_mkclean(pte);
> +}
> +
> +#define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
> +#define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
> +#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))

IIUC, a pmd_mkclean() would result in a pte_soft_dirty() == false. Is
this the expected behaviour with MADV_FREE? The x86 code sets both
_PAGE_DIRTY and _PAGE_SOFT_DIRTY in pmd_mkdirty() but only clears
_PAGE_DIRTY in pmd_mkclean().
Steve Capper Jan. 3, 2018, 1:03 p.m. UTC | #3
Hi Catalin,

On Tue, Jan 02, 2018 at 06:19:10PM +0000, Catalin Marinas wrote:
> Hi Bin,
> 
> On Thu, Nov 30, 2017 at 12:14:09AM -0800, Bin Lu wrote:
> >     pmd_dirty and pmd_mkclean for THP page MADV_FREE also were
> >     supported in this patch.
> [...]
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -306,6 +306,28 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
> >  	return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
> >  }
> >  
> > +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> > +static inline bool pte_soft_dirty(pte_t pte)
> > +{
> > +        return pte_sw_dirty(pte);
> > +}
> > +
> > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> > +{
> > +        return pte_mkdirty(pte);
> > +}
> > +
> > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > +{
> > +        return pte_mkclean(pte);
> > +}
> > +
> > +#define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
> > +#define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
> > +#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
> 
> IIUC, a pmd_mkclean() would result in a pte_soft_dirty() == false. Is
> this the expected behaviour with MADV_FREE? The x86 code sets both
> _PAGE_DIRTY and _PAGE_SOFT_DIRTY in pmd_mkdirty() but only clears
> _PAGE_DIRTY in pmd_mkclean().
> 

For MADV_FREE, my understanding is that userspace hints to the memory
manager that pages are no longer needed and if no subsequent
modifications are made to the page then the kernel can free the pages
(and swap in a zero page should userspace attempt to read back the page
after it's been freed).

We would get a divergence in behaviour between arm64 and x86 when a
userspace program calls MADV_FREE on a soft dirty page, and where the
memory manager has not yet freed it.

So on x86 the page would be flagged as soft dirty and on arm64 this
would not be soft dirty.  I originally thought that this would be okay
as userspace has flagged the page as no longer needed. Having had
another big think about it though, I think we'd have one potential
problem with the following course of events:

 * CRIU dumps out page A from running process,
 * process subsequently modifies page A -> marked as soft dirty,
 * process calls MADV_FREE, page A no longer soft dirty,
 * CRIU does not pick up on the changes made to page A, so the other end
   gets an old page restored.

Apologies for missing this before, I neglected to make the distinction
between stale data and dropped data (userspace knows about dropped data
because the page comes back in as zeros).

The arm64 pte_dirty() checks for software and hardware dirty, thus our
pte_mkclean() should clear both states; otherwise MADV_FREE would result
in no pages being swapped out under memory pressure.

Adding an extra dirty bit would fix this (but then we'd have effectively
three dirty bits for ptes), it is not clear to me whether or not the
HW/SW dirty bit logic could be tweaked for arm64 to give us behaviour
parity with x86.

Anyway some more thought required :-).

Cheers,
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dfd9086..933f5be 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -73,6 +73,7 @@  config ARM64
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARM_SMCCC
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 6eae342..8ab251e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -306,6 +306,28 @@  static inline pgprot_t mk_sect_prot(pgprot_t prot)
 	return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
 }
 
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_soft_dirty(pte_t pte)
+{
+        return pte_sw_dirty(pte);
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+        return pte_mkdirty(pte);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+        return pte_mkclean(pte);
+}
+
+#define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
+#define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
+#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
+
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * See the comment in include/asm-generic/pgtable.h
@@ -695,10 +717,12 @@  extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
  *	bits 2-7:	swap type
  *	bits 8-57:	swap offset
  *	bit  58:	PTE_PROT_NONE (must be zero)
+ *	bit  59:	swap software dirty tracking
  */
 #define __SWP_TYPE_SHIFT	2
 #define __SWP_TYPE_BITS		6
 #define __SWP_OFFSET_BITS	50
+#define __SWP_PROT_NONE_BITS	1
 #define __SWP_TYPE_MASK		((1 << __SWP_TYPE_BITS) - 1)
 #define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
 #define __SWP_OFFSET_MASK	((1UL << __SWP_OFFSET_BITS) - 1)
@@ -710,6 +734,28 @@  extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY   (1UL << (__SWP_OFFSET_SHIFT + __SWP_OFFSET_BITS + __SWP_PROT_NONE_BITS))
+#else
+#define _PAGE_SWP_SOFT_DIRTY    0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_swp_soft_dirty(pte_t pte)
+{
+        return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+        return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+        return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 /*
  * Ensure that there are not more swap files than can be encoded in the kernel
  * PTEs.