diff mbox series

riscv/mm: Add soft-dirty page tracking support

Message ID 20240710033004.3923527-1-zhangchunyan@iscas.ac.cn (mailing list archive)
State Deferred
Headers show
Series riscv/mm: Add soft-dirty page tracking support | expand

Checks

Context Check Description
conchuod/patch-1-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-1-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6 warning .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-1-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-1-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-1-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-1-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-1-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-1-test-12 success .github/scripts/patches/tests/verify_signedoff.sh
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Chunyan Zhang July 10, 2024, 3:30 a.m. UTC
The PTE bit (9) is reserved for software, so we can use it for
soft-dirty tracking. This patch adds its standard handlers for
PTE, PMD, and swap entry.

To add swap PTE soft-dirty tracking, we borrow bit (4) which is
available for swap PTEs on RISC-V systems.

This patch has been tested with the kselftest mm suite in which
soft-dirty and madv_populate run and pass, and no regressions
are observed in any of the other tests.

Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
---
 arch/riscv/Kconfig                    |  1 +
 arch/riscv/include/asm/pgtable-bits.h | 13 ++++++
 arch/riscv/include/asm/pgtable.h      | 65 ++++++++++++++++++++++++++-
 3 files changed, 78 insertions(+), 1 deletion(-)

Comments

Alexandre Ghiti July 15, 2024, 11:21 a.m. UTC | #1
Hi Chunyan,

On 10/07/2024 05:30, Chunyan Zhang wrote:
> The PTE bit (9) is reserved for software, so we can use it for
> soft-dirty tracking. This patch adds its standard handlers for
> PTE, PMD, and swap entry.


Unfortunately, ZONE_DEVICE has just used this last bit and should be 
merged in 6.11.

I'm currently discussing internally how we can get 2 other PTE bits from 
RVI in order to have the same number of available bits as x86 and arm64. 
I guess that for now, if we really have a usecase for softdirty (and I 
think we do with CRIU), we'll have to make ZONE_DEVICE and softdirty 
mutually exclusive.


>
> To add swap PTE soft-dirty tracking, we borrow bit (4) which is
> available for swap PTEs on RISC-V systems.
>
> This patch has been tested with the kselftest mm suite in which
> soft-dirty and madv_populate run and pass, and no regressions
> are observed in any of the other tests.


Did you give CRIU a try?

Thanks,

Alex


>
> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> ---
>   arch/riscv/Kconfig                    |  1 +
>   arch/riscv/include/asm/pgtable-bits.h | 13 ++++++
>   arch/riscv/include/asm/pgtable.h      | 65 ++++++++++++++++++++++++++-
>   3 files changed, 78 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index b94176e25be1..2e3ad2925a6b 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -118,6 +118,7 @@ config RISCV
>   	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>   	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
>   	select HAVE_ARCH_SECCOMP_FILTER
> +	select HAVE_ARCH_SOFT_DIRTY
>   	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>   	select HAVE_ARCH_TRACEHOOK
>   	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
> diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
> index 179bd4afece4..bab48f5fd1e2 100644
> --- a/arch/riscv/include/asm/pgtable-bits.h
> +++ b/arch/riscv/include/asm/pgtable-bits.h
> @@ -19,6 +19,19 @@
>   #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
>   
>   #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +#define _PAGE_SOFT_DIRTY	(1 << 9)    /* RSW: 0x2 for software dirty tracking */
> +/*
> + * BIT 4 is not involved into swap entry computation, so we
> + * can borrow it for swap page soft-dirty tracking.
> + */
> +#define _PAGE_SWP_SOFT_DIRTY	_PAGE_USER
> +#else
> +#define _PAGE_SOFT_DIRTY	0
> +#define _PAGE_SWP_SOFT_DIRTY	0
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +
>   #define _PAGE_TABLE     _PAGE_PRESENT
>   
>   /*
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index aad8b8ca51f1..46f512f52580 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -408,7 +408,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
>   
>   static inline pte_t pte_mkdirty(pte_t pte)
>   {
> -	return __pte(pte_val(pte) | _PAGE_DIRTY);
> +	return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
>   }
>   
>   static inline pte_t pte_mkclean(pte_t pte)
> @@ -436,6 +436,36 @@ static inline pte_t pte_mkhuge(pte_t pte)
>   	return pte;
>   }
>   
> +static inline int pte_soft_dirty(pte_t pte)
> +{
> +	return pte_val(pte) & _PAGE_SOFT_DIRTY;
> +}
> +
> +static inline pte_t pte_mksoft_dirty(pte_t pte)
> +{
> +	return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
> +}
> +
> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> +{
> +	return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
> +}
> +
> +static inline int pte_swp_soft_dirty(pte_t pte)
> +{
> +	return pte_val(pte) & _PAGE_SWP_SOFT_DIRTY;
> +}
> +
> +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> +{
> +	return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
> +}
> +
> +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> +{
> +	return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
> +}
> +
>   #ifdef CONFIG_RISCV_ISA_SVNAPOT
>   #define pte_leaf_size(pte)	(pte_napot(pte) ?				\
>   					napot_cont_size(napot_cont_order(pte)) :\
> @@ -721,6 +751,38 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
>   	return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
>   }
>   
> +static inline int pmd_soft_dirty(pmd_t pmd)
> +{
> +	return pte_soft_dirty(pmd_pte(pmd));
> +}
> +
> +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> +}
> +
> +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> +}
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> +{
> +	return pte_swp_soft_dirty(pmd_pte(pmd));
> +}
> +
> +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
> +}
> +
> +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
> +}
> +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
>   static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>   				pmd_t *pmdp, pmd_t pmd)
>   {
> @@ -811,6 +873,7 @@ extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>    * Format of swap PTE:
>    *	bit            0:	_PAGE_PRESENT (zero)
>    *	bit       1 to 3:       _PAGE_LEAF (zero)
> + *	bit            4:	_PAGE_SWP_SOFT_DIRTY
>    *	bit            5:	_PAGE_PROT_NONE (zero)
>    *	bit            6:	exclusive marker
>    *	bits      7 to 11:	swap type
Chunyan Zhang July 16, 2024, 2:16 a.m. UTC | #2
Hi Alex,

On Mon, 15 Jul 2024 at 19:21, Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Hi Chunyan,
>
> On 10/07/2024 05:30, Chunyan Zhang wrote:
> > The PTE bit (9) is reserved for software, so we can use it for
> > soft-dirty tracking. This patch adds its standard handlers for
> > PTE, PMD, and swap entry.
>
>
> Unfortunately, ZONE_DEVICE has just used this last bit and should be
> merged in 6.11.

Yes, I read the patch just now.

> I'm currently discussing internally how we can get 2 other PTE bits from
> RVI in order to have the same number of available bits as x86 and arm64.

Yes I noticed that PTE bits reserved for software are too limited on RISC-V.

Besides softdirty, we could probably also support uffd write-protect on
RISC-V if we had two PTE bits available.

> I guess that for now, if we really have a usecase for softdirty (and I
> think we do with CRIU), we'll have to make ZONE_DEVICE and soft-dirty
> mutually exclusive.

Yes, I also learned that CRIU uses soft-dirty.

> >
> > To add swap PTE soft-dirty tracking, we borrow bit (4) which is
> > available for swap PTEs on RISC-V systems.
> >
> > This patch has been tested with the kselftest mm suite in which
> > soft-dirty and madv_populate run and pass, and no regressions
> > are observed in any of the other tests.
>
>
> Did you give CRIU a try?

I haven't tried CRIU yet; I actually noticed that soft-dirty was missing
on RISC-V while running the mm selftest cases.

I can prepare a new patch in which soft-dirty and ZONE_DEVICE share
PTE bit 9, making the two features mutually exclusive, if this
solution is accepted.

Alternatively, we could hold off on adding soft-dirty until more PTE
bits are available for software use.

I'm open to suggestions.

Thanks,
Chunyan

>
> Thanks,
>
> Alex
>
>
> >
> > Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> > ---
> >   arch/riscv/Kconfig                    |  1 +
> >   arch/riscv/include/asm/pgtable-bits.h | 13 ++++++
> >   arch/riscv/include/asm/pgtable.h      | 65 ++++++++++++++++++++++++++-
> >   3 files changed, 78 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index b94176e25be1..2e3ad2925a6b 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -118,6 +118,7 @@ config RISCV
> >       select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
> >       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> >       select HAVE_ARCH_SECCOMP_FILTER
> > +     select HAVE_ARCH_SOFT_DIRTY
> >       select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> >       select HAVE_ARCH_TRACEHOOK
> >       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
> > diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
> > index 179bd4afece4..bab48f5fd1e2 100644
> > --- a/arch/riscv/include/asm/pgtable-bits.h
> > +++ b/arch/riscv/include/asm/pgtable-bits.h
> > @@ -19,6 +19,19 @@
> >   #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
> >
> >   #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
> > +
> > +#ifdef CONFIG_MEM_SOFT_DIRTY
> > +#define _PAGE_SOFT_DIRTY     (1 << 9)    /* RSW: 0x2 for software dirty tracking */
> > +/*
> > + * BIT 4 is not involved into swap entry computation, so we
> > + * can borrow it for swap page soft-dirty tracking.
> > + */
> > +#define _PAGE_SWP_SOFT_DIRTY _PAGE_USER
> > +#else
> > +#define _PAGE_SOFT_DIRTY     0
> > +#define _PAGE_SWP_SOFT_DIRTY 0
> > +#endif /* CONFIG_MEM_SOFT_DIRTY */
> > +
> >   #define _PAGE_TABLE     _PAGE_PRESENT
> >
> >   /*
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index aad8b8ca51f1..46f512f52580 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -408,7 +408,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
> >
> >   static inline pte_t pte_mkdirty(pte_t pte)
> >   {
> > -     return __pte(pte_val(pte) | _PAGE_DIRTY);
> > +     return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
> >   }
> >
> >   static inline pte_t pte_mkclean(pte_t pte)
> > @@ -436,6 +436,36 @@ static inline pte_t pte_mkhuge(pte_t pte)
> >       return pte;
> >   }
> >
> > +static inline int pte_soft_dirty(pte_t pte)
> > +{
> > +     return pte_val(pte) & _PAGE_SOFT_DIRTY;
> > +}
> > +
> > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> > +{
> > +     return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
> > +}
> > +
> > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > +{
> > +     return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
> > +}
> > +
> > +static inline int pte_swp_soft_dirty(pte_t pte)
> > +{
> > +     return pte_val(pte) & _PAGE_SWP_SOFT_DIRTY;
> > +}
> > +
> > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> > +{
> > +     return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
> > +}
> > +
> > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> > +{
> > +     return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
> > +}
> > +
> >   #ifdef CONFIG_RISCV_ISA_SVNAPOT
> >   #define pte_leaf_size(pte)  (pte_napot(pte) ?                               \
> >                                       napot_cont_size(napot_cont_order(pte)) :\
> > @@ -721,6 +751,38 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
> >       return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
> >   }
> >
> > +static inline int pmd_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_soft_dirty(pmd_pte(pmd));
> > +}
> > +
> > +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> > +}
> > +
> > +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> > +}
> > +
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_swp_soft_dirty(pmd_pte(pmd));
> > +}
> > +
> > +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
> > +}
> > +
> > +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
> > +}
> > +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> > +
> >   static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> >                               pmd_t *pmdp, pmd_t pmd)
> >   {
> > @@ -811,6 +873,7 @@ extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> >    * Format of swap PTE:
> >    *  bit            0:       _PAGE_PRESENT (zero)
> >    *  bit       1 to 3:       _PAGE_LEAF (zero)
> > + *   bit            4:       _PAGE_SWP_SOFT_DIRTY
> >    *  bit            5:       _PAGE_PROT_NONE (zero)
> >    *  bit            6:       exclusive marker
> >    *  bits      7 to 11:      swap type
Alexandre Ghiti July 16, 2024, 1 p.m. UTC | #3
Hi Chunyan,

On 16/07/2024 04:16, Chunyan Zhang wrote:
> Hi Alex,
>
> On Mon, 15 Jul 2024 at 19:21, Alexandre Ghiti <alex@ghiti.fr> wrote:
>> Hi Chunyan,
>>
>> On 10/07/2024 05:30, Chunyan Zhang wrote:
>>> The PTE bit (9) is reserved for software, so we can use it for
>>> soft-dirty tracking. This patch adds its standard handlers for
>>> PTE, PMD, and swap entry.
>>
>> Unfortunately, ZONE_DEVICE has just used this last bit and should be
>> merged in 6.11.
> Yes, I read the patch just now.
>
>> I'm currently discussing internally how we can get 2 other PTE bits from
>> RVI in order to have the same number of available bits as x86 and arm64.
> Yes I noticed that PTE bits reserved for software are too limited on RISC-V.
>
> Besides softdirty, we probably can support uffd write-protect on
> RISC-V if we will have two PTE bits for use.


Indeed, softdirty and uffd-wp will use two PTE bits.


>
>> I guess that for now, if we really have a usecase for softdirty (and I
>> think we do with CRIU), we'll have to make ZONE_DEVICE and soft-dirty
>> mutually exclusive.
> Yes, I also learned that CRIU uses soft-dirty.
>
>>> To add swap PTE soft-dirty tracking, we borrow bit (4) which is
>>> available for swap PTEs on RISC-V systems.
>>>
>>> This patch has been tested with the kselftest mm suite in which
>>> soft-dirty and madv_populate run and pass, and no regressions
>>> are observed in any of the other tests.
>>
>> Did you give CRIU a try?
> I haven't tried CRIU, actually I found soft-dirty was missing on
> RISC-V by the way of running mm selftest cases.


Since CRIU is the main user (?) of softdirty, it would be really nice if 
you can test it :)


>
> I can cook a new patch to implement soft-dirty and ZONE_DEVICE share
> the PTE bit(9), and make both features mutually exclusive if this
> solution is accepted.


I agree with this solution, let's implement both softdirty and uffd-wp 
by sharing the last PTE bit that ZONE_DEVICE stole. At least it will 
allow people to play with them.

Do you intend to work on uffd-wp? This is on my todo list, so up to you.


>
> Or not to add soft-dirty until we have more other PTE bits that can be
> used for software.
>
> I'm open to listen to suggestions.
>
> Thanks,
> Chunyan


Thanks,

Alex


>
>> Thanks,
>>
>> Alex
>>
>>
>>> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
>>> ---
>>>    arch/riscv/Kconfig                    |  1 +
>>>    arch/riscv/include/asm/pgtable-bits.h | 13 ++++++
>>>    arch/riscv/include/asm/pgtable.h      | 65 ++++++++++++++++++++++++++-
>>>    3 files changed, 78 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index b94176e25be1..2e3ad2925a6b 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -118,6 +118,7 @@ config RISCV
>>>        select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>>>        select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
>>>        select HAVE_ARCH_SECCOMP_FILTER
>>> +     select HAVE_ARCH_SOFT_DIRTY
>>>        select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>>>        select HAVE_ARCH_TRACEHOOK
>>>        select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
>>> diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
>>> index 179bd4afece4..bab48f5fd1e2 100644
>>> --- a/arch/riscv/include/asm/pgtable-bits.h
>>> +++ b/arch/riscv/include/asm/pgtable-bits.h
>>> @@ -19,6 +19,19 @@
>>>    #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
>>>
>>>    #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
>>> +
>>> +#ifdef CONFIG_MEM_SOFT_DIRTY
>>> +#define _PAGE_SOFT_DIRTY     (1 << 9)    /* RSW: 0x2 for software dirty tracking */
>>> +/*
>>> + * BIT 4 is not involved into swap entry computation, so we
>>> + * can borrow it for swap page soft-dirty tracking.
>>> + */
>>> +#define _PAGE_SWP_SOFT_DIRTY _PAGE_USER
>>> +#else
>>> +#define _PAGE_SOFT_DIRTY     0
>>> +#define _PAGE_SWP_SOFT_DIRTY 0
>>> +#endif /* CONFIG_MEM_SOFT_DIRTY */
>>> +
>>>    #define _PAGE_TABLE     _PAGE_PRESENT
>>>
>>>    /*
>>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>>> index aad8b8ca51f1..46f512f52580 100644
>>> --- a/arch/riscv/include/asm/pgtable.h
>>> +++ b/arch/riscv/include/asm/pgtable.h
>>> @@ -408,7 +408,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
>>>
>>>    static inline pte_t pte_mkdirty(pte_t pte)
>>>    {
>>> -     return __pte(pte_val(pte) | _PAGE_DIRTY);
>>> +     return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
>>>    }
>>>
>>>    static inline pte_t pte_mkclean(pte_t pte)
>>> @@ -436,6 +436,36 @@ static inline pte_t pte_mkhuge(pte_t pte)
>>>        return pte;
>>>    }
>>>
>>> +static inline int pte_soft_dirty(pte_t pte)
>>> +{
>>> +     return pte_val(pte) & _PAGE_SOFT_DIRTY;
>>> +}
>>> +
>>> +static inline pte_t pte_mksoft_dirty(pte_t pte)
>>> +{
>>> +     return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
>>> +}
>>> +
>>> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
>>> +{
>>> +     return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
>>> +}
>>> +
>>> +static inline int pte_swp_soft_dirty(pte_t pte)
>>> +{
>>> +     return pte_val(pte) & _PAGE_SWP_SOFT_DIRTY;
>>> +}
>>> +
>>> +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
>>> +{
>>> +     return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
>>> +}
>>> +
>>> +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
>>> +{
>>> +     return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
>>> +}
>>> +
>>>    #ifdef CONFIG_RISCV_ISA_SVNAPOT
>>>    #define pte_leaf_size(pte)  (pte_napot(pte) ?                               \
>>>                                        napot_cont_size(napot_cont_order(pte)) :\
>>> @@ -721,6 +751,38 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
>>>        return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
>>>    }
>>>
>>> +static inline int pmd_soft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_soft_dirty(pmd_pte(pmd));
>>> +}
>>> +
>>> +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
>>> +}
>>> +
>>> +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
>>> +}
>>> +
>>> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>>> +static inline int pmd_swp_soft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_swp_soft_dirty(pmd_pte(pmd));
>>> +}
>>> +
>>> +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
>>> +}
>>> +
>>> +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
>>> +{
>>> +     return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
>>> +}
>>> +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>> +
>>>    static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>>>                                pmd_t *pmdp, pmd_t pmd)
>>>    {
>>> @@ -811,6 +873,7 @@ extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>>>     * Format of swap PTE:
>>>     *  bit            0:       _PAGE_PRESENT (zero)
>>>     *  bit       1 to 3:       _PAGE_LEAF (zero)
>>> + *   bit            4:       _PAGE_SWP_SOFT_DIRTY
>>>     *  bit            5:       _PAGE_PROT_NONE (zero)
>>>     *  bit            6:       exclusive marker
>>>     *  bits      7 to 11:      swap type
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Chunyan Zhang July 17, 2024, 1:15 a.m. UTC | #4
On Tue, 16 Jul 2024 at 21:00, Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Hi Chunyan,
>
> On 16/07/2024 04:16, Chunyan Zhang wrote:
> > Hi Alex,
> >
> > On Mon, 15 Jul 2024 at 19:21, Alexandre Ghiti <alex@ghiti.fr> wrote:
> >> Hi Chunyan,
> >>
> >> On 10/07/2024 05:30, Chunyan Zhang wrote:
> >>> The PTE bit (9) is reserved for software, so we can use it for
> >>> soft-dirty tracking. This patch adds its standard handlers for
> >>> PTE, PMD, and swap entry.
> >>
> >> Unfortunately, ZONE_DEVICE has just used this last bit and should be
> >> merged in 6.11.
> > Yes, I read the patch just now.
> >
> >> I'm currently discussing internally how we can get 2 other PTE bits from
> >> RVI in order to have the same number of available bits as x86 and arm64.
> > Yes I noticed that PTE bits reserved for software are too limited on RISC-V.
> >
> > Besides softdirty, we probably can support uffd write-protect on
> > RISC-V if we will have two PTE bits for use.
>
>
> Indeed, softdirty and uffd-wp will use two PTE bits.
>
>
> >
> >> I guess that for now, if we really have a usecase for softdirty (and I
> >> think we do with CRIU), we'll have to make ZONE_DEVICE and soft-dirty
> >> mutually exclusive.
> > Yes, I also learned that CRIU uses soft-dirty.
> >
> >>> To add swap PTE soft-dirty tracking, we borrow bit (4) which is
> >>> available for swap PTEs on RISC-V systems.
> >>>
> >>> This patch has been tested with the kselftest mm suite in which
> >>> soft-dirty and madv_populate run and pass, and no regressions
> >>> are observed in any of the other tests.
> >>
> >> Did you give CRIU a try?
> > I haven't tried CRIU, actually I found soft-dirty was missing on
> > RISC-V by the way of running mm selftest cases.
>
>
> Since CRIU is the main user (?) of softdirty, it would be really nice if
> you can test it :)

Sure, and will keep you updated with the progress.

> >
> > I can cook a new patch to implement soft-dirty and ZONE_DEVICE share
> > the PTE bit(9), and make both features mutually exclusive if this
> > solution is accepted.
>
>
> I agree with this solution, let's implement both softdirty and uffd-wp
> by sharing the last PTE bit that ZONE_DEVICE stole. At least it will
> allow people to play with them.

OK, then I will do that next.

> Do you intend to work on uffd-wp? This is on my todo list, so up to you.

Yes, this is not hard for me, let me take it.

Thanks for your review,
Chunyan

> > Or not to add soft-dirty until we have more other PTE bits that can be
> > used for software.
> >
> > I'm open to listen to suggestions.
> >
> > Thanks,
> > Chunyan
>
>
> Thanks,
>
> Alex
>
>
> >
> >> Thanks,
> >>
> >> Alex
> >>
> >>
> >>> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> >>> ---
> >>>    arch/riscv/Kconfig                    |  1 +
> >>>    arch/riscv/include/asm/pgtable-bits.h | 13 ++++++
> >>>    arch/riscv/include/asm/pgtable.h      | 65 ++++++++++++++++++++++++++-
> >>>    3 files changed, 78 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> >>> index b94176e25be1..2e3ad2925a6b 100644
> >>> --- a/arch/riscv/Kconfig
> >>> +++ b/arch/riscv/Kconfig
> >>> @@ -118,6 +118,7 @@ config RISCV
> >>>        select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
> >>>        select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> >>>        select HAVE_ARCH_SECCOMP_FILTER
> >>> +     select HAVE_ARCH_SOFT_DIRTY
> >>>        select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> >>>        select HAVE_ARCH_TRACEHOOK
> >>>        select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
> >>> diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
> >>> index 179bd4afece4..bab48f5fd1e2 100644
> >>> --- a/arch/riscv/include/asm/pgtable-bits.h
> >>> +++ b/arch/riscv/include/asm/pgtable-bits.h
> >>> @@ -19,6 +19,19 @@
> >>>    #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
> >>>
> >>>    #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
> >>> +
> >>> +#ifdef CONFIG_MEM_SOFT_DIRTY
> >>> +#define _PAGE_SOFT_DIRTY     (1 << 9)    /* RSW: 0x2 for software dirty tracking */
> >>> +/*
> >>> + * BIT 4 is not involved into swap entry computation, so we
> >>> + * can borrow it for swap page soft-dirty tracking.
> >>> + */
> >>> +#define _PAGE_SWP_SOFT_DIRTY _PAGE_USER
> >>> +#else
> >>> +#define _PAGE_SOFT_DIRTY     0
> >>> +#define _PAGE_SWP_SOFT_DIRTY 0
> >>> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> >>> +
> >>>    #define _PAGE_TABLE     _PAGE_PRESENT
> >>>
> >>>    /*
> >>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> >>> index aad8b8ca51f1..46f512f52580 100644
> >>> --- a/arch/riscv/include/asm/pgtable.h
> >>> +++ b/arch/riscv/include/asm/pgtable.h
> >>> @@ -408,7 +408,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
> >>>
> >>>    static inline pte_t pte_mkdirty(pte_t pte)
> >>>    {
> >>> -     return __pte(pte_val(pte) | _PAGE_DIRTY);
> >>> +     return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
> >>>    }
> >>>
> >>>    static inline pte_t pte_mkclean(pte_t pte)
> >>> @@ -436,6 +436,36 @@ static inline pte_t pte_mkhuge(pte_t pte)
> >>>        return pte;
> >>>    }
> >>>
> >>> +static inline int pte_soft_dirty(pte_t pte)
> >>> +{
> >>> +     return pte_val(pte) & _PAGE_SOFT_DIRTY;
> >>> +}
> >>> +
> >>> +static inline pte_t pte_mksoft_dirty(pte_t pte)
> >>> +{
> >>> +     return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
> >>> +}
> >>> +
> >>> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> >>> +{
> >>> +     return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
> >>> +}
> >>> +
> >>> +static inline int pte_swp_soft_dirty(pte_t pte)
> >>> +{
> >>> +     return pte_val(pte) & _PAGE_SWP_SOFT_DIRTY;
> >>> +}
> >>> +
> >>> +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> >>> +{
> >>> +     return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
> >>> +}
> >>> +
> >>> +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> >>> +{
> >>> +     return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
> >>> +}
> >>> +
> >>>    #ifdef CONFIG_RISCV_ISA_SVNAPOT
> >>>    #define pte_leaf_size(pte)  (pte_napot(pte) ?                               \
> >>>                                        napot_cont_size(napot_cont_order(pte)) :\
> >>> @@ -721,6 +751,38 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
> >>>        return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
> >>>    }
> >>>
> >>> +static inline int pmd_soft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_soft_dirty(pmd_pte(pmd));
> >>> +}
> >>> +
> >>> +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> >>> +}
> >>> +
> >>> +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> >>> +}
> >>> +
> >>> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> >>> +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_swp_soft_dirty(pmd_pte(pmd));
> >>> +}
> >>> +
> >>> +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
> >>> +}
> >>> +
> >>> +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> >>> +{
> >>> +     return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
> >>> +}
> >>> +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> >>> +
> >>>    static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> >>>                                pmd_t *pmdp, pmd_t pmd)
> >>>    {
> >>> @@ -811,6 +873,7 @@ extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> >>>     * Format of swap PTE:
> >>>     *  bit            0:       _PAGE_PRESENT (zero)
> >>>     *  bit       1 to 3:       _PAGE_LEAF (zero)
> >>> + *   bit            4:       _PAGE_SWP_SOFT_DIRTY
> >>>     *  bit            5:       _PAGE_PROT_NONE (zero)
> >>>     *  bit            6:       exclusive marker
> >>>     *  bits      7 to 11:      swap type
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b94176e25be1..2e3ad2925a6b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -118,6 +118,7 @@  config RISCV
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index 179bd4afece4..bab48f5fd1e2 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,6 +19,19 @@ 
 #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
 
 #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY	(1 << 9)    /* RSW: 0x2 for software dirty tracking */
+/*
+ * Bit 4 is not involved in swap entry computation, so we
+ * can borrow it for swap page soft-dirty tracking.
+ */
+#define _PAGE_SWP_SOFT_DIRTY	_PAGE_USER
+#else
+#define _PAGE_SOFT_DIRTY	0
+#define _PAGE_SWP_SOFT_DIRTY	0
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
 #define _PAGE_TABLE     _PAGE_PRESENT
 
 /*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index aad8b8ca51f1..46f512f52580 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -408,7 +408,7 @@  static inline pte_t pte_mkwrite_novma(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return __pte(pte_val(pte) | _PAGE_DIRTY);
+	return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pte_t pte_mkclean(pte_t pte)
@@ -436,6 +436,36 @@  static inline pte_t pte_mkhuge(pte_t pte)
 	return pte;
 }
 
+static inline int pte_soft_dirty(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
+}
+
+static inline int pte_swp_soft_dirty(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
+}
+
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
 #define pte_leaf_size(pte)	(pte_napot(pte) ?				\
 					napot_cont_size(napot_cont_order(pte)) :\
@@ -721,6 +751,38 @@  static inline pmd_t pmd_mkdirty(pmd_t pmd)
 	return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 }
 
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return pte_soft_dirty(pmd_pte(pmd));
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+	return pte_swp_soft_dirty(pmd_pte(pmd));
+}
+
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
+}
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 				pmd_t *pmdp, pmd_t pmd)
 {
@@ -811,6 +873,7 @@  extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
  * Format of swap PTE:
  *	bit            0:	_PAGE_PRESENT (zero)
  *	bit       1 to 3:       _PAGE_LEAF (zero)
+ *	bit            4:	_PAGE_SWP_SOFT_DIRTY
  *	bit            5:	_PAGE_PROT_NONE (zero)
  *	bit            6:	exclusive marker
  *	bits      7 to 11:	swap type