
[RFC] mm: introduce alloc hook to apply PTE_CONT

Message ID 1637658760-5813-1-git-send-email-huangzhaoyang@gmail.com (mailing list archive)
State New
Series [RFC] mm: introduce alloc hook to apply PTE_CONT

Commit Message

Zhaoyang Huang Nov. 23, 2021, 9:12 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Since PTE_CONT is not applied when rodata_full is enabled on ARM64,
introduce a hook function to apply PTE_CONT to suitable page blocks.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 arch/arm64/include/asm/page.h |  5 +++++
 arch/arm64/mm/pageattr.c      | 45 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
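
Background: the arch_alloc_page()/arch_free_page() hooks used by this
patch are invoked by the generic page allocator on allocation and free,
but only when the architecture opts in by defining
HAVE_ARCH_ALLOC_PAGE/HAVE_ARCH_FREE_PAGE. A paraphrase (not a verbatim
quote) of the fallback stubs in include/linux/gfp.h:

#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif

#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif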

Comments

Ard Biesheuvel Nov. 23, 2021, 9:57 a.m. UTC | #1
On Tue, 23 Nov 2021 at 10:13, Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> Since there is no PTE_CONT when rodata_full in ARM64, introducing a
> hook function to apply PTE_CONT on the proper page blocks.
>

Given the discussion around your previous patch, I would expect a
meticulous explanation here why it is guaranteed to be safe to
manipulate the PTE_CONT attribute like this, and how the proposed
logic is correct for all supported page sizes.

Without using an intermediate invalid mapping for the entire range,
this is never going to work reliably (this is the break-before-make
requirement). And given that marking the entire block invalid will
create intermediate states that are not permitted (a valid PTE_CONT
mapping and an invalid ~PTE_CONT mapping covering the same VA), the
only way to apply changes like these is to temporarily switch all CPUs
to a different translation via TTBR1. And this is not going to happen.
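
For reference, break-before-make on a single, independent kernel PTE is
roughly the sequence sketched below (illustrative only, using the generic
kernel helpers; as argued above, doing this one entry at a time is still
not sufficient for a PTE_CONT group, because the group passes through
states where its entries disagree):

#include <linux/pgtable.h>
#include <asm/tlbflush.h>

/* Illustrative sketch, not taken from the patch. */
static void bbm_update_kernel_pte(pte_t *ptep, pte_t new_pte,
                                  unsigned long addr)
{
        /* Break: replace the live entry with an invalid one. */
        set_pte(ptep, __pte(0));

        /* Invalidate any cached copy of the old entry on all CPUs. */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

        /* Make: install the new entry (e.g. with PTE_CONT set or cleared). */
        set_pte(ptep, new_pte);
}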

Also, you never replied to my question regarding the use case and the
performance gain.

In summary, NAK to this patch or any of the previous ones regarding
PTE_CONT. If you do insist on pursuing this further, please provide an
elaborate and rock solid explanation why your approach is 100% valid
and correct (for all page sizes). And make sure you include an
explanation how your changes comply with the architectural
break-before-make requirements around PTE_CONT attributes.



> ---
>  arch/arm64/include/asm/page.h |  5 +++++
>  arch/arm64/mm/pageattr.c      | 45 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 50 insertions(+)
>
> diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> index f98c91b..53cdd09 100644
> --- a/arch/arm64/include/asm/page.h
> +++ b/arch/arm64/include/asm/page.h
> @@ -46,6 +46,11 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
>
>  #include <asm/memory.h>
>
> +#define HAVE_ARCH_ALLOC_PAGE
> +#define HAVE_ARCH_FREE_PAGE
> +
> +extern void arch_alloc_page(struct page *page, int order);
> +extern void arch_free_page(struct page *page, int order);
>  #endif /* !__ASSEMBLY__ */
>
>  #define VM_DATA_DEFAULT_FLAGS  (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index a3bacd7..815a06d 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -239,3 +239,48 @@ bool kernel_page_present(struct page *page)
>         ptep = pte_offset_kernel(pmdp, addr);
>         return pte_valid(READ_ONCE(*ptep));
>  }
> +
> +void arch_alloc_page(struct page *page, int order)
> +{
> +       unsigned long addr;
> +       unsigned long cont_pte_low_bound;
> +
> +       if (!rodata_full)
> +               return;
> +
> +       addr = (u64)page_address(page);
> +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> +               order -= 4;
> +               do {
> +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> +                       __change_memory_common(cont_pte_low_bound,
> +                                       (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
> +                       addr = (u64)page_address(page);
> +                       page += 4;
> +                       order--;
> +               }while (order >= 0);
> +       }
> +}
> +
> +void arch_free_page(struct page *page, int order)
> +{
> +       unsigned long addr;
> +       unsigned long cont_pte_low_bound;
> +
> +       if (!rodata_full)
> +               return;
> +
> +       addr = (u64)page_address(page);
> +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> +               order -= 4;
> +               do {
> +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> +                       __change_memory_common(cont_pte_low_bound,
> +                                       (~CONT_PTE_MASK + 1), __pgprot(0), __pgprot(PTE_CONT));
> +                       addr = (u64)page_address(page);
> +                       page += 4;
> +                       order--;
> +               }while (order >= 0);
> +       }
> +}
> +
> --
> 1.9.1
>
Zhaoyang Huang Nov. 24, 2021, 8:08 a.m. UTC | #2
On Tue, Nov 23, 2021 at 5:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Tue, 23 Nov 2021 at 10:13, Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> >
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > Since there is no PTE_CONT when rodata_full in ARM64, introducing a
> > hook function to apply PTE_CONT on the proper page blocks.
> >
>
> Given the discussion around your previous patch, I would expect a
> meticulous explanation here why it is guaranteed to be safe to
> manipulate the PTE_CONT attribute like this, and how the proposed
> logic is correct for all supported page sizes.
>
> Without using an intermediate invalid mapping for the entire range,
> this is never going to work reliably (this is the break-before-make
> requirement). And given that marking the entire block invalid will
> create intermediate states that are not permitted (a valid PTE_CONT
> mapping and an invalid ~PTE_CONT mapping covering the same VA), the
> only way to apply changes like these is to temporarily switch all CPUs
> to a different translation via TTBR1. And this is not going to happen.
As there is no safe way to modify PTE_CONT on a live mapping, please
forget all of the previous patches except the current one.
>
> Also, you never replied to my question regarding the use case and the
> performance gain.
In our Android-based system, the multimedia-related use cases, which
allocate high-order page blocks quite a lot, suffer mostly from the
small PTE granularity. The performance gap is even visible.
>
> In summary, NAK to this patch or any of the previous ones regarding
> PTE_CONT. If you do insist on pursuing this further, please provide an
> elaborate and rock solid explanation why your approach is 100% valid
> and correct (for all page sizes). And make sure you include an
> explanation how your changes comply with the architectural
> break-before-make requirements around PTE_CONT attributes.
IMHO, it is safe to modify the page block's PTEs under
*arch_alloc/free_pages*, as no one else is aware of those pages at that
point. Furthermore, I do think TLB flushes and barriers are needed for
synchronization. With regard to the page-size issue, I think replacing
the hard-coded constants with the CONT_PTE_* macros could absorb the
difference and make it correct.

- if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
-     order -= 4;
+ if ((order >= CONT_PTE_SHIFT) && (addr & ~CONT_PTE_MASK) == 0) {
+    order -= CONT_PTE_SHIFT;
    do {
        cont_pte_low_bound = addr & CONT_PTE_MASK;
    __change_memory_common(cont_pte_low_bound,
    (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
    addr = (u64)page_address(page);
-     page += 4;
+     page += CONT_PTES;
    order--;
}while (order >= 0);
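
For illustration, a page-size-independent form of that size/alignment
check, written as a hypothetical helper rather than as a patch hunk
(CONT_PTE_SIZE and IS_ALIGNED are existing kernel macros):

#include <linux/mm.h>
#include <linux/pgtable.h>

/*
 * Hypothetical helper: does the block at 'page' with this 'order' start
 * on a contiguous-group boundary and cover at least one whole group?
 * Only CONT_PTE_* macros are used, so the check is the same for 4K, 16K
 * and 64K base pages.
 */
static bool spans_cont_pte_group(struct page *page, int order)
{
        unsigned long addr = (unsigned long)page_address(page);
        unsigned long size = PAGE_SIZE << order;

        return IS_ALIGNED(addr, CONT_PTE_SIZE) && size >= CONT_PTE_SIZE;
}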
>
>
>
> > ---
> >  arch/arm64/include/asm/page.h |  5 +++++
> >  arch/arm64/mm/pageattr.c      | 45 +++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 50 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> > index f98c91b..53cdd09 100644
> > --- a/arch/arm64/include/asm/page.h
> > +++ b/arch/arm64/include/asm/page.h
> > @@ -46,6 +46,11 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
> >
> >  #include <asm/memory.h>
> >
> > +#define HAVE_ARCH_ALLOC_PAGE
> > +#define HAVE_ARCH_FREE_PAGE
> > +
> > +extern void arch_alloc_page(struct page *page, int order);
> > +extern void arch_free_page(struct page *page, int order);
> >  #endif /* !__ASSEMBLY__ */
> >
> >  #define VM_DATA_DEFAULT_FLAGS  (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
> > diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> > index a3bacd7..815a06d 100644
> > --- a/arch/arm64/mm/pageattr.c
> > +++ b/arch/arm64/mm/pageattr.c
> > @@ -239,3 +239,48 @@ bool kernel_page_present(struct page *page)
> >         ptep = pte_offset_kernel(pmdp, addr);
> >         return pte_valid(READ_ONCE(*ptep));
> >  }
> > +
> > +void arch_alloc_page(struct page *page, int order)
> > +{
> > +       unsigned long addr;
> > +       unsigned long cont_pte_low_bound;
> > +
> > +       if (!rodata_full)
> > +               return;
> > +
> > +       addr = (u64)page_address(page);
> > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > +               order -= 4;
> > +               do {
> > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > +                       __change_memory_common(cont_pte_low_bound,
> > +                                       (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
> > +                       addr = (u64)page_address(page);
> > +                       page += 4;
> > +                       order--;
> > +               }while (order >= 0);
> > +       }
> > +}
> > +
> > +void arch_free_page(struct page *page, int order)
> > +{
> > +       unsigned long addr;
> > +       unsigned long cont_pte_low_bound;
> > +
> > +       if (!rodata_full)
> > +               return;
> > +
> > +       addr = (u64)page_address(page);
> > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > +               order -= 4;
> > +               do {
> > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > +                       __change_memory_common(cont_pte_low_bound,
> > +                                       (~CONT_PTE_MASK + 1), __pgprot(0), __pgprot(PTE_CONT));
> > +                       addr = (u64)page_address(page);
> > +                       page += 4;
> > +                       order--;
> > +               }while (order >= 0);
> > +       }
> > +}
> > +
> > --
> > 1.9.1
> >
Ard Biesheuvel Nov. 24, 2021, 9:23 a.m. UTC | #3
On Wed, 24 Nov 2021 at 09:08, Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Tue, Nov 23, 2021 at 5:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Tue, 23 Nov 2021 at 10:13, Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > >
> > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > >
> > > Since there is no PTE_CONT when rodata_full in ARM64, introducing a
> > > hook function to apply PTE_CONT on the proper page blocks.
> > >
> >
> > Given the discussion around your previous patch, I would expect a
> > meticulous explanation here why it is guaranteed to be safe to
> > manipulate the PTE_CONT attribute like this, and how the proposed
> > logic is correct for all supported page sizes.
> >
> > Without using an intermediate invalid mapping for the entire range,
> > this is never going to work reliably (this is the break-before-make
> > requirement). And given that marking the entire block invalid will
> > create intermediate states that are not permitted (a valid PTE_CONT
> > mapping and an invalid ~PTE_CONT mapping covering the same VA), the
> > only way to apply changes like these is to temporarily switch all CPUs
> > to a different translation via TTBR1. And this is not going to happen.
> As there is no safe way to modify PTE_CONT on a live mapping, please
> forget all previous patches except current one.

OK

> >
> > Also, you never replied to my question regarding the use case and the
> > performance gain.
> In our android based system, the multimedia related cases suffers from
> small pte granularity mostly which use high order page blocks quite a
> lot. The performance gap even be visible.

OK, good to know.

> >
> > In summary, NAK to this patch or any of the previous ones regarding
> > PTE_CONT. If you do insist on pursuing this further, please provide an
> > elaborate and rock solid explanation why your approach is 100% valid
> > and correct (for all page sizes). And make sure you include an
> > explanation how your changes comply with the architectural
> > break-before-make requirements around PTE_CONT attributes.
> IMHO, It is safe to modify the page block's pte undering
> *arch_alloc/free_pages* as there is no one else aware of it.

Whether the software is 'aware' or not is irrelevant. Speculative
accesses could be made at any time, and these will trigger a page
table walk if no cached TLB entries exist for the region. If more than
one cached TLB entry exists (which would be the case if an adjacent
entry has the PTE_CONT attribute but not the entry itself), you will
hit a TLB conflict abort.

I guess the behavior under invalid mappings might be subtly different,
since those should not be cached in a TLB, but the fundamental problem
remains: no conflicting entries should exist at any time, and PTE_CONT
must be either set or cleared on all entries in the same contiguous
group. These are contradictory requirements for live translations, so
the only way around it is to uninstall the translation from all CPUs,
perform the update, and reinstall it.
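
To make that invariant concrete, a hypothetical debug helper (not
existing kernel code) that checks one naturally aligned contiguous group
could look like this, with 'ptep' pointing at the first entry of the
group:

#include <linux/pgtable.h>

static bool cont_group_is_consistent(pte_t *ptep)
{
        pte_t first = READ_ONCE(ptep[0]);
        int i;

        /* All CONT_PTES entries of a group live in the same PTE table. */
        for (i = 1; i < CONT_PTES; i++) {
                pte_t pte = READ_ONCE(ptep[i]);

                if (pte_valid(pte) != pte_valid(first) ||
                    (pte_val(pte) & PTE_CONT) != (pte_val(first) & PTE_CONT))
                        return false;   /* forbidden intermediate state */
        }
        return true;
}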

> Furthermore, I do think tlbflush and barriers are needed for
> synchronization.

Careful [repeated] TLB maintenance may reduce the size of the window
where the TLB conflicts may occur, but it does not solve the issue.

> With regards to page sizes issue, I think replacing
> the hard code const value to CONT_PTE_XXX could wrap the difference
> and make it correct.
>
> - if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> -     order -= 4;
> + if ((order >= CONT_PTE_SHIFT) && (addr & ~CONT_PTE_MASK) == 0) {
> +    order -= CONT_PTE_SHIFT;
>     do {
>         cont_pte_low_bound = addr & CONT_PTE_MASK;
>     __change_memory_common(cont_pte_low_bound,
>     (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
>     addr = (u64)page_address(page);
> -     page += 4;
> +     page += CONT_PTES;
>     order--;
> }while (order >= 0);
> >
> >
> >
> > > ---
> > >  arch/arm64/include/asm/page.h |  5 +++++
> > >  arch/arm64/mm/pageattr.c      | 45 +++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 50 insertions(+)
> > >
> > > diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> > > index f98c91b..53cdd09 100644
> > > --- a/arch/arm64/include/asm/page.h
> > > +++ b/arch/arm64/include/asm/page.h
> > > @@ -46,6 +46,11 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
> > >
> > >  #include <asm/memory.h>
> > >
> > > +#define HAVE_ARCH_ALLOC_PAGE
> > > +#define HAVE_ARCH_FREE_PAGE
> > > +
> > > +extern void arch_alloc_page(struct page *page, int order);
> > > +extern void arch_free_page(struct page *page, int order);
> > >  #endif /* !__ASSEMBLY__ */
> > >
> > >  #define VM_DATA_DEFAULT_FLAGS  (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
> > > diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> > > index a3bacd7..815a06d 100644
> > > --- a/arch/arm64/mm/pageattr.c
> > > +++ b/arch/arm64/mm/pageattr.c
> > > @@ -239,3 +239,48 @@ bool kernel_page_present(struct page *page)
> > >         ptep = pte_offset_kernel(pmdp, addr);
> > >         return pte_valid(READ_ONCE(*ptep));
> > >  }
> > > +
> > > +void arch_alloc_page(struct page *page, int order)
> > > +{
> > > +       unsigned long addr;
> > > +       unsigned long cont_pte_low_bound;
> > > +
> > > +       if (!rodata_full)
> > > +               return;
> > > +
> > > +       addr = (u64)page_address(page);
> > > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > > +               order -= 4;
> > > +               do {
> > > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > > +                       __change_memory_common(cont_pte_low_bound,
> > > +                                       (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
> > > +                       addr = (u64)page_address(page);
> > > +                       page += 4;
> > > +                       order--;
> > > +               }while (order >= 0);
> > > +       }
> > > +}
> > > +
> > > +void arch_free_page(struct page *page, int order)
> > > +{
> > > +       unsigned long addr;
> > > +       unsigned long cont_pte_low_bound;
> > > +
> > > +       if (!rodata_full)
> > > +               return;
> > > +
> > > +       addr = (u64)page_address(page);
> > > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > > +               order -= 4;
> > > +               do {
> > > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > > +                       __change_memory_common(cont_pte_low_bound,
> > > +                                       (~CONT_PTE_MASK + 1), __pgprot(0), __pgprot(PTE_CONT));
> > > +                       addr = (u64)page_address(page);
> > > +                       page += 4;
> > > +                       order--;
> > > +               }while (order >= 0);
> > > +       }
> > > +}
> > > +
> > > --
> > > 1.9.1
> > >
Zhaoyang Huang Nov. 24, 2021, noon UTC | #4
On Wed, Nov 24, 2021 at 5:23 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Wed, 24 Nov 2021 at 09:08, Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > On Tue, Nov 23, 2021 at 5:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > On Tue, 23 Nov 2021 at 10:13, Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > >
> > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > >
> > > > Since there is no PTE_CONT when rodata_full in ARM64, introducing a
> > > > hook function to apply PTE_CONT on the proper page blocks.
> > > >
> > >
> > > Given the discussion around your previous patch, I would expect a
> > > meticulous explanation here why it is guaranteed to be safe to
> > > manipulate the PTE_CONT attribute like this, and how the proposed
> > > logic is correct for all supported page sizes.
> > >
> > > Without using an intermediate invalid mapping for the entire range,
> > > this is never going to work reliably (this is the break-before-make
> > > requirement). And given that marking the entire block invalid will
> > > create intermediate states that are not permitted (a valid PTE_CONT
> > > mapping and an invalid ~PTE_CONT mapping covering the same VA), the
> > > only way to apply changes like these is to temporarily switch all CPUs
> > > to a different translation via TTBR1. And this is not going to happen.
> > As there is no safe way to modify PTE_CONT on a live mapping, please
> > forget all previous patches except current one.
>
> OK
>
> > >
> > > Also, you never replied to my question regarding the use case and the
> > > performance gain.
> > In our android based system, the multimedia related cases suffers from
> > small pte granularity mostly which use high order page blocks quite a
> > lot. The performance gap even be visible.
>
> OK, good to know.
>
> > >
> > > In summary, NAK to this patch or any of the previous ones regarding
> > > PTE_CONT. If you do insist on pursuing this further, please provide an
> > > elaborate and rock solid explanation why your approach is 100% valid
> > > and correct (for all page sizes). And make sure you include an
> > > explanation how your changes comply with the architectural
> > > break-before-make requirements around PTE_CONT attributes.
> > IMHO, It is safe to modify the page block's pte undering
> > *arch_alloc/free_pages* as there is no one else aware of it.
>
> Whether the software is 'aware' or not is irrelevant. Speculative
> accesses could be made at any time, and these will trigger a page
> table walk if no cached TLB entries exist for the region. If more than
> one cached TLB entry exists (which would be the case if an adjacent
> entry has the PTE_CONT attribute but not the entry itself), you will
> hit a TLB conflict abort.
Could a speculative load racing with a PTE being changed from VALID to
INVALID be a risk?
>
> I guess the behavior under invalid mappings might be subtly different,
> since those should not be cached in a TLB, but the fundamental problem
> remains: no conflicting entries should exist at any time, and PTE_CONT
> must be either set or cleared on all entries in the same contiguous
> group. These are contradictory requirements for live translations, so
> the only way around it is to uninstall the translation from all CPUs,
> perform the update, and reinstall it.
>
> > Furthermore, I do think tlbflush and barriers are needed for
> > synchronization.
>
> Careful [repeated] TLB maintenance may reduce the size of the window
> where the TLB conflicts may occur, but it does not solve the issue.
>
> > With regards to page sizes issue, I think replacing
> > the hard code const value to CONT_PTE_XXX could wrap the difference
> > and make it correct.
> >
> > - if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > -     order -= 4;
> > + if ((order >= CONT_PTE_SHIFT) && (addr & ~CONT_PTE_MASK) == 0) {
> > +    order -= CONT_PTE_SHIFT;
> >     do {
> >         cont_pte_low_bound = addr & CONT_PTE_MASK;
> >     __change_memory_common(cont_pte_low_bound,
> >     (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
> >     addr = (u64)page_address(page);
> > -     page += 4;
> > +     page += CONT_PTES;
> >     order--;
> > }while (order >= 0);
> > >
> > >
> > >
> > > > ---
> > > >  arch/arm64/include/asm/page.h |  5 +++++
> > > >  arch/arm64/mm/pageattr.c      | 45 +++++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 50 insertions(+)
> > > >
> > > > diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> > > > index f98c91b..53cdd09 100644
> > > > --- a/arch/arm64/include/asm/page.h
> > > > +++ b/arch/arm64/include/asm/page.h
> > > > @@ -46,6 +46,11 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
> > > >
> > > >  #include <asm/memory.h>
> > > >
> > > > +#define HAVE_ARCH_ALLOC_PAGE
> > > > +#define HAVE_ARCH_FREE_PAGE
> > > > +
> > > > +extern void arch_alloc_page(struct page *page, int order);
> > > > +extern void arch_free_page(struct page *page, int order);
> > > >  #endif /* !__ASSEMBLY__ */
> > > >
> > > >  #define VM_DATA_DEFAULT_FLAGS  (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
> > > > diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> > > > index a3bacd7..815a06d 100644
> > > > --- a/arch/arm64/mm/pageattr.c
> > > > +++ b/arch/arm64/mm/pageattr.c
> > > > @@ -239,3 +239,48 @@ bool kernel_page_present(struct page *page)
> > > >         ptep = pte_offset_kernel(pmdp, addr);
> > > >         return pte_valid(READ_ONCE(*ptep));
> > > >  }
> > > > +
> > > > +void arch_alloc_page(struct page *page, int order)
> > > > +{
> > > > +       unsigned long addr;
> > > > +       unsigned long cont_pte_low_bound;
> > > > +
> > > > +       if (!rodata_full)
> > > > +               return;
> > > > +
> > > > +       addr = (u64)page_address(page);
> > > > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > > > +               order -= 4;
> > > > +               do {
> > > > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > > > +                       __change_memory_common(cont_pte_low_bound,
> > > > +                                       (~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
> > > > +                       addr = (u64)page_address(page);
> > > > +                       page += 4;
> > > > +                       order--;
> > > > +               }while (order >= 0);
> > > > +       }
> > > > +}
> > > > +
> > > > +void arch_free_page(struct page *page, int order)
> > > > +{
> > > > +       unsigned long addr;
> > > > +       unsigned long cont_pte_low_bound;
> > > > +
> > > > +       if (!rodata_full)
> > > > +               return;
> > > > +
> > > > +       addr = (u64)page_address(page);
> > > > +       if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
> > > > +               order -= 4;
> > > > +               do {
> > > > +                       cont_pte_low_bound = addr & CONT_PTE_MASK;
> > > > +                       __change_memory_common(cont_pte_low_bound,
> > > > +                                       (~CONT_PTE_MASK + 1), __pgprot(0), __pgprot(PTE_CONT));
> > > > +                       addr = (u64)page_address(page);
> > > > +                       page += 4;
> > > > +                       order--;
> > > > +               }while (order >= 0);
> > > > +       }
> > > > +}
> > > > +
> > > > --
> > > > 1.9.1
> > > >
Ard Biesheuvel Nov. 24, 2021, 1:54 p.m. UTC | #5
On Wed, 24 Nov 2021 at 13:01, Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Wed, Nov 24, 2021 at 5:23 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Wed, 24 Nov 2021 at 09:08, Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> > >
> > > On Tue, Nov 23, 2021 at 5:58 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >
> > > > On Tue, 23 Nov 2021 at 10:13, Huangzhaoyang <huangzhaoyang@gmail.com> wrote:
> > > > >
> > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > >
> > > > > Since there is no PTE_CONT when rodata_full in ARM64, introducing a
> > > > > hook function to apply PTE_CONT on the proper page blocks.
> > > > >
> > > >
> > > > Given the discussion around your previous patch, I would expect a
> > > > meticulous explanation here why it is guaranteed to be safe to
> > > > manipulate the PTE_CONT attribute like this, and how the proposed
> > > > logic is correct for all supported page sizes.
> > > >
> > > > Without using an intermediate invalid mapping for the entire range,
> > > > this is never going to work reliably (this is the break-before-make
> > > > requirement). And given that marking the entire block invalid will
> > > > create intermediate states that are not permitted (a valid PTE_CONT
> > > > mapping and an invalid ~PTE_CONT mapping covering the same VA), the
> > > > only way to apply changes like these is to temporarily switch all CPUs
> > > > to a different translation via TTBR1. And this is not going to happen.
> > > As there is no safe way to modify PTE_CONT on a live mapping, please
> > > forget all previous patches except current one.
> >
> > OK
> >
> > > >
> > > > Also, you never replied to my question regarding the use case and the
> > > > performance gain.
> > > In our android based system, the multimedia related cases suffers from
> > > small pte granularity mostly which use high order page blocks quite a
> > > lot. The performance gap even be visible.
> >
> > OK, good to know.
> >
> > > >
> > > > In summary, NAK to this patch or any of the previous ones regarding
> > > > PTE_CONT. If you do insist on pursuing this further, please provide an
> > > > elaborate and rock solid explanation why your approach is 100% valid
> > > > and correct (for all page sizes). And make sure you include an
> > > > explanation how your changes comply with the architectural
> > > > break-before-make requirements around PTE_CONT attributes.
> > > IMHO, It is safe to modify the page block's pte undering
> > > *arch_alloc/free_pages* as there is no one else aware of it.
> >
> > Whether the software is 'aware' or not is irrelevant. Speculative
> > accesses could be made at any time, and these will trigger a page
> > table walk if no cached TLB entries exist for the region. If more than
> > one cached TLB entry exists (which would be the case if an adjacent
> > entry has the PTE_CONT attribute but not the entry itself), you will
> > hit a TLB conflict abort.
> Could it be a risk that a speculative load racing with setting pte
> from VALID to INVALID?

Theorizing about what might go wrong is not very useful. The
architecture simply does not permit what you are proposing here, and
this is reason enough not to do it.

The ARM ARM says the following [DDI0487G.a D5.2 "Misprogramming of the
Contiguous bit"]

"""
If one or more of the following errors is made in programming the
translation tables, the TLB might contain overlapping entries:
- One or more of the contiguous translation table entries does not
have the Contiguous bit set to 1.
- One or more of the contiguous translation table entries holds an
output address that is not consistent with all of the entries pointing
to the same aligned contiguous address range.
- The attributes and permissions of the contiguous entries are not all the same.

Such misprogramming of the translation tables means the output
address, memory permissions, or attributes for a lookup might be
corrupted, and might be equal to values that are not consistent with
any of the programmed translation table values.

In some implementations, such misprogramming might also give rise to a
TLB Conflict abort.
"""

This means that none of the entries in the contiguous group are
reliable when only one entry in the group deviates, and in addition to
TLB conflict aborts, you may trigger all kinds of corruption due to
the output address being bogus.
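
For contrast, a sketch of what a correctly programmed group looks like
when it is created from scratch (entries not yet live, so no
break-before-make is involved). This is illustrative only, not existing
kernel code; 'ptep' points at the first entry of the group and 'pa' is
the start of a CONT_PTE_SIZE-aligned physical block:

#include <linux/pgtable.h>
#include <linux/pfn.h>

static void init_cont_pte_group(pte_t *ptep, phys_addr_t pa, pgprot_t prot)
{
        pgprot_t cont_prot = __pgprot(pgprot_val(prot) | PTE_CONT);
        int i;

        /*
         * Same attributes, PTE_CONT set, and consecutive output addresses
         * for every entry in the group, as the ARM ARM requires.
         */
        for (i = 0; i < CONT_PTES; i++)
                set_pte(ptep + i,
                        pfn_pte(PHYS_PFN(pa + i * PAGE_SIZE), cont_prot));
}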

So NAK again.

Patch

diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index f98c91b..53cdd09 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -46,6 +46,11 @@  struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
 
 #include <asm/memory.h>
 
+#define HAVE_ARCH_ALLOC_PAGE
+#define HAVE_ARCH_FREE_PAGE
+
+extern void arch_alloc_page(struct page *page, int order);
+extern void arch_free_page(struct page *page, int order);
 #endif /* !__ASSEMBLY__ */
 
 #define VM_DATA_DEFAULT_FLAGS	(VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index a3bacd7..815a06d 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -239,3 +239,48 @@  bool kernel_page_present(struct page *page)
 	ptep = pte_offset_kernel(pmdp, addr);
 	return pte_valid(READ_ONCE(*ptep));
 }
+
+void arch_alloc_page(struct page *page, int order)
+{
+	unsigned long addr;
+	unsigned long cont_pte_low_bound;
+
+	if (!rodata_full)
+		return;
+
+	addr = (u64)page_address(page);
+	if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
+		order -= 4;
+		do {
+			cont_pte_low_bound = addr & CONT_PTE_MASK;
+			__change_memory_common(cont_pte_low_bound,
+					(~CONT_PTE_MASK + 1), __pgprot(PTE_CONT), __pgprot(0));
+			addr = (u64)page_address(page);
+			page += 4;
+			order--;
+		}while (order >= 0);
+	}
+}
+
+void arch_free_page(struct page *page, int order)
+{
+	unsigned long addr;
+	unsigned long cont_pte_low_bound;
+
+	if (!rodata_full)
+		return;
+
+	addr = (u64)page_address(page);
+	if ((order >= 4) && (addr & ~CONT_PTE_MASK) == 0) {
+		order -= 4;
+		do {
+			cont_pte_low_bound = addr & CONT_PTE_MASK;
+			__change_memory_common(cont_pte_low_bound,
+					(~CONT_PTE_MASK + 1), __pgprot(0), __pgprot(PTE_CONT));
+			addr = (u64)page_address(page);
+			page += 4;
+			order--;
+		}while (order >= 0);
+	}
+}
+