[v15,05/17] arms64: untag user pointers passed to memory syscalls
diff mbox series

Message ID 00eb4c63fefc054e2c8d626e8fedfca11d7c2600.1557160186.git.andreyknvl@google.com
State New
Headers show
Series
  • arm64: untag user pointers passed to the kernel
Related show

Commit Message

Andrey Konovalov May 6, 2019, 4:30 p.m. UTC
This patch is a part of a series that extends arm64 kernel ABI to allow to
pass tagged user pointers (with the top byte set to something else other
than 0x00) as syscall arguments.

This patch allows tagged pointers to be passed to the following memory
syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
remap_file_pages, shmat and shmdt.

This is done by untagging pointers passed to these syscalls in the
prologues of their handlers.

Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
---
 arch/arm64/kernel/sys.c | 128 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

Comments

Catalin Marinas May 22, 2019, 11:49 a.m. UTC | #1
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
> 
> This patch allows tagged pointers to be passed to the following memory
> syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> remap_file_pages, shmat and shmdt.
> 
> This is done by untagging pointers passed to these syscalls in the
> prologues of their handlers.

I'll go through them one by one to see if we can tighten the expected
ABI while having the MTE in mind.

> diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
> index b44065fb1616..933bb9f3d6ec 100644
> --- a/arch/arm64/kernel/sys.c
> +++ b/arch/arm64/kernel/sys.c
> @@ -35,10 +35,33 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
>  {
>  	if (offset_in_page(off) != 0)
>  		return -EINVAL;
> -
> +	addr = untagged_addr(addr);
>  	return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
>  }

If user passes a tagged pointer to mmap() and the address is honoured
(or MAP_FIXED is given), what is the expected return pointer? Does it
need to be tagged with the value from the hint?

With MTE, we may want to use this as a request for the default colour of
the mapped pages (still under discussion).

> +SYSCALL_DEFINE6(arm64_mmap_pgoff, unsigned long, addr, unsigned long, len,
> +		unsigned long, prot, unsigned long, flags,
> +		unsigned long, fd, unsigned long, pgoff)
> +{
> +	addr = untagged_addr(addr);
> +	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
> +}

We don't have __NR_mmap_pgoff on arm64.

> +SYSCALL_DEFINE5(arm64_mremap, unsigned long, addr, unsigned long, old_len,
> +		unsigned long, new_len, unsigned long, flags,
> +		unsigned long, new_addr)
> +{
> +	addr = untagged_addr(addr);
> +	new_addr = untagged_addr(new_addr);
> +	return ksys_mremap(addr, old_len, new_len, flags, new_addr);
> +}

Similar comment as for mmap(), do we want the tag from new_addr to be
preserved? In addition, should we check that the two tags are identical
or mremap() should become a way to repaint a memory region?

> +SYSCALL_DEFINE2(arm64_munmap, unsigned long, addr, size_t, len)
> +{
> +	addr = untagged_addr(addr);
> +	return ksys_munmap(addr, len);
> +}

This looks fine.

> +SYSCALL_DEFINE1(arm64_brk, unsigned long, brk)
> +{
> +	brk = untagged_addr(brk);
> +	return ksys_brk(brk);
> +}

I wonder whether brk() should simply not accept tags, and should not
return them (similar to the prctl(PR_SET_MM) discussion). We could
document this in the ABI requirements.

> +SYSCALL_DEFINE5(arm64_get_mempolicy, int __user *, policy,
> +		unsigned long __user *, nmask, unsigned long, maxnode,
> +		unsigned long, addr, unsigned long, flags)
> +{
> +	addr = untagged_addr(addr);
> +	return ksys_get_mempolicy(policy, nmask, maxnode, addr, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_madvise, unsigned long, start,
> +		size_t, len_in, int, behavior)
> +{
> +	start = untagged_addr(start);
> +	return ksys_madvise(start, len_in, behavior);
> +}
> +
> +SYSCALL_DEFINE6(arm64_mbind, unsigned long, start, unsigned long, len,
> +		unsigned long, mode, const unsigned long __user *, nmask,
> +		unsigned long, maxnode, unsigned int, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mbind(start, len, mode, nmask, maxnode, flags);
> +}
> +
> +SYSCALL_DEFINE2(arm64_mlock, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mlock(start, len, VM_LOCKED);
> +}
> +
> +SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mlock(start, len, VM_LOCKED);
> +}
> +
> +SYSCALL_DEFINE2(arm64_munlock, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_munlock(start, len);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mprotect, unsigned long, start, size_t, len,
> +		unsigned long, prot)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mprotect_pkey(start, len, prot, -1);
> +}
> +
> +SYSCALL_DEFINE3(arm64_msync, unsigned long, start, size_t, len, int, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_msync(start, len, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mincore, unsigned long, start, size_t, len,
> +		unsigned char __user *, vec)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mincore(start, len, vec);
> +}

These look fine.

> +SYSCALL_DEFINE5(arm64_remap_file_pages, unsigned long, start,
> +		unsigned long, size, unsigned long, prot,
> +		unsigned long, pgoff, unsigned long, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_remap_file_pages(start, size, prot, pgoff, flags);
> +}

While this has been deprecated for some time, I presume user space still
invokes it?

> +SYSCALL_DEFINE3(arm64_shmat, int, shmid, char __user *, shmaddr, int, shmflg)
> +{
> +	shmaddr = untagged_addr(shmaddr);
> +	return ksys_shmat(shmid, shmaddr, shmflg);
> +}
> +
> +SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
> +{
> +	shmaddr = untagged_addr(shmaddr);
> +	return ksys_shmdt(shmaddr);
> +}

Do we actually want to allow shared tagged memory? Who's going to tag
it? If not, we can document it as not supported.
Evgenii Stepanov May 22, 2019, 9:16 p.m. UTC | #2
On Wed, May 22, 2019 at 4:49 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > This patch is a part of a series that extends arm64 kernel ABI to allow to
> > pass tagged user pointers (with the top byte set to something else other
> > than 0x00) as syscall arguments.
> >
> > This patch allows tagged pointers to be passed to the following memory
> > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > remap_file_pages, shmat and shmdt.
> >
> > This is done by untagging pointers passed to these syscalls in the
> > prologues of their handlers.
>
> I'll go through them one by one to see if we can tighten the expected
> ABI while having the MTE in mind.
>
> > diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
> > index b44065fb1616..933bb9f3d6ec 100644
> > --- a/arch/arm64/kernel/sys.c
> > +++ b/arch/arm64/kernel/sys.c
> > @@ -35,10 +35,33 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
> >  {
> >       if (offset_in_page(off) != 0)
> >               return -EINVAL;
> > -
> > +     addr = untagged_addr(addr);
> >       return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
> >  }
>
> If user passes a tagged pointer to mmap() and the address is honoured
> (or MAP_FIXED is given), what is the expected return pointer? Does it
> need to be tagged with the value from the hint?

For HWASan the most convenient would be to use the tag from the hint.
But since in the TBI (not MTE) mode the kernel has no idea what
meaning userspace assigns to pointer tags, perhaps it should not try
to guess, and should return raw (zero-tagged) address instead.

> With MTE, we may want to use this as a request for the default colour of
> the mapped pages (still under discussion).

I like this - and in that case it would make sense to return the
pointer that can be immediately dereferenced without crashing the
process, i.e. with the matching tag.

> > +SYSCALL_DEFINE6(arm64_mmap_pgoff, unsigned long, addr, unsigned long, len,
> > +             unsigned long, prot, unsigned long, flags,
> > +             unsigned long, fd, unsigned long, pgoff)
> > +{
> > +     addr = untagged_addr(addr);
> > +     return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
> > +}
>
> We don't have __NR_mmap_pgoff on arm64.
>
> > +SYSCALL_DEFINE5(arm64_mremap, unsigned long, addr, unsigned long, old_len,
> > +             unsigned long, new_len, unsigned long, flags,
> > +             unsigned long, new_addr)
> > +{
> > +     addr = untagged_addr(addr);
> > +     new_addr = untagged_addr(new_addr);
> > +     return ksys_mremap(addr, old_len, new_len, flags, new_addr);
> > +}
>
> Similar comment as for mmap(), do we want the tag from new_addr to be
> preserved? In addition, should we check that the two tags are identical
> or mremap() should become a way to repaint a memory region?
>
> > +SYSCALL_DEFINE2(arm64_munmap, unsigned long, addr, size_t, len)
> > +{
> > +     addr = untagged_addr(addr);
> > +     return ksys_munmap(addr, len);
> > +}
>
> This looks fine.
>
> > +SYSCALL_DEFINE1(arm64_brk, unsigned long, brk)
> > +{
> > +     brk = untagged_addr(brk);
> > +     return ksys_brk(brk);
> > +}
>
> I wonder whether brk() should simply not accept tags, and should not
> return them (similar to the prctl(PR_SET_MM) discussion). We could
> document this in the ABI requirements.
>
> > +SYSCALL_DEFINE5(arm64_get_mempolicy, int __user *, policy,
> > +             unsigned long __user *, nmask, unsigned long, maxnode,
> > +             unsigned long, addr, unsigned long, flags)
> > +{
> > +     addr = untagged_addr(addr);
> > +     return ksys_get_mempolicy(policy, nmask, maxnode, addr, flags);
> > +}
> > +
> > +SYSCALL_DEFINE3(arm64_madvise, unsigned long, start,
> > +             size_t, len_in, int, behavior)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_madvise(start, len_in, behavior);
> > +}
> > +
> > +SYSCALL_DEFINE6(arm64_mbind, unsigned long, start, unsigned long, len,
> > +             unsigned long, mode, const unsigned long __user *, nmask,
> > +             unsigned long, maxnode, unsigned int, flags)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_mbind(start, len, mode, nmask, maxnode, flags);
> > +}
> > +
> > +SYSCALL_DEFINE2(arm64_mlock, unsigned long, start, size_t, len)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_mlock(start, len, VM_LOCKED);
> > +}
> > +
> > +SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_mlock(start, len, VM_LOCKED);
> > +}
> > +
> > +SYSCALL_DEFINE2(arm64_munlock, unsigned long, start, size_t, len)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_munlock(start, len);
> > +}
> > +
> > +SYSCALL_DEFINE3(arm64_mprotect, unsigned long, start, size_t, len,
> > +             unsigned long, prot)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_mprotect_pkey(start, len, prot, -1);
> > +}
> > +
> > +SYSCALL_DEFINE3(arm64_msync, unsigned long, start, size_t, len, int, flags)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_msync(start, len, flags);
> > +}
> > +
> > +SYSCALL_DEFINE3(arm64_mincore, unsigned long, start, size_t, len,
> > +             unsigned char __user *, vec)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_mincore(start, len, vec);
> > +}
>
> These look fine.
>
> > +SYSCALL_DEFINE5(arm64_remap_file_pages, unsigned long, start,
> > +             unsigned long, size, unsigned long, prot,
> > +             unsigned long, pgoff, unsigned long, flags)
> > +{
> > +     start = untagged_addr(start);
> > +     return ksys_remap_file_pages(start, size, prot, pgoff, flags);
> > +}
>
> While this has been deprecated for some time, I presume user space still
> invokes it?
>
> > +SYSCALL_DEFINE3(arm64_shmat, int, shmid, char __user *, shmaddr, int, shmflg)
> > +{
> > +     shmaddr = untagged_addr(shmaddr);
> > +     return ksys_shmat(shmid, shmaddr, shmflg);
> > +}
> > +
> > +SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
> > +{
> > +     shmaddr = untagged_addr(shmaddr);
> > +     return ksys_shmdt(shmaddr);
> > +}
>
> Do we actually want to allow shared tagged memory? Who's going to tag
> it? If not, we can document it as not supported.
>
> --
> Catalin
Catalin Marinas May 23, 2019, 9:04 a.m. UTC | #3
On Wed, May 22, 2019 at 02:16:57PM -0700, Evgenii Stepanov wrote:
> On Wed, May 22, 2019 at 4:49 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > > This patch is a part of a series that extends arm64 kernel ABI to allow to
> > > pass tagged user pointers (with the top byte set to something else other
> > > than 0x00) as syscall arguments.
> > >
> > > This patch allows tagged pointers to be passed to the following memory
> > > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> > > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > > remap_file_pages, shmat and shmdt.
> > >
> > > This is done by untagging pointers passed to these syscalls in the
> > > prologues of their handlers.
> >
> > I'll go through them one by one to see if we can tighten the expected
> > ABI while having the MTE in mind.
> >
> > > diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
> > > index b44065fb1616..933bb9f3d6ec 100644
> > > --- a/arch/arm64/kernel/sys.c
> > > +++ b/arch/arm64/kernel/sys.c
> > > @@ -35,10 +35,33 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
> > >  {
> > >       if (offset_in_page(off) != 0)
> > >               return -EINVAL;
> > > -
> > > +     addr = untagged_addr(addr);
> > >       return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
> > >  }
> >
> > If user passes a tagged pointer to mmap() and the address is honoured
> > (or MAP_FIXED is given), what is the expected return pointer? Does it
> > need to be tagged with the value from the hint?
> 
> For HWASan the most convenient would be to use the tag from the hint.
> But since in the TBI (not MTE) mode the kernel has no idea what
> meaning userspace assigns to pointer tags, perhaps it should not try
> to guess, and should return raw (zero-tagged) address instead.

Then, just to relax the ABI for hwasan, shall we simply disallow tagged
pointers on mmap() arguments? We can leave them in for
mremap(old_address), madvise().

> > With MTE, we may want to use this as a request for the default colour of
> > the mapped pages (still under discussion).
> 
> I like this - and in that case it would make sense to return the
> pointer that can be immediately dereferenced without crashing the
> process, i.e. with the matching tag.

This came up from the Android investigation work where large memory
allocations (using mmap) could be more efficiently pre-tagged by the
kernel on page fault. Not sure about the implementation details yet.
Evgenii Stepanov May 24, 2019, 4:23 a.m. UTC | #4
On Thu, May 23, 2019 at 2:04 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, May 22, 2019 at 02:16:57PM -0700, Evgenii Stepanov wrote:
> > On Wed, May 22, 2019 at 4:49 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > > > This patch is a part of a series that extends arm64 kernel ABI to allow to
> > > > pass tagged user pointers (with the top byte set to something else other
> > > > than 0x00) as syscall arguments.
> > > >
> > > > This patch allows tagged pointers to be passed to the following memory
> > > > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> > > > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > > > remap_file_pages, shmat and shmdt.
> > > >
> > > > This is done by untagging pointers passed to these syscalls in the
> > > > prologues of their handlers.
> > >
> > > I'll go through them one by one to see if we can tighten the expected
> > > ABI while having the MTE in mind.
> > >
> > > > diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
> > > > index b44065fb1616..933bb9f3d6ec 100644
> > > > --- a/arch/arm64/kernel/sys.c
> > > > +++ b/arch/arm64/kernel/sys.c
> > > > @@ -35,10 +35,33 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
> > > >  {
> > > >       if (offset_in_page(off) != 0)
> > > >               return -EINVAL;
> > > > -
> > > > +     addr = untagged_addr(addr);
> > > >       return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
> > > >  }
> > >
> > > If user passes a tagged pointer to mmap() and the address is honoured
> > > (or MAP_FIXED is given), what is the expected return pointer? Does it
> > > need to be tagged with the value from the hint?
> >
> > For HWASan the most convenient would be to use the tag from the hint.
> > But since in the TBI (not MTE) mode the kernel has no idea what
> > meaning userspace assigns to pointer tags, perhaps it should not try
> > to guess, and should return raw (zero-tagged) address instead.
>
> Then, just to relax the ABI for hwasan, shall we simply disallow tagged
> pointers on mmap() arguments? We can leave them in for
> mremap(old_address), madvise().

I think this would be fine. We should allow tagged in pointers in
mprotect though.

> > > With MTE, we may want to use this as a request for the default colour of
> > > the mapped pages (still under discussion).
> >
> > I like this - and in that case it would make sense to return the
> > pointer that can be immediately dereferenced without crashing the
> > process, i.e. with the matching tag.
>
> This came up from the Android investigation work where large memory
> allocations (using mmap) could be more efficiently pre-tagged by the
> kernel on page fault. Not sure about the implementation details yet.
>
> --
> Catalin
Andrew Murray May 24, 2019, 3:41 p.m. UTC | #5
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
> 
> This patch allows tagged pointers to be passed to the following memory
> syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> remap_file_pages, shmat and shmdt.
> 
> This is done by untagging pointers passed to these syscalls in the
> prologues of their handlers.
> 
> Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
> ---


> +SYSCALL_DEFINE2(arm64_mlock, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mlock(start, len, VM_LOCKED);
> +}
> +
> +SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mlock(start, len, VM_LOCKED);
> +}

I think this may be a copy/paste error...

Shouldn't mlock2 have a third 'flags' argument to distinguish is from mlock?

Thanks,

Andrew Murray

> +
> +SYSCALL_DEFINE2(arm64_munlock, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_munlock(start, len);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mprotect, unsigned long, start, size_t, len,
> +		unsigned long, prot)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mprotect_pkey(start, len, prot, -1);
> +}
> +
> +SYSCALL_DEFINE3(arm64_msync, unsigned long, start, size_t, len, int, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_msync(start, len, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mincore, unsigned long, start, size_t, len,
> +		unsigned char __user *, vec)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mincore(start, len, vec);
> +}
> +
> +SYSCALL_DEFINE5(arm64_remap_file_pages, unsigned long, start,
> +		unsigned long, size, unsigned long, prot,
> +		unsigned long, pgoff, unsigned long, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_remap_file_pages(start, size, prot, pgoff, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_shmat, int, shmid, char __user *, shmaddr, int, shmflg)
> +{
> +	shmaddr = untagged_addr(shmaddr);
> +	return ksys_shmat(shmid, shmaddr, shmflg);
> +}
> +
> +SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
> +{
> +	shmaddr = untagged_addr(shmaddr);
> +	return ksys_shmdt(shmaddr);
> +}
> +
>  /*
>   * Wrappers to pass the pt_regs argument.
>   */
>  #define sys_personality		sys_arm64_personality
> +#define sys_mmap_pgoff		sys_arm64_mmap_pgoff
> +#define sys_mremap		sys_arm64_mremap
> +#define sys_munmap		sys_arm64_munmap
> +#define sys_brk			sys_arm64_brk
> +#define sys_get_mempolicy	sys_arm64_get_mempolicy
> +#define sys_madvise		sys_arm64_madvise
> +#define sys_mbind		sys_arm64_mbind
> +#define sys_mlock		sys_arm64_mlock
> +#define sys_mlock2		sys_arm64_mlock2
> +#define sys_munlock		sys_arm64_munlock
> +#define sys_mprotect		sys_arm64_mprotect
> +#define sys_msync		sys_arm64_msync
> +#define sys_mincore		sys_arm64_mincore
> +#define sys_remap_file_pages	sys_arm64_remap_file_pages
> +#define sys_shmat		sys_arm64_shmat
> +#define sys_shmdt		sys_arm64_shmdt
>  
>  asmlinkage long sys_ni_syscall(const struct pt_regs *);
>  #define __arm64_sys_ni_syscall	sys_ni_syscall
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Catalin Marinas May 25, 2019, 9:57 a.m. UTC | #6
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> +SYSCALL_DEFINE5(arm64_get_mempolicy, int __user *, policy,
> +		unsigned long __user *, nmask, unsigned long, maxnode,
> +		unsigned long, addr, unsigned long, flags)
> +{
> +	addr = untagged_addr(addr);
> +	return ksys_get_mempolicy(policy, nmask, maxnode, addr, flags);
> +}
[...]
> +SYSCALL_DEFINE6(arm64_mbind, unsigned long, start, unsigned long, len,
> +		unsigned long, mode, const unsigned long __user *, nmask,
> +		unsigned long, maxnode, unsigned int, flags)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mbind(start, len, mode, nmask, maxnode, flags);
> +}

The kernel fails to build with CONFIG_NUMA disabled because the above
are in mm/mempolicy.c which is no longer compiled in.
Catalin Marinas May 27, 2019, 9:42 a.m. UTC | #7
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> +SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
> +{
> +	start = untagged_addr(start);
> +	return ksys_mlock(start, len, VM_LOCKED);
> +}

Copy/paste error: sys_mlock2() has 3 arguments and should call
ksys_mlock2().

Still tracking down an LTP failure on test mlock01.
Catalin Marinas May 27, 2019, 2:37 p.m. UTC | #8
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
> 
> This patch allows tagged pointers to be passed to the following memory
> syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> remap_file_pages, shmat and shmdt.
> 
> This is done by untagging pointers passed to these syscalls in the
> prologues of their handlers.
> 
> Signed-off-by: Andrey Konovalov <andreyknvl@google.com>

Actually, I don't think any of these wrappers get called (have you
tested this patch?). Following commit 4378a7d4be30 ("arm64: implement
syscall wrappers"), I think we have other macro names for overriding the
sys_* ones.
Catalin Marinas May 28, 2019, 1:05 p.m. UTC | #9
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
>  /*
>   * Wrappers to pass the pt_regs argument.
>   */
>  #define sys_personality		sys_arm64_personality
> +#define sys_mmap_pgoff		sys_arm64_mmap_pgoff
> +#define sys_mremap		sys_arm64_mremap
> +#define sys_munmap		sys_arm64_munmap
> +#define sys_brk			sys_arm64_brk
> +#define sys_get_mempolicy	sys_arm64_get_mempolicy
> +#define sys_madvise		sys_arm64_madvise
> +#define sys_mbind		sys_arm64_mbind
> +#define sys_mlock		sys_arm64_mlock
> +#define sys_mlock2		sys_arm64_mlock2
> +#define sys_munlock		sys_arm64_munlock
> +#define sys_mprotect		sys_arm64_mprotect
> +#define sys_msync		sys_arm64_msync
> +#define sys_mincore		sys_arm64_mincore
> +#define sys_remap_file_pages	sys_arm64_remap_file_pages
> +#define sys_shmat		sys_arm64_shmat
> +#define sys_shmdt		sys_arm64_shmdt

This hunk should be (I sent a separate patch for sys_personality):

@@ -160,23 +163,23 @@ SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
 /*
  * Wrappers to pass the pt_regs argument.
  */
-#define sys_personality		sys_arm64_personality
-#define sys_mmap_pgoff		sys_arm64_mmap_pgoff
-#define sys_mremap		sys_arm64_mremap
-#define sys_munmap		sys_arm64_munmap
-#define sys_brk			sys_arm64_brk
-#define sys_get_mempolicy	sys_arm64_get_mempolicy
-#define sys_madvise		sys_arm64_madvise
-#define sys_mbind		sys_arm64_mbind
-#define sys_mlock		sys_arm64_mlock
-#define sys_mlock2		sys_arm64_mlock2
-#define sys_munlock		sys_arm64_munlock
-#define sys_mprotect		sys_arm64_mprotect
-#define sys_msync		sys_arm64_msync
-#define sys_mincore		sys_arm64_mincore
-#define sys_remap_file_pages	sys_arm64_remap_file_pages
-#define sys_shmat		sys_arm64_shmat
-#define sys_shmdt		sys_arm64_shmdt
+#define __arm64_sys_personality		__arm64_sys_arm64_personality
+#define __arm64_sys_mmap_pgoff		__arm64_sys_arm64_mmap_pgoff
+#define __arm64_sys_mremap		__arm64_sys_arm64_mremap
+#define __arm64_sys_munmap		__arm64_sys_arm64_munmap
+#define __arm64_sys_brk			__arm64_sys_arm64_brk
+#define __arm64_sys_get_mempolicy	__arm64_sys_arm64_get_mempolicy
+#define __arm64_sys_madvise		__arm64_sys_arm64_madvise
+#define __arm64_sys_mbind		__arm64_sys_arm64_mbind
+#define __arm64_sys_mlock		__arm64_sys_arm64_mlock
+#define __arm64_sys_mlock2		__arm64_sys_arm64_mlock2
+#define __arm64_sys_munlock		__arm64_sys_arm64_munlock
+#define __arm64_sys_mprotect		__arm64_sys_arm64_mprotect
+#define __arm64_sys_msync		__arm64_sys_arm64_msync
+#define __arm64_sys_mincore		__arm64_sys_arm64_mincore
+#define __arm64_sys_remap_file_pages	__arm64_sys_arm64_remap_file_pages
+#define __arm64_sys_shmat		__arm64_sys_arm64_shmat
+#define __arm64_sys_shmdt		__arm64_sys_arm64_shmdt
 
 asmlinkage long sys_ni_syscall(const struct pt_regs *);
 #define __arm64_sys_ni_syscall	sys_ni_syscall
Andrew Murray May 28, 2019, 2:54 p.m. UTC | #10
On Mon, May 27, 2019 at 03:37:20PM +0100, Catalin Marinas wrote:
> On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > This patch is a part of a series that extends arm64 kernel ABI to allow to
> > pass tagged user pointers (with the top byte set to something else other
> > than 0x00) as syscall arguments.
> > 
> > This patch allows tagged pointers to be passed to the following memory
> > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > remap_file_pages, shmat and shmdt.
> > 
> > This is done by untagging pointers passed to these syscalls in the
> > prologues of their handlers.
> > 
> > Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
> 
> Actually, I don't think any of these wrappers get called (have you
> tested this patch?). Following commit 4378a7d4be30 ("arm64: implement
> syscall wrappers"), I think we have other macro names for overriding the
> sys_* ones.

What is the value in adding these wrappers?

The untagged_addr macro is defined for all in linux/mm.h and these patches
already use untagged_addr in generic code. Thus adding a few more
untagged_addr in the generic syscall handlers (which turn to a nop for most)
is surely better than adding wrappers?

Even if other architectures implement untagged_addr in the future it would
be more consistent if they untagged in the same places and thus not adding
these wrappers enforces that.

Thanks,

Andrew Murray

> 
> -- 
> Catalin
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Catalin Marinas May 28, 2019, 3:40 p.m. UTC | #11
On Tue, May 28, 2019 at 03:54:11PM +0100, Andrew Murray wrote:
> On Mon, May 27, 2019 at 03:37:20PM +0100, Catalin Marinas wrote:
> > On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > > This patch is a part of a series that extends arm64 kernel ABI to allow to
> > > pass tagged user pointers (with the top byte set to something else other
> > > than 0x00) as syscall arguments.
> > > 
> > > This patch allows tagged pointers to be passed to the following memory
> > > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> > > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > > remap_file_pages, shmat and shmdt.
> > > 
> > > This is done by untagging pointers passed to these syscalls in the
> > > prologues of their handlers.
> > > 
> > > Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
> > 
> > Actually, I don't think any of these wrappers get called (have you
> > tested this patch?). Following commit 4378a7d4be30 ("arm64: implement
> > syscall wrappers"), I think we have other macro names for overriding the
> > sys_* ones.
> 
> What is the value in adding these wrappers?

Not much value, initially proposed just to keep the core changes small.
I'm fine with moving them back in the generic code (but see below).

I think another aspect is how we define the ABI. Is allowing tags to
mlock() for example something specific to arm64 or would sparc ADI need
the same? In the absence of other architectures defining such ABI, my
preference would be to keep the wrappers in the arch code.

Assuming sparc won't implement untagged_addr(), we can place the macros
back in the generic code but, as per the review here, we need to be more
restrictive on where we allow tagged addresses. For example, if mmap()
gets a tagged address with MAP_FIXED, is it expected to return the tag?

My thoughts on allowing tags (quick look):

brk - no
get_mempolicy - yes
madvise - yes
mbind - yes
mincore - yes
mlock, mlock2, munlock - yes
mmap - no (we may change this with MTE but not for TBI)
mmap_pgoff - not used on arm64
mprotect - yes
mremap - yes for old_address, no for new_address (on par with mmap)
msync - yes
munmap - probably no (mmap does not return tagged ptrs)
remap_file_pages - no (also deprecated syscall)
shmat, shmdt - shall we allow tagged addresses on shared memory?

The above is only about the TBI ABI while ignoring hardware MTE. For the
latter, we may want to change the mmap() to allow pre-colouring on page
fault which means that munmap()/mprotect() should also support tagged
pointers. Possibly mremap() as well but we need to decide whether it
should allow re-colouring the page (probably no, in which case
old_address and new_address should have the same tag). For some of these
we'll end up with arm64 specific wrappers again, unless sparc ADI adopts
exactly the same ABI restrictions.
Dave Martin May 28, 2019, 3:56 p.m. UTC | #12
On Tue, May 28, 2019 at 04:40:58PM +0100, Catalin Marinas wrote:

[...]

> My thoughts on allowing tags (quick look):
>
> brk - no

[...]

> mlock, mlock2, munlock - yes
> mmap - no (we may change this with MTE but not for TBI)

[...]

> mprotect - yes

I haven't following this discussion closely... what's the rationale for
the inconsistencies here (feel free to refer me back to the discussion
if it's elsewhere).

Cheers
---Dave
Catalin Marinas May 28, 2019, 4:34 p.m. UTC | #13
On Tue, May 28, 2019 at 04:56:45PM +0100, Dave P Martin wrote:
> On Tue, May 28, 2019 at 04:40:58PM +0100, Catalin Marinas wrote:
> 
> [...]
> 
> > My thoughts on allowing tags (quick look):
> >
> > brk - no
> 
> [...]
> 
> > mlock, mlock2, munlock - yes
> > mmap - no (we may change this with MTE but not for TBI)
> 
> [...]
> 
> > mprotect - yes
> 
> I haven't following this discussion closely... what's the rationale for
> the inconsistencies here (feel free to refer me back to the discussion
> if it's elsewhere).

_My_ rationale (feel free to disagree) is that mmap() by default would
not return a tagged address (ignoring MTE for now). If it gets passed a
tagged address or a "tagged NULL" (for lack of a better name) we don't
have clear semantics of whether the returned address should be tagged in
this ABI relaxation. I'd rather reserve this specific behaviour if we
overload the non-zero tag meaning of mmap() for MTE. Similar reasoning
for mremap(), at least on the new_address argument (not entirely sure
about old_address).

munmap() should probably follow the mmap() rules.

As for brk(), I don't see why the user would need to pass a tagged
address, we can't associate any meaning to this tag.

For the rest, since it's likely such addresses would have been tagged by
malloc() in user space, we should allow tagged pointers.
Khalid Aziz May 28, 2019, 11:33 p.m. UTC | #14
On Tue, 2019-05-28 at 16:40 +0100, Catalin Marinas wrote:
> On Tue, May 28, 2019 at 03:54:11PM +0100, Andrew Murray wrote:
> > On Mon, May 27, 2019 at 03:37:20PM +0100, Catalin Marinas wrote:
> > > On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> > > > This patch is a part of a series that extends arm64 kernel ABI
> > > > to allow to
> > > > pass tagged user pointers (with the top byte set to something
> > > > else other
> > > > than 0x00) as syscall arguments.
> > > > 
> > > > This patch allows tagged pointers to be passed to the following
> > > > memory
> > > > syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock,
> > > > mlock2,
> > > > mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> > > > remap_file_pages, shmat and shmdt.
> > > > 
> > > > This is done by untagging pointers passed to these syscalls in
> > > > the
> > > > prologues of their handlers.
> > > > 
> > > > Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
> > > 
> > > Actually, I don't think any of these wrappers get called (have
> > > you
> > > tested this patch?). Following commit 4378a7d4be30 ("arm64:
> > > implement
> > > syscall wrappers"), I think we have other macro names for
> > > overriding the
> > > sys_* ones.
> > 
> > What is the value in adding these wrappers?
> 
> Not much value, initially proposed just to keep the core changes
> small.
> I'm fine with moving them back in the generic code (but see below).
> 
> I think another aspect is how we define the ABI. Is allowing tags to
> mlock() for example something specific to arm64 or would sparc ADI
> need
> the same? In the absence of other architectures defining such ABI, my
> preference would be to keep the wrappers in the arch code.
> 
> Assuming sparc won't implement untagged_addr(), we can place the
> macros
> back in the generic code but, as per the review here, we need to be
> more
> restrictive on where we allow tagged addresses. For example, if
> mmap()
> gets a tagged address with MAP_FIXED, is it expected to return the
> tag?

I would recommend against any ABI differences between ARM64 MTE/TBI and
sparc ADI unless it simply can not be helped. My understanding of
MTE/TBI is limited, so I will explain how sparc ADI works. On sparc, a
tagged address has no meaning until following steps happen:

1. set the user mode PSTATE.mcde bit. This acts as the master switch to
enable ADI for a process.

2. set TTE.mcd bit on TLB entries that match the address range ADI is
being enabled on.

3. Store version tag for the range of addresses userspace wants ADI
enabled on using "stxa" instruction. These tags are stored in physical
memory by the memory controller.

Steps 1 and 2 are accomplished by userspace by calling mprotect() with
PROT_ADI. Tags are set by storing tags in a loop, for example:

        version = 10;
        tmp_addr = shmaddr;
        end = shmaddr + BUFFER_SIZE;
        while (tmp_addr < end) {
                asm volatile(
                        "stxa %1, [%0]0x90\n\t"
                        :
                        : "r" (tmp_addr), "r" (version));
                tmp_addr += adi_blksz;
        }

With these semantics, giving mmap() or shamat() a tagged address is
meaningless since no tags have been stored at the addresses mmap() will
allocate and one can not store tags before memory range has been
allocated. If we choose to allow tagged addresses to come into mmap()
and shmat(), sparc code can strip the tags unconditionally and that may
help simplify ABI and/or code.

> 
> My thoughts on allowing tags (quick look):
> 
> brk - no
> get_mempolicy - yes
> madvise - yes
> mbind - yes
> mincore - yes
> mlock, mlock2, munlock - yes
> mmap - no (we may change this with MTE but not for TBI)
> mmap_pgoff - not used on arm64
> mprotect - yes
> mremap - yes for old_address, no for new_address (on par with mmap)
> msync - yes
> munmap - probably no (mmap does not return tagged ptrs)
> remap_file_pages - no (also deprecated syscall)
> shmat, shmdt - shall we allow tagged addresses on shared memory?
> 
> The above is only about the TBI ABI while ignoring hardware MTE. For
> the
> latter, we may want to change the mmap() to allow pre-colouring on
> page
> fault which means that munmap()/mprotect() should also support tagged
> pointers. Possibly mremap() as well but we need to decide whether it
> should allow re-colouring the page (probably no, in which case
> old_address and new_address should have the same tag). For some of
> these
> we'll end up with arm64 specific wrappers again, unless sparc ADI
> adopts
> exactly the same ABI restrictions.
> 

Let us keep any restrictions common across ARM64 and sparc. pre-
coloring on sparc in the kernel would mean kernel will have to execute
stxa instructions in a loop for each page being faulted in. Not that
big a deal but doesn't that assume the entire page has the same tag
which is dedcued from the upper bits of address? Shouldn't we support
tags at the same granularity level as what the hardware supports? We
went through this discussion for sparc and decision was to support tags
at the same granularity as hardware. That means we can not deduce tags
from the first address that pioints into an mmap or shmat region. Those
tags and the upper bytes of colored address could change for every
cacheline sized block (64-bytes on sparc M7). We can try to store tags
for an entire region in vma but that is expensive, plus on sparc tags
are set in userspace with no participation from kernel and now we need
a way for userspace to communicate the tags to kernel. From sparc point
of view, making kernel responsible for assigning tags to a page on page
fault is full of pitfalls.

Thanks,
Khalid
Dave Martin May 29, 2019, 12:42 p.m. UTC | #15
On Tue, May 28, 2019 at 05:34:00PM +0100, Catalin Marinas wrote:
> On Tue, May 28, 2019 at 04:56:45PM +0100, Dave P Martin wrote:
> > On Tue, May 28, 2019 at 04:40:58PM +0100, Catalin Marinas wrote:
> > 
> > [...]
> > 
> > > My thoughts on allowing tags (quick look):
> > >
> > > brk - no
> > 
> > [...]
> > 
> > > mlock, mlock2, munlock - yes
> > > mmap - no (we may change this with MTE but not for TBI)
> > 
> > [...]
> > 
> > > mprotect - yes
> > 
> > I haven't following this discussion closely... what's the rationale for
> > the inconsistencies here (feel free to refer me back to the discussion
> > if it's elsewhere).
> 
> _My_ rationale (feel free to disagree) is that mmap() by default would
> not return a tagged address (ignoring MTE for now). If it gets passed a
> tagged address or a "tagged NULL" (for lack of a better name) we don't
> have clear semantics of whether the returned address should be tagged in
> this ABI relaxation. I'd rather reserve this specific behaviour if we
> overload the non-zero tag meaning of mmap() for MTE. Similar reasoning
> for mremap(), at least on the new_address argument (not entirely sure
> about old_address).
> 
> munmap() should probably follow the mmap() rules.
> 
> As for brk(), I don't see why the user would need to pass a tagged
> address, we can't associate any meaning to this tag.
> 
> For the rest, since it's likely such addresses would have been tagged by
> malloc() in user space, we should allow tagged pointers.

Those arguments seem reasonable.  We should try to capture this
somewhere when documenting the ABI.

To be clear, I'm not sure that we should guarantee anywhere that a
tagged pointer is rejected: rather the behaviour should probably be
left unspecified.  Then we can tidy it up incrementally.

(The behaviour is unspecified today, in any case.)

Cheers
---Dave
Catalin Marinas May 29, 2019, 1:23 p.m. UTC | #16
On Wed, May 29, 2019 at 01:42:25PM +0100, Dave P Martin wrote:
> On Tue, May 28, 2019 at 05:34:00PM +0100, Catalin Marinas wrote:
> > On Tue, May 28, 2019 at 04:56:45PM +0100, Dave P Martin wrote:
> > > On Tue, May 28, 2019 at 04:40:58PM +0100, Catalin Marinas wrote:
> > > 
> > > [...]
> > > 
> > > > My thoughts on allowing tags (quick look):
> > > >
> > > > brk - no
> > > 
> > > [...]
> > > 
> > > > mlock, mlock2, munlock - yes
> > > > mmap - no (we may change this with MTE but not for TBI)
> > > 
> > > [...]
> > > 
> > > > mprotect - yes
> > > 
> > > I haven't following this discussion closely... what's the rationale for
> > > the inconsistencies here (feel free to refer me back to the discussion
> > > if it's elsewhere).
> > 
> > _My_ rationale (feel free to disagree) is that mmap() by default would
> > not return a tagged address (ignoring MTE for now). If it gets passed a
> > tagged address or a "tagged NULL" (for lack of a better name) we don't
> > have clear semantics of whether the returned address should be tagged in
> > this ABI relaxation. I'd rather reserve this specific behaviour if we
> > overload the non-zero tag meaning of mmap() for MTE. Similar reasoning
> > for mremap(), at least on the new_address argument (not entirely sure
> > about old_address).
> > 
> > munmap() should probably follow the mmap() rules.
> > 
> > As for brk(), I don't see why the user would need to pass a tagged
> > address, we can't associate any meaning to this tag.
> > 
> > For the rest, since it's likely such addresses would have been tagged by
> > malloc() in user space, we should allow tagged pointers.
> 
> Those arguments seem reasonable.  We should try to capture this
> somewhere when documenting the ABI.
> 
> To be clear, I'm not sure that we should guarantee anywhere that a
> tagged pointer is rejected: rather the behaviour should probably be
> left unspecified.  Then we can tidy it up incrementally.
> 
> (The behaviour is unspecified today, in any case.)

What is specified (or rather de-facto ABI) today is that passing a user
address above TASK_SIZE (e.g. non-zero top byte) would fail in most
cases. If we relax this with the TBI we may end up with some de-facto
ABI before we actually get MTE hardware. Tightening it afterwards may be
slightly more problematic, although MTE needs to be an explicit opt-in.

IOW, I wouldn't want to unnecessarily relax the ABI if we don't need to.
Catalin Marinas May 29, 2019, 2:20 p.m. UTC | #17
Hi Khalid,

On Tue, May 28, 2019 at 05:33:04PM -0600, Khalid Aziz wrote:
> On Tue, 2019-05-28 at 16:40 +0100, Catalin Marinas wrote:
> > I think another aspect is how we define the ABI. Is allowing tags to
> > mlock() for example something specific to arm64 or would sparc ADI
> > need the same? In the absence of other architectures defining such
> > ABI, my preference would be to keep the wrappers in the arch code.
> > 
> > Assuming sparc won't implement untagged_addr(), we can place the
> > macros back in the generic code but, as per the review here, we need
> > to be more restrictive on where we allow tagged addresses. For
> > example, if mmap() gets a tagged address with MAP_FIXED, is it
> > expected to return the tag?
> 
> I would recommend against any ABI differences between ARM64 MTE/TBI and
> sparc ADI unless it simply can not be helped. My understanding of
> MTE/TBI is limited, so I will explain how sparc ADI works. On sparc, a
> tagged address has no meaning until following steps happen:

Before we go into the MTE/ADI similarities or differences, just to
clarify that TBI is something that we supported from the start of the
arm64 kernel port. TBI (top byte ignore) allows a user pointer to have
non-zero top byte and dereference it without causing a fault (the
hardware masks it out). The user/kernel ABI does not allow such tagged
pointers into the kernel, nor would the kernel return any such tagged
addresses.

With MTE (memory tagging extensions), the top-byte meaning is changed
from no longer being ignored to actually being checked against a tag in
the physical RAM (we call it allocation tag).

> 1. set the user mode PSTATE.mcde bit. This acts as the master switch to
> enable ADI for a process.
> 
> 2. set TTE.mcd bit on TLB entries that match the address range ADI is
> being enabled on.

Something close enough for MTE, with the difference that enabling it is
not a PSTATE bit but rather a system control bit (SCTLR_EL1 register),
so only the kernel can turn it on/off for the user.

> 3. Store version tag for the range of addresses userspace wants ADI
> enabled on using "stxa" instruction. These tags are stored in physical
> memory by the memory controller.

Do you have an "ldxa" instruction to load the tags from physical memory?

> Steps 1 and 2 are accomplished by userspace by calling mprotect() with
> PROT_ADI. Tags are set by storing tags in a loop, for example:
> 
>         version = 10;
>         tmp_addr = shmaddr;
>         end = shmaddr + BUFFER_SIZE;
>         while (tmp_addr < end) {
>                 asm volatile(
>                         "stxa %1, [%0]0x90\n\t"
>                         :
>                         : "r" (tmp_addr), "r" (version));
>                 tmp_addr += adi_blksz;
>         }

On arm64, a sequence similar to the above would live in the libc. So a
malloc() call will tag the memory and return the tagged address to the
user.

We were not planning for a PROT_ADI/MTE but rather have MTE enabled for
all user memory ranges. We may revisit this before we upstream the MTE
support (probably some marginal benefit for the hardware not fetching
the tags from memory if we don't need to, e.g. code sections).

Given that we already have the TBI feature and with MTE enabled the top
byte is no longer ignored, we are planning for an explicit opt-in by the
user via prctl() to enable MTE.

> With these semantics, giving mmap() or shamat() a tagged address is
> meaningless since no tags have been stored at the addresses mmap() will
> allocate and one can not store tags before memory range has been
> allocated. If we choose to allow tagged addresses to come into mmap()
> and shmat(), sparc code can strip the tags unconditionally and that may
> help simplify ABI and/or code.

We could say that with TBI (pre-MTE support), the top byte is actually
ignored on mmap(). Now, if you pass a MAP_FIXED with a tagged address,
should the user expect the same tagged address back or stripping the tag
is acceptable? If we want to keep the current mmap() semantics, I'd say
the same tag is returned. However, with MTE this also implies that the
memory was coloured.

> > My thoughts on allowing tags (quick look):
> > 
> > brk - no
> > get_mempolicy - yes
> > madvise - yes
> > mbind - yes
> > mincore - yes
> > mlock, mlock2, munlock - yes
> > mmap - no (we may change this with MTE but not for TBI)
> > mmap_pgoff - not used on arm64
> > mprotect - yes
> > mremap - yes for old_address, no for new_address (on par with mmap)
> > msync - yes
> > munmap - probably no (mmap does not return tagged ptrs)
> > remap_file_pages - no (also deprecated syscall)
> > shmat, shmdt - shall we allow tagged addresses on shared memory?
> > 
> > The above is only about the TBI ABI while ignoring hardware MTE. For
> > the latter, we may want to change the mmap() to allow pre-colouring
> > on page fault which means that munmap()/mprotect() should also
> > support tagged pointers. Possibly mremap() as well but we need to
> > decide whether it should allow re-colouring the page (probably no,
> > in which case old_address and new_address should have the same tag).
> > For some of these we'll end up with arm64 specific wrappers again,
> > unless sparc ADI adopts exactly the same ABI restrictions.
> 
> Let us keep any restrictions common across ARM64 and sparc. pre-
> coloring on sparc in the kernel would mean kernel will have to execute
> stxa instructions in a loop for each page being faulted in.

Since the user can probe the pre-existing colour in a faulted-in page
(either with some 'ldxa' instruction or by performing a tag-checked
access), the kernel should always pre-colour (even if colour 0) any
allocated page. There might not be an obvious security risk but I feel
uneasy about letting colours leak between address spaces (different user
processes or between kernel and user).

Since we already need such loop in the kernel, we might as well allow
user space to require a certain colour. This comes in handy for large
malloc() and another advantage is that the C library won't be stuck
trying to paint the whole range (think GB).

> Not that big a deal but doesn't that assume the entire page has the
> same tag which is dedcued from the upper bits of address? Shouldn't we
> support tags at the same granularity level as what the hardware
> supports?

That's mostly about large malloc() optimisation via mmap(), the latter
working on page granularity already. There is another use-case for
pre-coloured thread stacks, also allocated via anonymous mmap().

> We went through this discussion for sparc and decision was to support
> tags at the same granularity as hardware. That means we can not deduce
> tags from the first address that pioints into an mmap or shmat region.
> Those tags and the upper bytes of colored address could change for
> every cacheline sized block (64-bytes on sparc M7).

It's 16-byte for arm64, so smaller than the cacheline.

> We can try to store tags for an entire region in vma but that is
> expensive, plus on sparc tags are set in userspace with no
> participation from kernel and now we need a way for userspace to
> communicate the tags to kernel.

We can't support finer granularity through the mmap() syscall and, as
you said, the vma is not the right thing to store the individual tags.
With the above extension to mmap(), we'd have to store a colour per vma
and prevent merging if different colours (we could as well use the
pkeys mechanism we already have in the kernel but use a colour per vma
instead of a key).

Of course, the user is allowed to change the in-memory colours at a
finer granularity and the kernel will preserve them during swapping
out/in, page migration etc. The above mmap() proposal is just for the
first fault-in of a page in a given range/vma.

> From sparc point of view, making kernel responsible for assigning tags
> to a page on page fault is full of pitfalls.

This could be just some arm64-specific but if you plan to deploy it more
generically for sparc (at the C library level), you may find this
useful.
Dave Martin May 29, 2019, 3:18 p.m. UTC | #18
On Wed, May 29, 2019 at 02:23:42PM +0100, Catalin Marinas wrote:
> On Wed, May 29, 2019 at 01:42:25PM +0100, Dave P Martin wrote:
> > On Tue, May 28, 2019 at 05:34:00PM +0100, Catalin Marinas wrote:
> > > On Tue, May 28, 2019 at 04:56:45PM +0100, Dave P Martin wrote:
> > > > On Tue, May 28, 2019 at 04:40:58PM +0100, Catalin Marinas wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > My thoughts on allowing tags (quick look):
> > > > >
> > > > > brk - no
> > > > 
> > > > [...]
> > > > 
> > > > > mlock, mlock2, munlock - yes
> > > > > mmap - no (we may change this with MTE but not for TBI)
> > > > 
> > > > [...]
> > > > 
> > > > > mprotect - yes
> > > > 
> > > > I haven't following this discussion closely... what's the rationale for
> > > > the inconsistencies here (feel free to refer me back to the discussion
> > > > if it's elsewhere).
> > > 
> > > _My_ rationale (feel free to disagree) is that mmap() by default would
> > > not return a tagged address (ignoring MTE for now). If it gets passed a
> > > tagged address or a "tagged NULL" (for lack of a better name) we don't
> > > have clear semantics of whether the returned address should be tagged in
> > > this ABI relaxation. I'd rather reserve this specific behaviour if we
> > > overload the non-zero tag meaning of mmap() for MTE. Similar reasoning
> > > for mremap(), at least on the new_address argument (not entirely sure
> > > about old_address).
> > > 
> > > munmap() should probably follow the mmap() rules.
> > > 
> > > As for brk(), I don't see why the user would need to pass a tagged
> > > address, we can't associate any meaning to this tag.
> > > 
> > > For the rest, since it's likely such addresses would have been tagged by
> > > malloc() in user space, we should allow tagged pointers.
> > 
> > Those arguments seem reasonable.  We should try to capture this
> > somewhere when documenting the ABI.
> > 
> > To be clear, I'm not sure that we should guarantee anywhere that a
> > tagged pointer is rejected: rather the behaviour should probably be
> > left unspecified.  Then we can tidy it up incrementally.
> > 
> > (The behaviour is unspecified today, in any case.)
> 
> What is specified (or rather de-facto ABI) today is that passing a user
> address above TASK_SIZE (e.g. non-zero top byte) would fail in most
> cases. If we relax this with the TBI we may end up with some de-facto

I may be being too picky, but "would fail in most cases" sounds like
"unspecified" ?

> ABI before we actually get MTE hardware. Tightening it afterwards may be
> slightly more problematic, although MTE needs to be an explicit opt-in.
> 
> IOW, I wouldn't want to unnecessarily relax the ABI if we don't need to.

So long we don't block foreseeable future developments unnecessarily
either -- I agree there's a balance to be struck.

I guess this can be reviewed when we have nailed down the details a bit
further.

Cheers
---Dave
Khalid Aziz May 29, 2019, 7:16 p.m. UTC | #19
On 5/29/19 8:20 AM, Catalin Marinas wrote:
> Hi Khalid,
> 
> On Tue, May 28, 2019 at 05:33:04PM -0600, Khalid Aziz wrote:
>> On Tue, 2019-05-28 at 16:40 +0100, Catalin Marinas wrote:
>>> I think another aspect is how we define the ABI. Is allowing tags to
>>> mlock() for example something specific to arm64 or would sparc ADI
>>> need the same? In the absence of other architectures defining such
>>> ABI, my preference would be to keep the wrappers in the arch code.
>>>
>>> Assuming sparc won't implement untagged_addr(), we can place the
>>> macros back in the generic code but, as per the review here, we need
>>> to be more restrictive on where we allow tagged addresses. For
>>> example, if mmap() gets a tagged address with MAP_FIXED, is it
>>> expected to return the tag?
>>
>> I would recommend against any ABI differences between ARM64 MTE/TBI and
>> sparc ADI unless it simply can not be helped. My understanding of
>> MTE/TBI is limited, so I will explain how sparc ADI works. On sparc, a
>> tagged address has no meaning until following steps happen:
> 
> Before we go into the MTE/ADI similarities or differences, just to
> clarify that TBI is something that we supported from the start of the
> arm64 kernel port. TBI (top byte ignore) allows a user pointer to have
> non-zero top byte and dereference it without causing a fault (the
> hardware masks it out). The user/kernel ABI does not allow such tagged
> pointers into the kernel, nor would the kernel return any such tagged
> addresses.
> 
> With MTE (memory tagging extensions), the top-byte meaning is changed
> from no longer being ignored to actually being checked against a tag in
> the physical RAM (we call it allocation tag).
> 
>> 1. set the user mode PSTATE.mcde bit. This acts as the master switch to
>> enable ADI for a process.
>>
>> 2. set TTE.mcd bit on TLB entries that match the address range ADI is
>> being enabled on.
> 
> Something close enough for MTE, with the difference that enabling it is
> not a PSTATE bit but rather a system control bit (SCTLR_EL1 register),
> so only the kernel can turn it on/off for the user.
> 
>> 3. Store version tag for the range of addresses userspace wants ADI
>> enabled on using "stxa" instruction. These tags are stored in physical
>> memory by the memory controller.
> 
> Do you have an "ldxa" instruction to load the tags from physical memory?

Yes, "ldxa" can be used to read current tag for any memory location.
Kernel uses it to read the tags for a physical page being swapped out
and restores those tags when the page is swapped back in.

> 
>> Steps 1 and 2 are accomplished by userspace by calling mprotect() with
>> PROT_ADI. Tags are set by storing tags in a loop, for example:
>>
>>         version = 10;
>>         tmp_addr = shmaddr;
>>         end = shmaddr + BUFFER_SIZE;
>>         while (tmp_addr < end) {
>>                 asm volatile(
>>                         "stxa %1, [%0]0x90\n\t"
>>                         :
>>                         : "r" (tmp_addr), "r" (version));
>>                 tmp_addr += adi_blksz;
>>         }
> 
> On arm64, a sequence similar to the above would live in the libc. So a
> malloc() call will tag the memory and return the tagged address to thePre-coloring could easily be done by 
> user.
> 
> We were not planning for a PROT_ADI/MTE but rather have MTE enabled for
> all user memory ranges. We may revisit this before we upstream the MTE
> support (probably some marginal benefit for the hardware not fetching
> the tags from memory if we don't need to, e.g. code sections).
> 
> Given that we already have the TBI feature and with MTE enabled the top
> byte is no longer ignored, we are planning for an explicit opt-in by the
> user via prctl() to enable MTE.

OK. I had initially proposed enabling ADI for a process using prctl().
Feedback I got was prctl was not a desirable interface and I ended up
making mprotect() with PROT_ADI enable ADI on the process instead. Just
something to keep in mind.

> 
>> With these semantics, giving mmap() or shamat() a tagged address is
>> meaningless since no tags have been stored at the addresses mmap() will
>> allocate and one can not store tags before memory range has been
>> allocated. If we choose to allow tagged addresses to come into mmap()
>> and shmat(), sparc code can strip the tags unconditionally and that may
>> help simplify ABI and/or code.
> 
> We could say that with TBI (pre-MTE support), the top byte is actually
> ignored on mmap(). Now, if you pass a MAP_FIXED with a tagged address,
> should the user expect the same tagged address back or stripping the tag
> is acceptable? If we want to keep the current mmap() semantics, I'd say
> the same tag is returned. However, with MTE this also implies that the
> memory was coloured.
> 

Is assigning a tag aprivileged operationon ARM64? I am thinking not
since you mentioned libc could do it in a loop for malloc'd memory.
mmap() can return the same tagged address but I am uneasy about kernel
pre-coloring the pages. Database can mmap 100's of GB of memory. That is
lot of work being offloaded to the kernel to pre-color the page even if
done in batches as pages are faulted in.

>>> My thoughts on allowing tags (quick look):
>>>
>>> brk - no
>>> get_mempolicy - yes
>>> madvise - yes
>>> mbind - yes
>>> mincore - yes
>>> mlock, mlock2, munlock - yes
>>> mmap - no (we may change this with MTE but not for TBI)
>>> mmap_pgoff - not used on arm64
>>> mprotect - yes
>>> mremap - yes for old_address, no for new_address (on par with mmap)
>>> msync - yes
>>> munmap - probably no (mmap does not return tagged ptrs)
>>> remap_file_pages - no (also deprecated syscall)
>>> shmat, shmdt - shall we allow tagged addresses on shared memory?
>>>
>>> The above is only about the TBI ABI while ignoring hardware MTE. For
>>> the latter, we may want to change the mmap() to allow pre-colouring
>>> on page fault which means that munmap()/mprotect() should also
>>> support tagged pointers. Possibly mremap() as well but we need to
>>> decide whether it should allow re-colouring the page (probably no,
>>> in which case old_address and new_address should have the same tag).
>>> For some of these we'll end up with arm64 specific wrappers again,
>>> unless sparc ADI adopts exactly the same ABI restrictions.
>>
>> Let us keep any restrictions common across ARM64 and sparc. pre-
>> coloring on sparc in the kernel would mean kernel will have to execute
>> stxa instructions in a loop for each page being faulted in.
> 
> Since the user can probe the pre-existing colour in a faulted-in page
> (either with some 'ldxa' instruction or by performing a tag-checked
> access), the kernel should always pre-colour (even if colour 0) any
> allocated page. There might not be an obvious security risk but I feel
> uneasy about letting colours leak between address spaces (different user
> processes or between kernel and user).

On sparc, tags 0 and 15 are special in that 0 means untagged memory and
15 means match any tag in the address. Colour 0 is the default for any
newly faulted in page on sparc.

> 
> Since we already need such loop in the kernel, we might as well allow
> user space to require a certain colour. This comes in handy for large
> malloc() and another advantage is that the C library won't be stuck
> trying to paint the whole range (think GB).

If kernel is going to pre-color all pages in a vma, we will need to
store the default tag in the vma. It will add more time to page fault
handling code. On sparc M7, kernel will need to execute additional 128
stxa instructions to set the tags on a normal page.

> 
>> Not that big a deal but doesn't that assume the entire page has the
>> same tag which is dedcued from the upper bits of address? Shouldn't we
>> support tags at the same granularity level as what the hardware
>> supports?
> 
> That's mostly about large malloc() optimisation via mmap(), the latter
> working on page granularity already. There is another use-case for
> pre-coloured thread stacks, also allocated via anonymous mmap().
> 
>> We went through this discussion for sparc and decision was to support
>> tags at the same granularity as hardware. That means we can not deduce
>> tags from the first address that pioints into an mmap or shmat region.
>> Those tags and the upper bytes of colored address could change for
>> every cacheline sized block (64-bytes on sparc M7).
> 
> It's 16-byte for arm64, so smaller than the cacheline.
> 
>> We can try to store tags for an entire region in vma but that is
>> expensive, plus on sparc tags are set in userspace with no
>> participation from kernel and now we need a way for userspace to
>> communicate the tags to kernel.
> 
> We can't support finer granularity through the mmap() syscall and, as
> you said, the vma is not the right thing to store the individual tags.
> With the above extension to mmap(), we'd have to store a colour per vma
> and prevent merging if different colours (we could as well use the
> pkeys mechanism we already have in the kernel but use a colour per vma
> instead of a key).

Since tags can change on any part of mmap region on sparc at any time
without kernel being involved, I am not sure I see much reason for
kernel to enforce any tag related restrictions.

> 
> Of course, the user is allowed to change the in-memory colours at a
> finer granularity and the kernel will preserve them during swapping
> out/in, page migration etc. The above mmap() proposal is just for the
> first fault-in of a page in a given range/vma.
> 
>> From sparc point of view, making kernel responsible for assigning tags
>> to a page on page fault is full of pitfalls.
> 
> This could be just some arm64-specific but if you plan to deploy it more
> generically for sparc (at the C library level), you may find this
> useful.
> 

Common semantics from app developer point of view will be very useful to
maintain. If arm64 says mmap with MAP_FIXED and a tagged address will
return a pre-colored page, I would rather have it be the same on any
architecture. Is there a use case that justifies kernel doing this extra
work?

--
Khalid
Catalin Marinas May 30, 2019, 3:11 p.m. UTC | #20
On Wed, May 29, 2019 at 01:16:37PM -0600, Khalid Aziz wrote:
> On 5/29/19 8:20 AM, Catalin Marinas wrote:
> > On Tue, May 28, 2019 at 05:33:04PM -0600, Khalid Aziz wrote:
> >> Steps 1 and 2 are accomplished by userspace by calling mprotect() with
> >> PROT_ADI. Tags are set by storing tags in a loop, for example:
> >>
> >>         version = 10;
> >>         tmp_addr = shmaddr;
> >>         end = shmaddr + BUFFER_SIZE;
> >>         while (tmp_addr < end) {
> >>                 asm volatile(
> >>                         "stxa %1, [%0]0x90\n\t"
> >>                         :
> >>                         : "r" (tmp_addr), "r" (version));
> >>                 tmp_addr += adi_blksz;
> >>         }
> > 
> > On arm64, a sequence similar to the above would live in the libc. So a
> > malloc() call will tag the memory and return the tagged address to thePre-coloring could easily be done by 
> > user.
> > 
> > We were not planning for a PROT_ADI/MTE but rather have MTE enabled for
> > all user memory ranges. We may revisit this before we upstream the MTE
> > support (probably some marginal benefit for the hardware not fetching
> > the tags from memory if we don't need to, e.g. code sections).
> > 
> > Given that we already have the TBI feature and with MTE enabled the top
> > byte is no longer ignored, we are planning for an explicit opt-in by the
> > user via prctl() to enable MTE.
> 
> OK. I had initially proposed enabling ADI for a process using prctl().
> Feedback I got was prctl was not a desirable interface and I ended up
> making mprotect() with PROT_ADI enable ADI on the process instead. Just
> something to keep in mind.

Thanks for the feedback. We'll keep this in mind when adding MTE
support. In the way we plan to deploy this, it would be a libc decision
to invoke the mmap() with the right flag.

This could actually simplify the automatic page faulting below brk(),
basically no tagged/coloured memory allowed implicitly. It needs
feedback from the bionic/glibc folk.

> >> With these semantics, giving mmap() or shamat() a tagged address is
> >> meaningless since no tags have been stored at the addresses mmap() will
> >> allocate and one can not store tags before memory range has been
> >> allocated. If we choose to allow tagged addresses to come into mmap()
> >> and shmat(), sparc code can strip the tags unconditionally and that may
> >> help simplify ABI and/or code.
> > 
> > We could say that with TBI (pre-MTE support), the top byte is actually
> > ignored on mmap(). Now, if you pass a MAP_FIXED with a tagged address,
> > should the user expect the same tagged address back or stripping the tag
> > is acceptable? If we want to keep the current mmap() semantics, I'd say
> > the same tag is returned. However, with MTE this also implies that the
> > memory was coloured.
> 
> Is assigning a tag aprivileged operationon ARM64? I am thinking not
> since you mentioned libc could do it in a loop for malloc'd memory.

Indeed it's not, the user can do it.

> mmap() can return the same tagged address but I am uneasy about kernel
> pre-coloring the pages. Database can mmap 100's of GB of memory. That is
> lot of work being offloaded to the kernel to pre-color the page even if
> done in batches as pages are faulted in.

For anonymous mmap() for example, the kernel would have to zero the
faulted in pages anyway. We can handle the colouring at the same time in
clear_user_page() (as I said below, we have to clear the colour anyway
from previous uses, so it's simply extending this to support something
other than tag/colour 0 by default with no additional overhead).

> > Since the user can probe the pre-existing colour in a faulted-in page
> > (either with some 'ldxa' instruction or by performing a tag-checked
> > access), the kernel should always pre-colour (even if colour 0) any
> > allocated page. There might not be an obvious security risk but I feel
> > uneasy about letting colours leak between address spaces (different user
> > processes or between kernel and user).
> 
> On sparc, tags 0 and 15 are special in that 0 means untagged memory and
> 15 means match any tag in the address. Colour 0 is the default for any
> newly faulted in page on sparc.

With MTE we don't have match-all/any tag in memory, only in the virtual
address/pointer. So if we turn on MTE for all pages and the user
accesses an address with a 0 tag, the underlying memory needs to be
coloured with the same 0 value.

> > Since we already need such loop in the kernel, we might as well allow
> > user space to require a certain colour. This comes in handy for large
> > malloc() and another advantage is that the C library won't be stuck
> > trying to paint the whole range (think GB).
> 
> If kernel is going to pre-color all pages in a vma, we will need to
> store the default tag in the vma. It will add more time to page fault
> handling code. On sparc M7, kernel will need to execute additional 128
> stxa instructions to set the tags on a normal page.

As I said, since the user can retrieve an old colour using ldxa, the
kernel should perform this operation anyway on any newly allocated page
(unless you clear the existing colour on page freeing).

> >> We can try to store tags for an entire region in vma but that is
> >> expensive, plus on sparc tags are set in userspace with no
> >> participation from kernel and now we need a way for userspace to
> >> communicate the tags to kernel.
> > 
> > We can't support finer granularity through the mmap() syscall and, as
> > you said, the vma is not the right thing to store the individual tags.
> > With the above extension to mmap(), we'd have to store a colour per vma
> > and prevent merging if different colours (we could as well use the
> > pkeys mechanism we already have in the kernel but use a colour per vma
> > instead of a key).
> 
> Since tags can change on any part of mmap region on sparc at any time
> without kernel being involved, I am not sure I see much reason for
> kernel to enforce any tag related restrictions.

It's not enforcing a tag, more like the default colour for a faulted in
page. Anyway, if sparc is going with default 0/untagged, that's fine as
well. We may add this mmap() option to arm64 only.

> >> From sparc point of view, making kernel responsible for assigning tags
> >> to a page on page fault is full of pitfalls.
> > 
> > This could be just some arm64-specific but if you plan to deploy it more
> > generically for sparc (at the C library level), you may find this
> > useful.
> 
> Common semantics from app developer point of view will be very useful to
> maintain. If arm64 says mmap with MAP_FIXED and a tagged address will
> return a pre-colored page, I would rather have it be the same on any
> architecture. Is there a use case that justifies kernel doing this extra
> work?

So if a database program is doing an anonymous mmap(PROT_TBI) of 100GB,
IIUC for sparc the faulted-in pages will have random colours (on 64-byte
granularity). Ignoring the information leak from prior uses of such
pages, it would be the responsibility of the db program to issue the
stxa. On arm64, since we also want to do this via malloc(), any large
allocation would require all pages to be faulted in so that malloc() can
set the write colour before being handed over to the user. That's what
we want to avoid and the user is free to repaint the memory as it likes.
Khalid Aziz May 30, 2019, 4:05 p.m. UTC | #21
On 5/30/19 9:11 AM, Catalin Marinas wrote:
> On Wed, May 29, 2019 at 01:16:37PM -0600, Khalid Aziz wrote:
>> mmap() can return the same tagged address but I am uneasy about kernel
>> pre-coloring the pages. Database can mmap 100's of GB of memory. That is
>> lot of work being offloaded to the kernel to pre-color the page even if
>> done in batches as pages are faulted in.
> 
> For anonymous mmap() for example, the kernel would have to zero the
> faulted in pages anyway. We can handle the colouring at the same time in
> clear_user_page() (as I said below, we have to clear the colour anyway
> from previous uses, so it's simply extending this to support something
> other than tag/colour 0 by default with no additional overhead).
> 

On sparc M7, clear_user_page() ends up in M7clear_user_page defined in
arch/sparc/lib/M7memset.S. M7 code use Block Init Store (BIS) to clear
the page. BIS on M7 clears the memory tags as well and no separate
instructions are needed to clear the tags. As a result when kernel
clears a page before returning it to user, the page is not only zeroed
out, its tags are also cleared to 0.

>>> Since we already need such loop in the kernel, we might as well allow
>>> user space to require a certain colour. This comes in handy for large
>>> malloc() and another advantage is that the C library won't be stuck
>>> trying to paint the whole range (think GB).
>>
>> If kernel is going to pre-color all pages in a vma, we will need to
>> store the default tag in the vma. It will add more time to page fault
>> handling code. On sparc M7, kernel will need to execute additional 128
>> stxa instructions to set the tags on a normal page.
> 
> As I said, since the user can retrieve an old colour using ldxa, the
> kernel should perform this operation anyway on any newly allocated page
> (unless you clear the existing colour on page freeing).>

Tags are not cleared on sparc on freeing. They get cleared when the page
is allocated again.

>>>> We can try to store tags for an entire region in vma but that is
>>>> expensive, plus on sparc tags are set in userspace with no
>>>> participation from kernel and now we need a way for userspace to
>>>> communicate the tags to kernel.
>>>
>>> We can't support finer granularity through the mmap() syscall and, as
>>> you said, the vma is not the right thing to store the individual tags.
>>> With the above extension to mmap(), we'd have to store a colour per vma
>>> and prevent merging if different colours (we could as well use the
>>> pkeys mechanism we already have in the kernel but use a colour per vma
>>> instead of a key).
>>
>> Since tags can change on any part of mmap region on sparc at any time
>> without kernel being involved, I am not sure I see much reason for
>> kernel to enforce any tag related restrictions.
> 
> It's not enforcing a tag, more like the default colour for a faulted in
> page. Anyway, if sparc is going with default 0/untagged, that's fine as
> well. We may add this mmap() option to arm64 only.
> 
>>>> From sparc point of view, making kernel responsible for assigning tags
>>>> to a page on page fault is full of pitfalls.
>>>
>>> This could be just some arm64-specific but if you plan to deploy it more
>>> generically for sparc (at the C library level), you may find this
>>> useful.
>>
>> Common semantics from app developer point of view will be very useful to
>> maintain. If arm64 says mmap with MAP_FIXED and a tagged address will
>> return a pre-colored page, I would rather have it be the same on any
>> architecture. Is there a use case that justifies kernel doing this extra
>> work?
> 
> So if a database program is doing an anonymous mmap(PROT_TBI) of 100GB,
> IIUC for sparc the faulted-in pages will have random colours (on 64-byte
> granularity). Ignoring the information leak from prior uses of such
> pages, it would be the responsibility of the db program to issue the
> stxa. On arm64, since we also want to do this via malloc(), any large
> allocation would require all pages to be faulted in so that malloc() can
> set the write colour before being handed over to the user. That's what
> we want to avoid and the user is free to repaint the memory as it likes.
> 

On sparc, any newly allocated page is cleared along with any old tags on
it. Since clearing tag happens automatically when page is cleared on
sparc, clear_user_page() will need to execute additional stxa
instructions to set a new tag. It is doable. In a way it is done already
if page is being pre-colored with tag 0 always ;) Where would the
pre-defined tag be stored - as part of address stored in vm_start or a
new field in vm_area_struct?

--
Khalid
Catalin Marinas May 30, 2019, 4:57 p.m. UTC | #22
On Thu, May 30, 2019 at 10:05:55AM -0600, Khalid Aziz wrote:
> On 5/30/19 9:11 AM, Catalin Marinas wrote:
> > So if a database program is doing an anonymous mmap(PROT_TBI) of 100GB,
> > IIUC for sparc the faulted-in pages will have random colours (on 64-byte
> > granularity). Ignoring the information leak from prior uses of such
> > pages, it would be the responsibility of the db program to issue the
> > stxa. On arm64, since we also want to do this via malloc(), any large
> > allocation would require all pages to be faulted in so that malloc() can
> > set the write colour before being handed over to the user. That's what
> > we want to avoid and the user is free to repaint the memory as it likes.
> 
> On sparc, any newly allocated page is cleared along with any old tags on
> it. Since clearing tag happens automatically when page is cleared on
> sparc, clear_user_page() will need to execute additional stxa
> instructions to set a new tag. It is doable. In a way it is done already
> if page is being pre-colored with tag 0 always ;)

Ah, good to know. On arm64 we'd have to use different instructions,
although the same loop.

> Where would the pre-defined tag be stored - as part of address stored
> in vm_start or a new field in vm_area_struct?

I think we can discuss the details when we post the actual MTE patches.
In our internal hack we overloaded the VM_HIGH_ARCH_* flags and selected
CONFIG_ARCH_USES_HIGH_VMA_FLAGS (used for pkeys on x86).

For the time being, I'd rather restrict tagged addresses passed to
mmap() until we agreed that they have any meaning. If we allowed them
now but get ignored (though probably no-one would be doing this), I feel
it's slightly harder to change the semantics afterwards.

Thanks.

Patch
diff mbox series

diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
index b44065fb1616..933bb9f3d6ec 100644
--- a/arch/arm64/kernel/sys.c
+++ b/arch/arm64/kernel/sys.c
@@ -35,10 +35,33 @@  SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 {
 	if (offset_in_page(off) != 0)
 		return -EINVAL;
-
+	addr = untagged_addr(addr);
 	return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
 }
 
+SYSCALL_DEFINE6(arm64_mmap_pgoff, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
+{
+	addr = untagged_addr(addr);
+	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
+}
+
+SYSCALL_DEFINE5(arm64_mremap, unsigned long, addr, unsigned long, old_len,
+		unsigned long, new_len, unsigned long, flags,
+		unsigned long, new_addr)
+{
+	addr = untagged_addr(addr);
+	new_addr = untagged_addr(new_addr);
+	return ksys_mremap(addr, old_len, new_len, flags, new_addr);
+}
+
+SYSCALL_DEFINE2(arm64_munmap, unsigned long, addr, size_t, len)
+{
+	addr = untagged_addr(addr);
+	return ksys_munmap(addr, len);
+}
+
 SYSCALL_DEFINE1(arm64_personality, unsigned int, personality)
 {
 	if (personality(personality) == PER_LINUX32 &&
@@ -47,10 +70,113 @@  SYSCALL_DEFINE1(arm64_personality, unsigned int, personality)
 	return ksys_personality(personality);
 }
 
+SYSCALL_DEFINE1(arm64_brk, unsigned long, brk)
+{
+	brk = untagged_addr(brk);
+	return ksys_brk(brk);
+}
+
+SYSCALL_DEFINE5(arm64_get_mempolicy, int __user *, policy,
+		unsigned long __user *, nmask, unsigned long, maxnode,
+		unsigned long, addr, unsigned long, flags)
+{
+	addr = untagged_addr(addr);
+	return ksys_get_mempolicy(policy, nmask, maxnode, addr, flags);
+}
+
+SYSCALL_DEFINE3(arm64_madvise, unsigned long, start,
+		size_t, len_in, int, behavior)
+{
+	start = untagged_addr(start);
+	return ksys_madvise(start, len_in, behavior);
+}
+
+SYSCALL_DEFINE6(arm64_mbind, unsigned long, start, unsigned long, len,
+		unsigned long, mode, const unsigned long __user *, nmask,
+		unsigned long, maxnode, unsigned int, flags)
+{
+	start = untagged_addr(start);
+	return ksys_mbind(start, len, mode, nmask, maxnode, flags);
+}
+
+SYSCALL_DEFINE2(arm64_mlock, unsigned long, start, size_t, len)
+{
+	start = untagged_addr(start);
+	return ksys_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
+{
+	start = untagged_addr(start);
+	return ksys_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE2(arm64_munlock, unsigned long, start, size_t, len)
+{
+	start = untagged_addr(start);
+	return ksys_munlock(start, len);
+}
+
+SYSCALL_DEFINE3(arm64_mprotect, unsigned long, start, size_t, len,
+		unsigned long, prot)
+{
+	start = untagged_addr(start);
+	return ksys_mprotect_pkey(start, len, prot, -1);
+}
+
+SYSCALL_DEFINE3(arm64_msync, unsigned long, start, size_t, len, int, flags)
+{
+	start = untagged_addr(start);
+	return ksys_msync(start, len, flags);
+}
+
+SYSCALL_DEFINE3(arm64_mincore, unsigned long, start, size_t, len,
+		unsigned char __user *, vec)
+{
+	start = untagged_addr(start);
+	return ksys_mincore(start, len, vec);
+}
+
+SYSCALL_DEFINE5(arm64_remap_file_pages, unsigned long, start,
+		unsigned long, size, unsigned long, prot,
+		unsigned long, pgoff, unsigned long, flags)
+{
+	start = untagged_addr(start);
+	return ksys_remap_file_pages(start, size, prot, pgoff, flags);
+}
+
+SYSCALL_DEFINE3(arm64_shmat, int, shmid, char __user *, shmaddr, int, shmflg)
+{
+	shmaddr = untagged_addr(shmaddr);
+	return ksys_shmat(shmid, shmaddr, shmflg);
+}
+
+SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
+{
+	shmaddr = untagged_addr(shmaddr);
+	return ksys_shmdt(shmaddr);
+}
+
 /*
  * Wrappers to pass the pt_regs argument.
  */
 #define sys_personality		sys_arm64_personality
+#define sys_mmap_pgoff		sys_arm64_mmap_pgoff
+#define sys_mremap		sys_arm64_mremap
+#define sys_munmap		sys_arm64_munmap
+#define sys_brk			sys_arm64_brk
+#define sys_get_mempolicy	sys_arm64_get_mempolicy
+#define sys_madvise		sys_arm64_madvise
+#define sys_mbind		sys_arm64_mbind
+#define sys_mlock		sys_arm64_mlock
+#define sys_mlock2		sys_arm64_mlock2
+#define sys_munlock		sys_arm64_munlock
+#define sys_mprotect		sys_arm64_mprotect
+#define sys_msync		sys_arm64_msync
+#define sys_mincore		sys_arm64_mincore
+#define sys_remap_file_pages	sys_arm64_remap_file_pages
+#define sys_shmat		sys_arm64_shmat
+#define sys_shmdt		sys_arm64_shmdt
 
 asmlinkage long sys_ni_syscall(const struct pt_regs *);
 #define __arm64_sys_ni_syscall	sys_ni_syscall