diff mbox

arm64: Add translation functions for /dev/mem read/write

Message ID 1493756885-29704-1-git-send-email-sgoel@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Goel, Sameer May 2, 2017, 8:28 p.m. UTC
Port architecture specific xlate and unxlate functions for /dev/mem
read/write. This sets up the mapping for a valid physical address if a
kernel direct mapping is not already present.

This is a generic issue as a user space app should not be allowed to crash
the kernel. This issue was observed when systemd tried to access performance
pointer record from the FPDT table.

Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem
read/write")

Crash Signature:
 Unable to handle kernel paging request at virtual address ffff800008ff0000
 pgd = ffff8007de8b2200
 [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000
 Internal error: Oops: 96000007 [#1] SMP
................
 CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1
 task: ffff8007c0820000 task.stack: ffff8007c0900000
 PC is at __arch_copy_to_user+0xb4/0x280
 LR is at read_mem+0xc0/0x138
 pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>]
pstate: 80000145
 sp : ffff8007c0903d40
....................
 x3 : ffff800800000000 x2 : 0000000000000008
 x1 : ffff800008ff0000 x0 : 0000fffff6fdac00
....................
 Call trace:
 Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0)
 [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280
 [<ffff0000082454d0>] __vfs_read+0x48/0x130
 [<ffff0000082467dc>] vfs_read+0x8c/0x148
 [<ffff000008247a34>] SyS_pread64+0x94/0xa8
 [<ffff0000080833b0>] el0_svc_naked+0x24/0x28
 Code: a88120c7 d503201f d503201f 36180082 (f8408423)

Signed-off-by: Sameer Goel <sgoel@codeaurora.org>
Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
---
 arch/arm64/include/asm/io.h |  5 +++++
 arch/arm64/mm/ioremap.c     | 31 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

Comments

Will Deacon May 3, 2017, 11:26 a.m. UTC | #1
[adding some /dev/mem fans to cc]

On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
> Port architecture specific xlate and unxlate functions for /dev/mem
> read/write. This sets up the mapping for a valid physical address if a
> kernel direct mapping is not already present.
> 
> This is a generic issue as a user space app should not be allowed to crash
> the kernel.

> This issue was observed when systemd tried to access performance
> pointer record from the FPDT table.

Why is it doing that? Is there not a way to get this via /sys?

> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem
> read/write")
> 
> Crash Signature:
>  Unable to handle kernel paging request at virtual address ffff800008ff0000
>  pgd = ffff8007de8b2200
>  [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000
>  Internal error: Oops: 96000007 [#1] SMP
> ................
>  CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1
>  task: ffff8007c0820000 task.stack: ffff8007c0900000
>  PC is at __arch_copy_to_user+0xb4/0x280
>  LR is at read_mem+0xc0/0x138
>  pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>]
> pstate: 80000145
>  sp : ffff8007c0903d40
> ....................
>  x3 : ffff800800000000 x2 : 0000000000000008
>  x1 : ffff800008ff0000 x0 : 0000fffff6fdac00
> ....................
>  Call trace:
>  Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0)
>  [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280
>  [<ffff0000082454d0>] __vfs_read+0x48/0x130
>  [<ffff0000082467dc>] vfs_read+0x8c/0x148
>  [<ffff000008247a34>] SyS_pread64+0x94/0xa8
>  [<ffff0000080833b0>] el0_svc_naked+0x24/0x28

So this certainly looks like a kernel bug, but I don't think your patch is
the right way to fix it.

>  Code: a88120c7 d503201f d503201f 36180082 (f8408423)
> 
> Signed-off-by: Sameer Goel <sgoel@codeaurora.org>
> Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
> ---
>  arch/arm64/include/asm/io.h |  5 +++++
>  arch/arm64/mm/ioremap.c     | 31 +++++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> index 0c00c87..c869ea4 100644
> --- a/arch/arm64/include/asm/io.h
> +++ b/arch/arm64/include/asm/io.h
> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
>  #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
>  #define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
>  
> +extern void *xlate_dev_mem_ptr(phys_addr_t phys);
> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
> +
> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr
> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
>  #include <asm-generic/io.h>
>  
>  /*
> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
> index c4c8cd4..ba7e63b 100644
> --- a/arch/arm64/mm/ioremap.c
> +++ b/arch/arm64/mm/ioremap.c
> @@ -24,6 +24,7 @@
>  #include <linux/mm.h>
>  #include <linux/vmalloc.h>
>  #include <linux/io.h>
> +#include <linux/memblock.h>
>  
>  #include <asm/fixmap.h>
>  #include <asm/tlbflush.h>
> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
>  EXPORT_SYMBOL(ioremap_cache);
>  
>  /*
> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
> + * access
> + */
> +void *xlate_dev_mem_ptr(phys_addr_t phys)
> +{
> +	unsigned long start  = phys &  PAGE_MASK;
> +	unsigned long offset = phys & ~PAGE_MASK;
> +	void *vaddr;
> +
> +	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
> +	if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys))
> +		return __va(phys);
> +
> +	vaddr = ioremap_cache(start, PAGE_SIZE);

Blindly using ioremap like this looks unsafe, since we could accidentally
set conflict with the attributes of a mapping used by something else (e.g.
firmware running on another CPU).

I'd like to understand more about the crash, so we can see work out how to
fix this properly.

Will
Goel, Sameer May 3, 2017, 5:07 p.m. UTC | #2
On 5/3/2017 5:26 AM, Will Deacon wrote:
> [adding some /dev/mem fans to cc]
> 
> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
>> Port architecture specific xlate and unxlate functions for /dev/mem
>> read/write. This sets up the mapping for a valid physical address if a
>> kernel direct mapping is not already present.
>>
>> This is a generic issue as a user space app should not be allowed to crash
>> the kernel.
> 
>> This issue was observed when systemd tried to access performance
>> pointer record from the FPDT table.
> 
> Why is it doing that? Is there not a way to get this via /sys?
There is no ACPI FPDT implementation in the kernel, so the userspace systemd code is getting the FPDT table contents from /sys 
and parsing the entries. The performance pointer record is a reserved address populated by UEFI and the userspace code tries to 
access it using /dev/mem. The physical address is valid, so cannot push back on the user space code.

https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c
http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf  5.2.23 
> 
>> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem
>> read/write")
>>
>> Crash Signature:
>>  Unable to handle kernel paging request at virtual address ffff800008ff0000
>>  pgd = ffff8007de8b2200
>>  [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000
>>  Internal error: Oops: 96000007 [#1] SMP
>> ................
>>  CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1
>>  task: ffff8007c0820000 task.stack: ffff8007c0900000
>>  PC is at __arch_copy_to_user+0xb4/0x280
>>  LR is at read_mem+0xc0/0x138
>>  pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>]
>> pstate: 80000145
>>  sp : ffff8007c0903d40
>> ....................
>>  x3 : ffff800800000000 x2 : 0000000000000008
>>  x1 : ffff800008ff0000 x0 : 0000fffff6fdac00
>> ....................
>>  Call trace:
>>  Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0)
>>  [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280
>>  [<ffff0000082454d0>] __vfs_read+0x48/0x130
>>  [<ffff0000082467dc>] vfs_read+0x8c/0x148
>>  [<ffff000008247a34>] SyS_pread64+0x94/0xa8
>>  [<ffff0000080833b0>] el0_svc_naked+0x24/0x28
> 
> So this certainly looks like a kernel bug, but I don't think your patch is
> the right way to fix it.

I agree that the reserved regions are not meant to be accessed by the kernel as system 
ram. So, another option was to to return a NULL for this translation.

Since, the same usage was working on other architectures I ported over the same code to
highlight the issue.

> 
>>  Code: a88120c7 d503201f d503201f 36180082 (f8408423)
>>
>> Signed-off-by: Sameer Goel <sgoel@codeaurora.org>
>> Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
>> ---
>>  arch/arm64/include/asm/io.h |  5 +++++
>>  arch/arm64/mm/ioremap.c     | 31 +++++++++++++++++++++++++++++++
>>  2 files changed, 36 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
>> index 0c00c87..c869ea4 100644
>> --- a/arch/arm64/include/asm/io.h
>> +++ b/arch/arm64/include/asm/io.h
>> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
>>  #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
>>  #define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
>>  
>> +extern void *xlate_dev_mem_ptr(phys_addr_t phys);
>> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
>> +
>> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr
>> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
>>  #include <asm-generic/io.h>
>>  
>>  /*
>> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
>> index c4c8cd4..ba7e63b 100644
>> --- a/arch/arm64/mm/ioremap.c
>> +++ b/arch/arm64/mm/ioremap.c
>> @@ -24,6 +24,7 @@
>>  #include <linux/mm.h>
>>  #include <linux/vmalloc.h>
>>  #include <linux/io.h>
>> +#include <linux/memblock.h>
>>  
>>  #include <asm/fixmap.h>
>>  #include <asm/tlbflush.h>
>> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
>>  EXPORT_SYMBOL(ioremap_cache);
>>  
>>  /*
>> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
>> + * access
>> + */
>> +void *xlate_dev_mem_ptr(phys_addr_t phys)
>> +{
>> +	unsigned long start  = phys &  PAGE_MASK;
>> +	unsigned long offset = phys & ~PAGE_MASK;
>> +	void *vaddr;
>> +
>> +	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
>> +	if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys))
>> +		return __va(phys);
>> +
>> +	vaddr = ioremap_cache(start, PAGE_SIZE);
> 
> Blindly using ioremap like this looks unsafe, since we could accidentally
> set conflict with the attributes of a mapping used by something else (e.g.
> firmware running on another CPU).
> 
> I'd like to understand more about the crash, so we can see work out how to
> fix this properly.
>
This does opens up access to any valid physical address. In the short term we 
can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP.

Eventually we might need to add another memory type to make sure that it can be mapped.
I have not though about the exact design here.

Thanks,
Sameer
 
> Will
>
Leif Lindholm May 3, 2017, 8:18 p.m. UTC | #3
On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
> On 5/3/2017 5:26 AM, Will Deacon wrote:
> > [adding some /dev/mem fans to cc]
> > 
> > On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
> >> Port architecture specific xlate and unxlate functions for /dev/mem
> >> read/write. This sets up the mapping for a valid physical address if a
> >> kernel direct mapping is not already present.
> >>
> >> This is a generic issue as a user space app should not be allowed to crash
> >> the kernel.
> > 
> >> This issue was observed when systemd tried to access performance
> >> pointer record from the FPDT table.
> > 
> > Why is it doing that? Is there not a way to get this via /sys?
>
> There is no ACPI FPDT implementation in the kernel, so the userspace
> systemd code is getting the FPDT table contents from /sys
> and parsing the entries. The performance pointer record is a
> reserved address populated by UEFI and the userspace code tries to
> access it using /dev/mem. The physical address is valid, so cannot
> push back on the user space code.

OK, so then we need to add support for parsing this table in the
kernel and exposing the referred-to regions in a controllable fashion.
Maybe something that belongs under /sys/firmware/efi (adding Matt), or
maybe something that deserves its own driver.

The only two use-cases for /dev/mem on arm64 are:
- Implementing interfaces in the kernel takes up-front effort.
- Being able to accidentally panic the kernel from userland.

/
    Leif

> https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c
> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf  5.2.23 
> > 
> >> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem
> >> read/write")
> >>
> >> Crash Signature:
> >>  Unable to handle kernel paging request at virtual address ffff800008ff0000
> >>  pgd = ffff8007de8b2200
> >>  [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000
> >>  Internal error: Oops: 96000007 [#1] SMP
> >> ................
> >>  CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1
> >>  task: ffff8007c0820000 task.stack: ffff8007c0900000
> >>  PC is at __arch_copy_to_user+0xb4/0x280
> >>  LR is at read_mem+0xc0/0x138
> >>  pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>]
> >> pstate: 80000145
> >>  sp : ffff8007c0903d40
> >> ....................
> >>  x3 : ffff800800000000 x2 : 0000000000000008
> >>  x1 : ffff800008ff0000 x0 : 0000fffff6fdac00
> >> ....................
> >>  Call trace:
> >>  Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0)
> >>  [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280
> >>  [<ffff0000082454d0>] __vfs_read+0x48/0x130
> >>  [<ffff0000082467dc>] vfs_read+0x8c/0x148
> >>  [<ffff000008247a34>] SyS_pread64+0x94/0xa8
> >>  [<ffff0000080833b0>] el0_svc_naked+0x24/0x28
> > 
> > So this certainly looks like a kernel bug, but I don't think your patch is
> > the right way to fix it.
> 
> I agree that the reserved regions are not meant to be accessed by the kernel as system 
> ram. So, another option was to to return a NULL for this translation.
> 
> Since, the same usage was working on other architectures I ported over the same code to
> highlight the issue.
> 
> > 
> >>  Code: a88120c7 d503201f d503201f 36180082 (f8408423)
> >>
> >> Signed-off-by: Sameer Goel <sgoel@codeaurora.org>
> >> Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
> >> ---
> >>  arch/arm64/include/asm/io.h |  5 +++++
> >>  arch/arm64/mm/ioremap.c     | 31 +++++++++++++++++++++++++++++++
> >>  2 files changed, 36 insertions(+)
> >>
> >> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> >> index 0c00c87..c869ea4 100644
> >> --- a/arch/arm64/include/asm/io.h
> >> +++ b/arch/arm64/include/asm/io.h
> >> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
> >>  #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
> >>  #define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
> >>  
> >> +extern void *xlate_dev_mem_ptr(phys_addr_t phys);
> >> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
> >> +
> >> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr
> >> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
> >>  #include <asm-generic/io.h>
> >>  
> >>  /*
> >> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
> >> index c4c8cd4..ba7e63b 100644
> >> --- a/arch/arm64/mm/ioremap.c
> >> +++ b/arch/arm64/mm/ioremap.c
> >> @@ -24,6 +24,7 @@
> >>  #include <linux/mm.h>
> >>  #include <linux/vmalloc.h>
> >>  #include <linux/io.h>
> >> +#include <linux/memblock.h>
> >>  
> >>  #include <asm/fixmap.h>
> >>  #include <asm/tlbflush.h>
> >> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
> >>  EXPORT_SYMBOL(ioremap_cache);
> >>  
> >>  /*
> >> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
> >> + * access
> >> + */
> >> +void *xlate_dev_mem_ptr(phys_addr_t phys)
> >> +{
> >> +	unsigned long start  = phys &  PAGE_MASK;
> >> +	unsigned long offset = phys & ~PAGE_MASK;
> >> +	void *vaddr;
> >> +
> >> +	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
> >> +	if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys))
> >> +		return __va(phys);
> >> +
> >> +	vaddr = ioremap_cache(start, PAGE_SIZE);
> > 
> > Blindly using ioremap like this looks unsafe, since we could accidentally
> > set conflict with the attributes of a mapping used by something else (e.g.
> > firmware running on another CPU).
> > 
> > I'd like to understand more about the crash, so we can see work out how to
> > fix this properly.
> >
> This does opens up access to any valid physical address. In the short term we 
> can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP.
> 
> Eventually we might need to add another memory type to make sure that it can be mapped.
> I have not though about the exact design here.
> 
> Thanks,
> Sameer
>  
> > Will
> > 
> 
> -- 
>  Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
Goel, Sameer May 3, 2017, 9:47 p.m. UTC | #4
On 5/3/2017 2:18 PM, Leif Lindholm wrote:
> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
>> On 5/3/2017 5:26 AM, Will Deacon wrote:
>>> [adding some /dev/mem fans to cc]
>>>
>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
>>>> Port architecture specific xlate and unxlate functions for /dev/mem
>>>> read/write. This sets up the mapping for a valid physical address if a
>>>> kernel direct mapping is not already present.
>>>>
>>>> This is a generic issue as a user space app should not be allowed to crash
>>>> the kernel.
>>>
>>>> This issue was observed when systemd tried to access performance
>>>> pointer record from the FPDT table.
>>>
>>> Why is it doing that? Is there not a way to get this via /sys?
>>
>> There is no ACPI FPDT implementation in the kernel, so the userspace
>> systemd code is getting the FPDT table contents from /sys
>> and parsing the entries. The performance pointer record is a
>> reserved address populated by UEFI and the userspace code tries to
>> access it using /dev/mem. The physical address is valid, so cannot
>> push back on the user space code.
> 
> OK, so then we need to add support for parsing this table in the
> kernel and exposing the referred-to regions in a controllable fashion.
> Maybe something that belongs under /sys/firmware/efi (adding Matt), or
> maybe something that deserves its own driver.
> 
> The only two use-cases for /dev/mem on arm64 are:
> - Implementing interfaces in the kernel takes up-front effort.
> - Being able to accidentally panic the kernel from userland.
> 
We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked
memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support.

- Sameer
> /
>     Leif
> 
>> https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c
>> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf  5.2.23 
>>>
>>>> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem
>>>> read/write")
>>>>
>>>> Crash Signature:
>>>>  Unable to handle kernel paging request at virtual address ffff800008ff0000
>>>>  pgd = ffff8007de8b2200
>>>>  [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000
>>>>  Internal error: Oops: 96000007 [#1] SMP
>>>> ................
>>>>  CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1
>>>>  task: ffff8007c0820000 task.stack: ffff8007c0900000
>>>>  PC is at __arch_copy_to_user+0xb4/0x280
>>>>  LR is at read_mem+0xc0/0x138
>>>>  pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>]
>>>> pstate: 80000145
>>>>  sp : ffff8007c0903d40
>>>> ....................
>>>>  x3 : ffff800800000000 x2 : 0000000000000008
>>>>  x1 : ffff800008ff0000 x0 : 0000fffff6fdac00
>>>> ....................
>>>>  Call trace:
>>>>  Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0)
>>>>  [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280
>>>>  [<ffff0000082454d0>] __vfs_read+0x48/0x130
>>>>  [<ffff0000082467dc>] vfs_read+0x8c/0x148
>>>>  [<ffff000008247a34>] SyS_pread64+0x94/0xa8
>>>>  [<ffff0000080833b0>] el0_svc_naked+0x24/0x28
>>>
>>> So this certainly looks like a kernel bug, but I don't think your patch is
>>> the right way to fix it.
>>
>> I agree that the reserved regions are not meant to be accessed by the kernel as system 
>> ram. So, another option was to to return a NULL for this translation.
>>
>> Since, the same usage was working on other architectures I ported over the same code to
>> highlight the issue.
>>
>>>
>>>>  Code: a88120c7 d503201f d503201f 36180082 (f8408423)
>>>>
>>>> Signed-off-by: Sameer Goel <sgoel@codeaurora.org>
>>>> Tested-by: Shanker Donthineni <shankerd@codeaurora.org>
>>>> ---
>>>>  arch/arm64/include/asm/io.h |  5 +++++
>>>>  arch/arm64/mm/ioremap.c     | 31 +++++++++++++++++++++++++++++++
>>>>  2 files changed, 36 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
>>>> index 0c00c87..c869ea4 100644
>>>> --- a/arch/arm64/include/asm/io.h
>>>> +++ b/arch/arm64/include/asm/io.h
>>>> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
>>>>  #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
>>>>  #define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
>>>>  
>>>> +extern void *xlate_dev_mem_ptr(phys_addr_t phys);
>>>> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
>>>> +
>>>> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr
>>>> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
>>>>  #include <asm-generic/io.h>
>>>>  
>>>>  /*
>>>> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
>>>> index c4c8cd4..ba7e63b 100644
>>>> --- a/arch/arm64/mm/ioremap.c
>>>> +++ b/arch/arm64/mm/ioremap.c
>>>> @@ -24,6 +24,7 @@
>>>>  #include <linux/mm.h>
>>>>  #include <linux/vmalloc.h>
>>>>  #include <linux/io.h>
>>>> +#include <linux/memblock.h>
>>>>  
>>>>  #include <asm/fixmap.h>
>>>>  #include <asm/tlbflush.h>
>>>> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
>>>>  EXPORT_SYMBOL(ioremap_cache);
>>>>  
>>>>  /*
>>>> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
>>>> + * access
>>>> + */
>>>> +void *xlate_dev_mem_ptr(phys_addr_t phys)
>>>> +{
>>>> +	unsigned long start  = phys &  PAGE_MASK;
>>>> +	unsigned long offset = phys & ~PAGE_MASK;
>>>> +	void *vaddr;
>>>> +
>>>> +	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
>>>> +	if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys))
>>>> +		return __va(phys);
>>>> +
>>>> +	vaddr = ioremap_cache(start, PAGE_SIZE);
>>>
>>> Blindly using ioremap like this looks unsafe, since we could accidentally
>>> set conflict with the attributes of a mapping used by something else (e.g.
>>> firmware running on another CPU).
>>>
>>> I'd like to understand more about the crash, so we can see work out how to
>>> fix this properly.
>>>
>> This does opens up access to any valid physical address. In the short term we 
>> can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP.
>>
>> Eventually we might need to add another memory type to make sure that it can be mapped.
>> I have not though about the exact design here.
>>
>> Thanks,
>> Sameer
>>  
>>> Will
>>>
>>
>> -- 
>>  Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
>
Ard Biesheuvel May 4, 2017, 6:40 a.m. UTC | #5
On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote:
>
>
> On 5/3/2017 2:18 PM, Leif Lindholm wrote:
>> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
>>> On 5/3/2017 5:26 AM, Will Deacon wrote:
>>>> [adding some /dev/mem fans to cc]
>>>>
>>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
>>>>> Port architecture specific xlate and unxlate functions for /dev/mem
>>>>> read/write. This sets up the mapping for a valid physical address if a
>>>>> kernel direct mapping is not already present.
>>>>>
>>>>> This is a generic issue as a user space app should not be allowed to crash
>>>>> the kernel.
>>>>
>>>>> This issue was observed when systemd tried to access performance
>>>>> pointer record from the FPDT table.
>>>>
>>>> Why is it doing that? Is there not a way to get this via /sys?
>>>
>>> There is no ACPI FPDT implementation in the kernel, so the userspace
>>> systemd code is getting the FPDT table contents from /sys
>>> and parsing the entries. The performance pointer record is a
>>> reserved address populated by UEFI and the userspace code tries to
>>> access it using /dev/mem. The physical address is valid, so cannot
>>> push back on the user space code.
>>
>> OK, so then we need to add support for parsing this table in the
>> kernel and exposing the referred-to regions in a controllable fashion.
>> Maybe something that belongs under /sys/firmware/efi (adding Matt), or
>> maybe something that deserves its own driver.
>>
>> The only two use-cases for /dev/mem on arm64 are:
>> - Implementing interfaces in the kernel takes up-front effort.
>> - Being able to accidentally panic the kernel from userland.
>>
> We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked
> memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support.
>

I reported the same issue a couple of weeks ago [0]. So while we all
agree that such accesses shouldn't oops the kernel, I think we may
disagree on whether such accesses should be allowed in the first
place, especially when using read/write on /dev/mem (as opposed to
mmap())

One the UEFI/EDK2 side, there are two fundamental issues here, which
we should resolve:
- The use of EfiRuntimeServicesData for such regions: these tables
have no significance to the firmware itself (not after
ExitBootServices()) anyway, and so the only reason for choosing this
memory type is that they are guaranteed to be left untouched by the
OS. Also, using this type rather than something like 'ACPI Reclaim'
results in the memory to be occupied regardless of whether you
understand cq. are interested in its contents, which is something we
usually try to avoid in UEFI.
- The unconditional use of the EFI_MEMORY_RUNTIME attribute for
EfiRuntimeServicesData regions, which requests the OS to create a
runtime mapping for it in the OS page tables. We should be able to
take this attribute as a cue that the firmware has no interest in its
contents (given that it never requested a mapping for it) making it
safe for the OS to map it with any attributes it likes. However, EDK2
currently does not provide this facility, i.e., the EFI_MEMORY_RUNTIME
bit is always set.

So modulo the feedback on my patch, I think that approach is
preferred, and we should really look into cleaning this up further on
the firmware side. For now, the userland fix is to use mmap() rather
than read() (which is already documented in the code as something that
should not be relied upon to remain available indefinitely).





[0] http://marc.info/?l=linux-arm-kernel&m=149198561609050
Goel, Sameer May 4, 2017, 7:58 p.m. UTC | #6
On 5/4/2017 12:40 AM, Ard Biesheuvel wrote:
> On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote:
>>
>>
>> On 5/3/2017 2:18 PM, Leif Lindholm wrote:
>>> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
>>>> On 5/3/2017 5:26 AM, Will Deacon wrote:
>>>>> [adding some /dev/mem fans to cc]
>>>>>
>>>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
>>>>>> Port architecture specific xlate and unxlate functions for /dev/mem
>>>>>> read/write. This sets up the mapping for a valid physical address if a
>>>>>> kernel direct mapping is not already present.
>>>>>>
>>>>>> This is a generic issue as a user space app should not be allowed to crash
>>>>>> the kernel.
>>>>>
>>>>>> This issue was observed when systemd tried to access performance
>>>>>> pointer record from the FPDT table.
>>>>>
>>>>> Why is it doing that? Is there not a way to get this via /sys?
>>>>
>>>> There is no ACPI FPDT implementation in the kernel, so the userspace
>>>> systemd code is getting the FPDT table contents from /sys
>>>> and parsing the entries. The performance pointer record is a
>>>> reserved address populated by UEFI and the userspace code tries to
>>>> access it using /dev/mem. The physical address is valid, so cannot
>>>> push back on the user space code.
>>>
>>> OK, so then we need to add support for parsing this table in the
>>> kernel and exposing the referred-to regions in a controllable fashion.
>>> Maybe something that belongs under /sys/firmware/efi (adding Matt), or
>>> maybe something that deserves its own driver.
>>>
>>> The only two use-cases for /dev/mem on arm64 are:
>>> - Implementing interfaces in the kernel takes up-front effort.
>>> - Being able to accidentally panic the kernel from userland.
>>>
>> We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked
>> memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support.
>>
> 
> I reported the same issue a couple of weeks ago [0]. So while we all
> agree that such accesses shouldn't oops the kernel, I think we may
> disagree on whether such accesses should be allowed in the first
> place, especially when using read/write on /dev/mem (as opposed to
> mmap())
> 
> One the UEFI/EDK2 side, there are two fundamental issues here, which
> we should resolve:
> - The use of EfiRuntimeServicesData for such regions: these tables
> have no significance to the firmware itself (not after
> ExitBootServices()) anyway, and so the only reason for choosing this
> memory type is that they are guaranteed to be left untouched by the
> OS. Also, using this type rather than something like 'ACPI Reclaim'
> results in the memory to be occupied regardless of whether you
> understand cq. are interested in its contents, which is something we
> usually try to avoid in UEFI.
> - The unconditional use of the EFI_MEMORY_RUNTIME attribute for
> EfiRuntimeServicesData regions, which requests the OS to create a
> runtime mapping for it in the OS page tables. We should be able to
> take this attribute as a cue that the firmware has no interest in its
> contents (given that it never requested a mapping for it) making it
> safe for the OS to map it with any attributes it likes. However, EDK2
> currently does not provide this facility, i.e., the EFI_MEMORY_RUNTIME
> bit is always set.
> 
> So modulo the feedback on my patch, I think that approach is
> preferred, and we should really look into cleaning this up further on
> the firmware side. For now, the userland fix is to use mmap() rather
> than read() (which is already documented in the code as something that
> should not be relied upon to remain available indefinitely).
> 
> 
> 
> 
> 
> [0] http://marc.info/?l=linux-arm-kernel&m=149198561609050
> 
 Makes sense. I will pick up the patch mentioned in [0] for fixing my current issue.

Thanks,
Sameer
Will Deacon May 5, 2017, 2:55 p.m. UTC | #7
On Thu, May 04, 2017 at 07:40:50AM +0100, Ard Biesheuvel wrote:
> On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote:
> >
> >
> > On 5/3/2017 2:18 PM, Leif Lindholm wrote:
> >> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
> >>> On 5/3/2017 5:26 AM, Will Deacon wrote:
> >>>> [adding some /dev/mem fans to cc]
> >>>>
> >>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
> >>>>> Port architecture specific xlate and unxlate functions for /dev/mem
> >>>>> read/write. This sets up the mapping for a valid physical address if a
> >>>>> kernel direct mapping is not already present.
> >>>>>
> >>>>> This is a generic issue as a user space app should not be allowed to crash
> >>>>> the kernel.
> >>>>
> >>>>> This issue was observed when systemd tried to access performance
> >>>>> pointer record from the FPDT table.
> >>>>
> >>>> Why is it doing that? Is there not a way to get this via /sys?
> >>>
> >>> There is no ACPI FPDT implementation in the kernel, so the userspace
> >>> systemd code is getting the FPDT table contents from /sys
> >>> and parsing the entries. The performance pointer record is a
> >>> reserved address populated by UEFI and the userspace code tries to
> >>> access it using /dev/mem. The physical address is valid, so cannot
> >>> push back on the user space code.
> >>
> >> OK, so then we need to add support for parsing this table in the
> >> kernel and exposing the referred-to regions in a controllable fashion.
> >> Maybe something that belongs under /sys/firmware/efi (adding Matt), or
> >> maybe something that deserves its own driver.
> >>
> >> The only two use-cases for /dev/mem on arm64 are:
> >> - Implementing interfaces in the kernel takes up-front effort.
> >> - Being able to accidentally panic the kernel from userland.
> >>
> > We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked
> > memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support.
> >
> 
> I reported the same issue a couple of weeks ago [0]. So while we all
> agree that such accesses shouldn't oops the kernel, I think we may
> disagree on whether such accesses should be allowed in the first
> place, especially when using read/write on /dev/mem (as opposed to
> mmap())

Did you plan to respin those patches to address Alex's comments? I agree
that it would be good to close the oops, regardless of the rest of the
discussion here.

Will
Ard Biesheuvel May 5, 2017, 4:25 p.m. UTC | #8
> On 5 May 2017, at 15:55, Will Deacon <will.deacon@arm.com> wrote:
> 
>> On Thu, May 04, 2017 at 07:40:50AM +0100, Ard Biesheuvel wrote:
>>> On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote:
>>> 
>>> 
>>>> On 5/3/2017 2:18 PM, Leif Lindholm wrote:
>>>>> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote:
>>>>>> On 5/3/2017 5:26 AM, Will Deacon wrote:
>>>>>> [adding some /dev/mem fans to cc]
>>>>>> 
>>>>>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote:
>>>>>>> Port architecture specific xlate and unxlate functions for /dev/mem
>>>>>>> read/write. This sets up the mapping for a valid physical address if a
>>>>>>> kernel direct mapping is not already present.
>>>>>>> 
>>>>>>> This is a generic issue as a user space app should not be allowed to crash
>>>>>>> the kernel.
>>>>>> 
>>>>>>> This issue was observed when systemd tried to access performance
>>>>>>> pointer record from the FPDT table.
>>>>>> 
>>>>>> Why is it doing that? Is there not a way to get this via /sys?
>>>>> 
>>>>> There is no ACPI FPDT implementation in the kernel, so the userspace
>>>>> systemd code is getting the FPDT table contents from /sys
>>>>> and parsing the entries. The performance pointer record is a
>>>>> reserved address populated by UEFI and the userspace code tries to
>>>>> access it using /dev/mem. The physical address is valid, so cannot
>>>>> push back on the user space code.
>>>> 
>>>> OK, so then we need to add support for parsing this table in the
>>>> kernel and exposing the referred-to regions in a controllable fashion.
>>>> Maybe something that belongs under /sys/firmware/efi (adding Matt), or
>>>> maybe something that deserves its own driver.
>>>> 
>>>> The only two use-cases for /dev/mem on arm64 are:
>>>> - Implementing interfaces in the kernel takes up-front effort.
>>>> - Being able to accidentally panic the kernel from userland.
>>>> 
>>> We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked
>>> memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support.
>>> 
>> 
>> I reported the same issue a couple of weeks ago [0]. So while we all
>> agree that such accesses shouldn't oops the kernel, I think we may
>> disagree on whether such accesses should be allowed in the first
>> place, especially when using read/write on /dev/mem (as opposed to
>> mmap())
> 
> Did you plan to respin those patches to address Alex's comments? I agree
> that it would be good to close the oops, regardless of the rest of the
> discussion here.

Agreed. I will look into this after my vacation (back on May 15th)
diff mbox

Patch

diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 0c00c87..c869ea4 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -183,6 +183,11 @@  static inline u64 __raw_readq(const volatile void __iomem *addr)
 #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
 #define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
 
+extern void *xlate_dev_mem_ptr(phys_addr_t phys);
+extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
 #include <asm-generic/io.h>
 
 /*
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index c4c8cd4..ba7e63b 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -24,6 +24,7 @@ 
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/io.h>
+#include <linux/memblock.h>
 
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
@@ -105,6 +106,36 @@  void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
 EXPORT_SYMBOL(ioremap_cache);
 
 /*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(phys_addr_t phys)
+{
+	unsigned long start  = phys &  PAGE_MASK;
+	unsigned long offset = phys & ~PAGE_MASK;
+	void *vaddr;
+
+	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+	if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys))
+		return __va(phys);
+
+	vaddr = ioremap_cache(start, PAGE_SIZE);
+	 /* Add the offset on success and return NULL if ioremap() failed */
+	if (vaddr)
+		vaddr += offset;
+
+	return vaddr;
+}
+
+void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
+{
+	if (page_is_ram(phys >> PAGE_SHIFT))
+		return;
+
+	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
  * Must be called after early_fixmap_init
  */
 void __init early_ioremap_init(void)