diff mbox

[09/10] arch: introduce memremap()

Message ID 20150720001822.30857.65670.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams July 20, 2015, 12:18 a.m. UTC
Existing users of ioremap_cache() are mapping memory that is known in
advance to not have i/o side effects.  These users are forced to cast
away the __iomem annotation, or otherwise neglect to fix the sparse
errors thrown when dereferencing pointers to this memory.  Provide
memremap() as a non __iomem annotated ioremap_*() in the case when
ioremap is otherwise a pointer to memory.

The ARCH_HAS_MEMREMAP kconfig symbol is introduced for archs to assert
that it is safe to recast / reuse the return value from ioremap as a
normal pointer to memory.  In other words, archs that mandate specific
accessors for __iomem are not memremap() capable and drivers that care,
like pmem, can add a dependency to disable themselves on these archs.

Note, that memremap is a break from the ioremap implementation pattern
of adding a new memremap_<type>() for each mapping type.  Instead,
the implementation defines flags that are passed to the central
memremap() implementation.

Outside of ioremap() and ioremap_nocache(), the expectation is that most
calls to ioremap_<type>() are seeking memory-like semantics (e.g.
speculative reads, and prefetching permitted).  These callsites can be
moved to memremap() over time.

Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/arm/Kconfig                     |    1 +
 arch/arm64/Kconfig                   |    1 +
 arch/arm64/kernel/efi.c              |    7 ++---
 arch/arm64/kernel/smp_spin_table.c   |   17 +++++------
 arch/frv/Kconfig                     |    1 +
 arch/m68k/Kconfig                    |    1 +
 arch/metag/Kconfig                   |    1 +
 arch/mips/Kconfig                    |    1 +
 arch/powerpc/Kconfig                 |    1 +
 arch/x86/Kconfig                     |    1 +
 arch/x86/kernel/crash_dump_64.c      |    6 ++--
 arch/x86/kernel/kdebugfs.c           |    8 +++--
 arch/x86/kernel/ksysfs.c             |   28 +++++++++----------
 arch/x86/mm/ioremap.c                |   10 +++----
 arch/xtensa/Kconfig                  |    1 +
 drivers/acpi/apei/einj.c             |    9 +++---
 drivers/acpi/apei/erst.c             |    6 ++--
 drivers/firmware/google/memconsole.c |    5 ++-
 include/linux/device.h               |    5 +++
 include/linux/io.h                   |   11 +++++++
 kernel/resource.c                    |   51 +++++++++++++++++++++++++++++++++-
 lib/Kconfig                          |    5 +++
 22 files changed, 126 insertions(+), 51 deletions(-)

Comments

Mark Rutland July 20, 2015, noon UTC | #1
Hi,

On Mon, Jul 20, 2015 at 01:18:23AM +0100, Dan Williams wrote:
> Existing users of ioremap_cache() are mapping memory that is known in
> advance to not have i/o side effects.  These users are forced to cast
> away the __iomem annotation, or otherwise neglect to fix the sparse
> errors thrown when dereferencing pointers to this memory.  Provide
> memremap() as a non __iomem annotated ioremap_*() in the case when
> ioremap is otherwise a pointer to memory.
> 
> The ARCH_HAS_MEMREMAP kconfig symbol is introduced for archs to assert
> that it is safe to recast / reuse the return value from ioremap as a
> normal pointer to memory.  In other words, archs that mandate specific
> accessors for __iomem are not memremap() capable and drivers that care,
> like pmem, can add a dependency to disable themselves on these archs.
> 
> Note, that memremap is a break from the ioremap implementation pattern
> of adding a new memremap_<type>() for each mapping type.  Instead,
> the implementation defines flags that are passed to the central
> memremap() implementation.
> 
> Outside of ioremap() and ioremap_nocache(), the expectation is that most
> calls to ioremap_<type>() are seeking memory-like semantics (e.g.
> speculative reads, and prefetching permitted).  These callsites can be
> moved to memremap() over time.
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  arch/arm/Kconfig                     |    1 +
>  arch/arm64/Kconfig                   |    1 +
>  arch/arm64/kernel/efi.c              |    7 ++---
>  arch/arm64/kernel/smp_spin_table.c   |   17 +++++------

These last two files weren't updated in patch 2 to use <linux/io.h>, nor are
they updated here, so this patch breaks the build for arm64.

[...]

> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 318175f62c24..305def28385f 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -6,6 +6,7 @@ config ARM64
>         select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
>         select ARCH_HAS_ELF_RANDOMIZE
>         select ARCH_HAS_GCOV_PROFILE_ALL
> +       select ARCH_HAS_MEMREMAP
>         select ARCH_HAS_SG_CHAIN
>         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>         select ARCH_USE_CMPXCHG_LOCKREF
> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
> index 9d4aa18f2a82..ed363a0202b9 100644
> --- a/arch/arm64/kernel/efi.c
> +++ b/arch/arm64/kernel/efi.c
> @@ -290,8 +290,7 @@ static int __init arm64_enable_runtime_services(void)
>         pr_info("Remapping and enabling EFI services.\n");
> 
>         mapsize = memmap.map_end - memmap.map;
> -       memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map,
> -                                                  mapsize);
> +       memmap.map = memremap(memmap.phys_map, mapsize, MEMREMAP_CACHE);

memmap.phys_map is a void*, so we need to retain a cast to a numeric type or
we'll get splats like:

arch/arm64/kernel/efi.c: In function ‘arm64_enable_runtime_services’:
arch/arm64/kernel/efi.c:294:24: warning: passing argument 1 of ‘memremap’ makes integer from pointer without a cast
  memmap.map = memremap(memmap.phys_map, mapsize, MEMREMAP_CACHE);
                        ^
In file included from arch/arm64/kernel/efi.c:18:0:
include/linux/io.h:203:14: note: expected ‘resource_size_t’ but argument is of type ‘void *’
 extern void *memremap(resource_size_t offset, size_t size, unsigned long flags);
              ^
Unfortunately, even with that fixed this patch breaks boot due to the
memremap_valid check. It's extremely likely that (portions of) the
tables are already in the linear mapping, and so page_is_ram will be
true.

At boot time I get the following:

Remapping and enabling EFI services.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1 at kernel/resource.c:541 memremap+0xb0/0xbc()
memremap attempted on ram 0x00000009f9e4a018 size: 1200
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc3+ #201
Hardware name: ARM Juno development board (r0) (DT)
Call trace:
[<ffffffc000089954>] dump_backtrace+0x0/0x124
[<ffffffc000089a88>] show_stack+0x10/0x1c
[<ffffffc0005d15c8>] dump_stack+0x84/0xc8
[<ffffffc0000b3f44>] warn_slowpath_common+0x98/0xd0
[<ffffffc0000b3fc8>] warn_slowpath_fmt+0x4c/0x58
[<ffffffc0000b9470>] memremap+0xac/0xbc
[<ffffffc000814fe8>] arm64_enable_runtime_services+0x90/0x208
[<ffffffc000082864>] do_one_initcall+0x88/0x1a0
[<ffffffc000811a20>] kernel_init_freeable+0x8c/0x1f8
[<ffffffc0005ce370>] kernel_init+0xc/0xd8
---[ end trace cb88537fdc8fa200 ]---
Failed to remap EFI memory map

>         if (!memmap.map) {
>                 pr_err("Failed to remap EFI memory map\n");
>                 return -1;
> @@ -299,8 +298,8 @@ static int __init arm64_enable_runtime_services(void)
>         memmap.map_end = memmap.map + mapsize;
>         efi.memmap = &memmap;
> 
> -       efi.systab = (__force void *)ioremap_cache(efi_system_table,
> -                                                  sizeof(efi_system_table_t));
> +       efi.systab = memremap(efi_system_table, sizeof(efi_system_table_t),
> +                       MEMREMAP_CACHE);
>         if (!efi.systab) {
>                 pr_err("Failed to remap EFI System Table\n");
>                 return -1;
> diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
> index aef3605a8c47..b9caf6cd44e6 100644
> --- a/arch/arm64/kernel/smp_spin_table.c
> +++ b/arch/arm64/kernel/smp_spin_table.c
> @@ -73,19 +73,19 @@ static int smp_spin_table_cpu_init(unsigned int cpu)
> 
>  static int smp_spin_table_cpu_prepare(unsigned int cpu)
>  {
> -       __le64 __iomem *release_addr;
> +       __le64 *release_addr;
> 
>         if (!cpu_release_addr[cpu])
>                 return -ENODEV;
> 
>         /*
>          * The cpu-release-addr may or may not be inside the linear mapping.
> -        * As ioremap_cache will either give us a new mapping or reuse the
> -        * existing linear mapping, we can use it to cover both cases. In
> -        * either case the memory will be MT_NORMAL.
> +        * As memremap will either give us a new mapping or reuse the existing
> +        * linear mapping, we can use it to cover both cases. In either case
> +        * the memory will be MT_NORMAL.
>          */
> -       release_addr = ioremap_cache(cpu_release_addr[cpu],
> -                                    sizeof(*release_addr));
> +       release_addr = memremap(cpu_release_addr[cpu], sizeof(*release_addr),
> +                       MEMREMAP_CACHE);

Likewise, the comment here is contradicted by the code in memremap_valid
(below). We'll fail to bring up secondaries if the release address fell
in the linear mapping.

> +#ifdef CONFIG_ARCH_HAS_MEMREMAP
> +/*
> + * memremap() is "ioremap" for cases where it is known that the resource
> + * being mapped does not have i/o side effects and the __iomem
> + * annotation is not applicable.
> + */
> +static bool memremap_valid(resource_size_t offset, size_t size)
> +{
> +       if (region_is_ram(offset, size) != 0) {
> +               WARN_ONCE(1, "memremap attempted on ram %pa size: %zu\n",
> +                               &offset, size);
> +               return false;
> +       }
> +       return true;
> +}
> +
> +void *memremap(resource_size_t offset, size_t size, unsigned long flags)
> +{
> +       void __iomem *addr = NULL;
> +
> +       if (!memremap_valid(offset, size))
> +               return NULL;
> +
> +       /* try all mapping types requested until one returns non-NULL */
> +       if (flags & MEMREMAP_CACHE) {
> +               if (flags & MEMREMAP_STRICT)
> +                       addr = strict_ioremap_cache(offset, size);
> +               else
> +                       addr = ioremap_cache(offset, size);
> +       }
> +
> +       if (!addr && (flags & MEMREMAP_WT)) {
> +               if (flags & MEMREMAP_STRICT)
> +                       addr = strict_ioremap_wt(offset, size);
> +               else
> +                       addr = ioremap_wt(offset, size);
> +       }
> +
> +       return (void __force *) addr;
> +}
> +EXPORT_SYMBOL(memremap);

Thanks,
Mark.
Dan Williams July 20, 2015, 2:39 p.m. UTC | #2
On Mon, Jul 20, 2015 at 5:00 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> Hi,
>
> On Mon, Jul 20, 2015 at 01:18:23AM +0100, Dan Williams wrote:
>> Existing users of ioremap_cache() are mapping memory that is known in
>> advance to not have i/o side effects.  These users are forced to cast
>> away the __iomem annotation, or otherwise neglect to fix the sparse
>> errors thrown when dereferencing pointers to this memory.  Provide
>> memremap() as a non __iomem annotated ioremap_*() in the case when
>> ioremap is otherwise a pointer to memory.
>>
>> The ARCH_HAS_MEMREMAP kconfig symbol is introduced for archs to assert
>> that it is safe to recast / reuse the return value from ioremap as a
>> normal pointer to memory.  In other words, archs that mandate specific
>> accessors for __iomem are not memremap() capable and drivers that care,
>> like pmem, can add a dependency to disable themselves on these archs.
>>
>> Note, that memremap is a break from the ioremap implementation pattern
>> of adding a new memremap_<type>() for each mapping type.  Instead,
>> the implementation defines flags that are passed to the central
>> memremap() implementation.
>>
>> Outside of ioremap() and ioremap_nocache(), the expectation is that most
>> calls to ioremap_<type>() are seeking memory-like semantics (e.g.
>> speculative reads, and prefetching permitted).  These callsites can be
>> moved to memremap() over time.
>>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  arch/arm/Kconfig                     |    1 +
>>  arch/arm64/Kconfig                   |    1 +
>>  arch/arm64/kernel/efi.c              |    7 ++---
>>  arch/arm64/kernel/smp_spin_table.c   |   17 +++++------
>
> These last two files weren't updated in patch 2 to use <linux/io.h>, nor are
> they updated here, so this patch breaks the build for arm64.

Thanks for testing!  I'll follow up and see why this failure wasn't
reported by 0day-kbuild.

> [...]
>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 318175f62c24..305def28385f 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -6,6 +6,7 @@ config ARM64
>>         select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
>>         select ARCH_HAS_ELF_RANDOMIZE
>>         select ARCH_HAS_GCOV_PROFILE_ALL
>> +       select ARCH_HAS_MEMREMAP
>>         select ARCH_HAS_SG_CHAIN
>>         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>>         select ARCH_USE_CMPXCHG_LOCKREF
>> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
>> index 9d4aa18f2a82..ed363a0202b9 100644
>> --- a/arch/arm64/kernel/efi.c
>> +++ b/arch/arm64/kernel/efi.c
>> @@ -290,8 +290,7 @@ static int __init arm64_enable_runtime_services(void)
>>         pr_info("Remapping and enabling EFI services.\n");
>>
>>         mapsize = memmap.map_end - memmap.map;
>> -       memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map,
>> -                                                  mapsize);
>> +       memmap.map = memremap(memmap.phys_map, mapsize, MEMREMAP_CACHE);
>
> memmap.phys_map is a void*, so we need to retain a cast to a numeric type or
> we'll get splats like:
>
> arch/arm64/kernel/efi.c: In function ‘arm64_enable_runtime_services’:
> arch/arm64/kernel/efi.c:294:24: warning: passing argument 1 of ‘memremap’ makes integer from pointer without a cast
>   memmap.map = memremap(memmap.phys_map, mapsize, MEMREMAP_CACHE);
>                         ^
> In file included from arch/arm64/kernel/efi.c:18:0:
> include/linux/io.h:203:14: note: expected ‘resource_size_t’ but argument is of type ‘void *’
>  extern void *memremap(resource_size_t offset, size_t size, unsigned long flags);
>               ^

Again, I need to check my compile scripts.

> Unfortunately, even with that fixed this patch breaks boot due to the
> memremap_valid check. It's extremely likely that (portions of) the
> tables are already in the linear mapping, and so page_is_ram will be
> true.

Really, "portions of", not the entire table?  Is it too early to use
the direct mapping like pfn_to_page(memmap.phys_map >> PAGE_SHIFT)?

In any event this splat mandates splitting this memremap() enabling
patch into per-instance conversions of ioremap_cache().

> At boot time I get the following:
>
> Remapping and enabling EFI services.
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 1 at kernel/resource.c:541 memremap+0xb0/0xbc()
> memremap attempted on ram 0x00000009f9e4a018 size: 1200
> Modules linked in:
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc3+ #201
> Hardware name: ARM Juno development board (r0) (DT)
> Call trace:
> [<ffffffc000089954>] dump_backtrace+0x0/0x124
> [<ffffffc000089a88>] show_stack+0x10/0x1c
> [<ffffffc0005d15c8>] dump_stack+0x84/0xc8
> [<ffffffc0000b3f44>] warn_slowpath_common+0x98/0xd0
> [<ffffffc0000b3fc8>] warn_slowpath_fmt+0x4c/0x58
> [<ffffffc0000b9470>] memremap+0xac/0xbc
> [<ffffffc000814fe8>] arm64_enable_runtime_services+0x90/0x208
> [<ffffffc000082864>] do_one_initcall+0x88/0x1a0
> [<ffffffc000811a20>] kernel_init_freeable+0x8c/0x1f8
> [<ffffffc0005ce370>] kernel_init+0xc/0xd8
> ---[ end trace cb88537fdc8fa200 ]---
> Failed to remap EFI memory map
>
>>         if (!memmap.map) {
>>                 pr_err("Failed to remap EFI memory map\n");
>>                 return -1;
>> @@ -299,8 +298,8 @@ static int __init arm64_enable_runtime_services(void)
>>         memmap.map_end = memmap.map + mapsize;
>>         efi.memmap = &memmap;
>>
>> -       efi.systab = (__force void *)ioremap_cache(efi_system_table,
>> -                                                  sizeof(efi_system_table_t));
>> +       efi.systab = memremap(efi_system_table, sizeof(efi_system_table_t),
>> +                       MEMREMAP_CACHE);
>>         if (!efi.systab) {
>>                 pr_err("Failed to remap EFI System Table\n");
>>                 return -1;
>> diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
>> index aef3605a8c47..b9caf6cd44e6 100644
>> --- a/arch/arm64/kernel/smp_spin_table.c
>> +++ b/arch/arm64/kernel/smp_spin_table.c
>> @@ -73,19 +73,19 @@ static int smp_spin_table_cpu_init(unsigned int cpu)
>>
>>  static int smp_spin_table_cpu_prepare(unsigned int cpu)
>>  {
>> -       __le64 __iomem *release_addr;
>> +       __le64 *release_addr;
>>
>>         if (!cpu_release_addr[cpu])
>>                 return -ENODEV;
>>
>>         /*
>>          * The cpu-release-addr may or may not be inside the linear mapping.
>> -        * As ioremap_cache will either give us a new mapping or reuse the
>> -        * existing linear mapping, we can use it to cover both cases. In
>> -        * either case the memory will be MT_NORMAL.
>> +        * As memremap will either give us a new mapping or reuse the existing
>> +        * linear mapping, we can use it to cover both cases. In either case
>> +        * the memory will be MT_NORMAL.
>>          */
>> -       release_addr = ioremap_cache(cpu_release_addr[cpu],
>> -                                    sizeof(*release_addr));
>> +       release_addr = memremap(cpu_release_addr[cpu], sizeof(*release_addr),
>> +                       MEMREMAP_CACHE);
>
> Likewise, the comment here is contradicted by the code in memremap_valid
> (below). We'll fail to bring up secondaries if the release address fell
> in the linear mapping.

Ah, so ARM's ioremap_cache() silently handles calls to remap RAM with
a pointer to the direct mapping.  I think that's a useful property to
carry in the generic implementation.  Especially if the direct mapping
has established greater than PAGE_SIZE mappings.

>
>> +#ifdef CONFIG_ARCH_HAS_MEMREMAP
>> +/*
>> + * memremap() is "ioremap" for cases where it is known that the resource
>> + * being mapped does not have i/o side effects and the __iomem
>> + * annotation is not applicable.
>> + */
>> +static bool memremap_valid(resource_size_t offset, size_t size)
>> +{
>> +       if (region_is_ram(offset, size) != 0) {
>> +               WARN_ONCE(1, "memremap attempted on ram %pa size: %zu\n",
>> +                               &offset, size);
>> +               return false;
>> +       }
>> +       return true;
>> +}
>> +
>> +void *memremap(resource_size_t offset, size_t size, unsigned long flags)
>> +{
>> +       void __iomem *addr = NULL;
>> +
>> +       if (!memremap_valid(offset, size))
>> +               return NULL;
>> +
>> +       /* try all mapping types requested until one returns non-NULL */
>> +       if (flags & MEMREMAP_CACHE) {
>> +               if (flags & MEMREMAP_STRICT)
>> +                       addr = strict_ioremap_cache(offset, size);
>> +               else
>> +                       addr = ioremap_cache(offset, size);
>> +       }
>> +
>> +       if (!addr && (flags & MEMREMAP_WT)) {
>> +               if (flags & MEMREMAP_STRICT)
>> +                       addr = strict_ioremap_wt(offset, size);
>> +               else
>> +                       addr = ioremap_wt(offset, size);
>> +       }
>> +
>> +       return (void __force *) addr;
>> +}
>> +EXPORT_SYMBOL(memremap);
>
> Thanks,
> Mark.

Thanks Mark!
Mark Rutland July 20, 2015, 3:56 p.m. UTC | #3
On Mon, Jul 20, 2015 at 03:39:44PM +0100, Dan Williams wrote:
> On Mon, Jul 20, 2015 at 5:00 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> > Hi,
> >
> > On Mon, Jul 20, 2015 at 01:18:23AM +0100, Dan Williams wrote:
> >> Existing users of ioremap_cache() are mapping memory that is known in
> >> advance to not have i/o side effects.  These users are forced to cast
> >> away the __iomem annotation, or otherwise neglect to fix the sparse
> >> errors thrown when dereferencing pointers to this memory.  Provide
> >> memremap() as a non __iomem annotated ioremap_*() in the case when
> >> ioremap is otherwise a pointer to memory.
> >>
> >> The ARCH_HAS_MEMREMAP kconfig symbol is introduced for archs to assert
> >> that it is safe to recast / reuse the return value from ioremap as a
> >> normal pointer to memory.  In other words, archs that mandate specific
> >> accessors for __iomem are not memremap() capable and drivers that care,
> >> like pmem, can add a dependency to disable themselves on these archs.
> >>
> >> Note, that memremap is a break from the ioremap implementation pattern
> >> of adding a new memremap_<type>() for each mapping type.  Instead,
> >> the implementation defines flags that are passed to the central
> >> memremap() implementation.
> >>
> >> Outside of ioremap() and ioremap_nocache(), the expectation is that most
> >> calls to ioremap_<type>() are seeking memory-like semantics (e.g.
> >> speculative reads, and prefetching permitted).  These callsites can be
> >> moved to memremap() over time.
> >>
> >> Cc: Arnd Bergmann <arnd@arndb.de>
> >> Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

[...]

> > Unfortunately, even with that fixed this patch breaks boot due to the
> > memremap_valid check. It's extremely likely that (portions of) the
> > tables are already in the linear mapping, and so page_is_ram will be
> > true.
> 
> Really, "portions of", not the entire table?

Yes, if a table straddles a page boundary (the spec doesn't seem to
disallow this). We might not map all of the required pages, e.g.

* If the kernel was loaded more than TEXT_OFFSET bytes above the table
  (the arm64 linear/direct mapping is currently tied to our text
  mapping, and we can't currently address anything below that).

  In this case the start isn't in the direct mapping, and we'll try to
  create a new mapping. That should work.

* If the kernel is built with a small VA range, and we have just the
  right amount of memory before the table begins (this is absurdly
  unlikely).

  In this case if the start is in the direct mapping, but the end isn't,
  we'll explode when accessing it.

* If the 'mem=' parameter is used, the direct mapping can be truncated.

  In this case if the start is in the direct mapping, but the end isn't,
  we'll explode when accessing it.

In the arm64 ioremap_cache we only check
pfn_valid(__phys_to_pfn(phys_addr)), and assume that if the start was
mapped, then the whole region is. In the cases above that's clearly not
true, so we should probably see about handling that.

> Is it too early to use the direct mapping like
> pfn_to_page(memmap.phys_map >> PAGE_SHIFT)?

I believe it's too early, as we haven't initialised the memmap yet. For
the cases above we will never have a page for the portion of the region
not in the direct mapping anyway.

> In any event this splat mandates splitting this memremap() enabling
> patch into per-instance conversions of ioremap_cache().
> 
> > At boot time I get the following:
> >
> > Remapping and enabling EFI services.
> > ------------[ cut here ]------------
> > WARNING: CPU: 0 PID: 1 at kernel/resource.c:541 memremap+0xb0/0xbc()
> > memremap attempted on ram 0x00000009f9e4a018 size: 1200
> > Modules linked in:
> > CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc3+ #201
> > Hardware name: ARM Juno development board (r0) (DT)
> > Call trace:
> > [<ffffffc000089954>] dump_backtrace+0x0/0x124
> > [<ffffffc000089a88>] show_stack+0x10/0x1c
> > [<ffffffc0005d15c8>] dump_stack+0x84/0xc8
> > [<ffffffc0000b3f44>] warn_slowpath_common+0x98/0xd0
> > [<ffffffc0000b3fc8>] warn_slowpath_fmt+0x4c/0x58
> > [<ffffffc0000b9470>] memremap+0xac/0xbc
> > [<ffffffc000814fe8>] arm64_enable_runtime_services+0x90/0x208
> > [<ffffffc000082864>] do_one_initcall+0x88/0x1a0
> > [<ffffffc000811a20>] kernel_init_freeable+0x8c/0x1f8
> > [<ffffffc0005ce370>] kernel_init+0xc/0xd8
> > ---[ end trace cb88537fdc8fa200 ]---
> > Failed to remap EFI memory map
> >
> >>         if (!memmap.map) {
> >>                 pr_err("Failed to remap EFI memory map\n");
> >>                 return -1;
> >> @@ -299,8 +298,8 @@ static int __init arm64_enable_runtime_services(void)
> >>         memmap.map_end = memmap.map + mapsize;
> >>         efi.memmap = &memmap;
> >>
> >> -       efi.systab = (__force void *)ioremap_cache(efi_system_table,
> >> -                                                  sizeof(efi_system_table_t));
> >> +       efi.systab = memremap(efi_system_table, sizeof(efi_system_table_t),
> >> +                       MEMREMAP_CACHE);
> >>         if (!efi.systab) {
> >>                 pr_err("Failed to remap EFI System Table\n");
> >>                 return -1;
> >> diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
> >> index aef3605a8c47..b9caf6cd44e6 100644
> >> --- a/arch/arm64/kernel/smp_spin_table.c
> >> +++ b/arch/arm64/kernel/smp_spin_table.c
> >> @@ -73,19 +73,19 @@ static int smp_spin_table_cpu_init(unsigned int cpu)
> >>
> >>  static int smp_spin_table_cpu_prepare(unsigned int cpu)
> >>  {
> >> -       __le64 __iomem *release_addr;
> >> +       __le64 *release_addr;
> >>
> >>         if (!cpu_release_addr[cpu])
> >>                 return -ENODEV;
> >>
> >>         /*
> >>          * The cpu-release-addr may or may not be inside the linear mapping.
> >> -        * As ioremap_cache will either give us a new mapping or reuse the
> >> -        * existing linear mapping, we can use it to cover both cases. In
> >> -        * either case the memory will be MT_NORMAL.
> >> +        * As memremap will either give us a new mapping or reuse the existing
> >> +        * linear mapping, we can use it to cover both cases. In either case
> >> +        * the memory will be MT_NORMAL.
> >>          */
> >> -       release_addr = ioremap_cache(cpu_release_addr[cpu],
> >> -                                    sizeof(*release_addr));
> >> +       release_addr = memremap(cpu_release_addr[cpu], sizeof(*release_addr),
> >> +                       MEMREMAP_CACHE);
> >
> > Likewise, the comment here is contradicted by the code in memremap_valid
> > (below). We'll fail to bring up secondaries if the release address fell
> > in the linear mapping.
> 
> Ah, so ARM's ioremap_cache() silently handles calls to remap RAM with
> a pointer to the direct mapping.  I think that's a useful property to
> carry in the generic implementation.  Especially if the direct mapping
> has established greater than PAGE_SIZE mappings.

That sounds good to me.

As mentioned above, I think we should test that the entire region we're
trying to map falls within the direct mapping before assuming we can use
it, though I'm not sure what the best way of doing that is.

Thanks,
Mark.
Luis Chamberlain July 21, 2015, 11:58 p.m. UTC | #4
On Sun, Jul 19, 2015 at 08:18:23PM -0400, Dan Williams wrote:
> diff --git a/include/linux/io.h b/include/linux/io.h
> index 080a4fbf2ba4..2983b6e63970 100644
> --- a/include/linux/io.h
> +++ b/include/linux/io.h
> @@ -192,4 +192,15 @@ static inline int arch_phys_wc_index(int handle)
>  #endif
>  #endif
>  
> +enum {
> +	MEMREMAP_WB = 1 << 0,
> +	MEMREMAP_WT = 1 << 1,
> +	MEMREMAP_WC = 1 << 2,
> +	MEMREMAP_STRICT = 1 << 3,
> +	MEMREMAP_CACHE = MEMREMAP_WB,
> +};

A few things:

1) You'll need MEMREMAP_UC as well now.

2) as you are doing all this sweep over architectures on this please
also address the lack of ioremap_*() variant implementations to return
NULL, ie not supported, because we've decided for now that so long
as the semantics are not well defined we can't expect architectures
to get it right unless they are doing the work themselves, and the
old strategy of just #define'ing a variant to ioremap_nocache(), which
folks tended to just use, can lead to issues. In your case since you are
jumping to the flags implementation this might be knocking two birds
with one stone.

3) For the case that architectures have no MMU we currently do a direct
mapping such as what you try to describe to do with memremap(). I wonder
if its worth it to then enable that code to just map to memremap(). That
throws off your usage of CONFIG_ARCH_HAS_MEMREMAP if we want to repurpose
that implementation for the no-MMU case, unless of course you just have a
__memremap() which does the default basic direct mapping implementation.

4) Since we are all blaming semantics on our woes I'd like to ask for
some effort on semantics to be well defined. Semantics here should cover
some form of Documentation but also sparse annotation checks and perhaps
Coccinelle grammar rules for how things should be done. This should not only
cover general use but also if there are calls which depend on a cache
type to have been set. If we used sparse annotations it  may meen a
different return type for each cache type.  Not sure if we want this.
If we went with grammar rules I'm looking to for instance have in place
rules on scripts/coccinelle which would allow developers to use

make coccicheck M=foo/

to find issues. I can perhaps help with this, but we'd need to do a good
sweep here to not only cover good territory but also get folks to agree
on things.

5) This may be related to 4), not sure. There are alignment requirements we
should probably iron out for architectures. How will we annotate these
requirements or allow architectures to be pedantic over these requirements?

  Luis
Dan Williams July 22, 2015, 4:04 a.m. UTC | #5
On Tue, Jul 21, 2015 at 4:58 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> On Sun, Jul 19, 2015 at 08:18:23PM -0400, Dan Williams wrote:
>> diff --git a/include/linux/io.h b/include/linux/io.h
>> index 080a4fbf2ba4..2983b6e63970 100644
>> --- a/include/linux/io.h
>> +++ b/include/linux/io.h
>> @@ -192,4 +192,15 @@ static inline int arch_phys_wc_index(int handle)
>>  #endif
>>  #endif
>>
>> +enum {
>> +     MEMREMAP_WB = 1 << 0,
>> +     MEMREMAP_WT = 1 << 1,
>> +     MEMREMAP_WC = 1 << 2,
>> +     MEMREMAP_STRICT = 1 << 3,
>> +     MEMREMAP_CACHE = MEMREMAP_WB,
>> +};
>
> A few things:
>
> 1) You'll need MEMREMAP_UC now as well now.

Why?  I don't think it fits.  If there are any I/O areas (non-memory)
in the range then it simply is not "memory" and should not be using
memremap().  In other words it seems like you really do need to heed
the __iomem annotation in the return value from ioremap_uc().

> 2) as you are doing all this sweep over architectures on this please
> also address the lack of ioremap_*() variant implemention to return
> NULL, ie not supported, because we've decided for now that so long
> as the semantics are not well defined we can't expect architectures
> to get it right unless they are doing the work themselves, and the
> old strategy of just #defin'ing a variant to iorempa_nocache() which
> folks tended to just can lead to issues. In your case since you are
> jumping to the flags implementation this might be knocking two birds
> with one stone.

I'm not going to do a general sweep for this as the expectation that
ioremap silently falls back if a mapping type is not supported is
buried all over the place.  That said, new usages and conversions to
memremap() can be strict about this. For now, I'm only handling
ioremap_cache() and ioremap_wt() conversions.

> 3) For the case that architectures have no MMU we currently do a direct
> mapping such as what you try to describe to do with memremap(). I wonder
> if its worth it to then enable that code to just map to memremap(). That
> throws off your usage of CONFIG_ARCH_HAS_MEMREMAP if we want to repurpose
> that implementation for no the MMU case, unless of course you just have a
> __memremap() which does the default basic direct mapping implementation.

Yes, in the next rev of this series I am having it fall back to direct
mappings where it makes sense.

> 4) Since we are all blaming semantics on our woes I'd like to ask for
> some effort on semantics to be well defined. Semantics here sholud cover
> some form of Documentation but also sparse annotation checks and perhaps
> Coccinelle grammar rules for how things should be done. This should not only
> cover general use but also if there are calls which depend on a cache
> type to have been set. If we used sparse annotations it  may meen a
> different return type for each cache type.  Not sure if we want this.
> If we went with grammar rules I'm looking to for instance have in place
> rules on scripts/coccinelle which would allow developers to use

memremap() explicitly does not want to get into arch semantics debates.
The pointer returned from memremap() is a "void *" that can be used as
normal memory.  If it is a normal pointer I don't see the role for
sparse annotation.

>
> make coccicheck M=foo/
>
> to find issues. I can perhaps help with this, but we'd need to do a good
> sweep here to not only cover good territory but also get folks to agree
> on things.
>
> 5) This may be related to 4), not sure. There are aligment requirements we
> should probably iron out for architectures. How will we annotate these
> requirements or allow architectures to be pedantic over these requirements?

What requirements beyond PAGE_SIZE alignment would we need to worry about?
Luis Chamberlain July 22, 2015, 10:55 p.m. UTC | #6
On Tue, Jul 21, 2015 at 09:04:22PM -0700, Dan Williams wrote:
> On Tue, Jul 21, 2015 at 4:58 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> > On Sun, Jul 19, 2015 at 08:18:23PM -0400, Dan Williams wrote:
> >> diff --git a/include/linux/io.h b/include/linux/io.h
> >> index 080a4fbf2ba4..2983b6e63970 100644
> >> --- a/include/linux/io.h
> >> +++ b/include/linux/io.h
> >> @@ -192,4 +192,15 @@ static inline int arch_phys_wc_index(int handle)
> >>  #endif
> >>  #endif
> >>
> >> +enum {
> >> +     MEMREMAP_WB = 1 << 0,
> >> +     MEMREMAP_WT = 1 << 1,
> >> +     MEMREMAP_WC = 1 << 2,
> >> +     MEMREMAP_STRICT = 1 << 3,
> >> +     MEMREMAP_CACHE = MEMREMAP_WB,
> >> +};
> >
> > A few things:
> >
> > 1) You'll need MEMREMAP_UC now as well now.
> 
> Why?  I don't think it fits.  If there are any I/O areas (non-memory)
> in the range then it simply is not "memory" and should not be using
> memremap().  In other words it seems like you really do need to heed
> the __iomem annotation in the return value from ioremap_uc().

One can use a similar argument for some areas of use of write-combining.
What litmus test are you using to add a flag above? Why would WC be OK
and not UC? WC is a cache hack on x86...

> > 2) as you are doing all this sweep over architectures on this please
> > also address the lack of ioremap_*() variant implemention to return
> > NULL, ie not supported, because we've decided for now that so long
> > as the semantics are not well defined we can't expect architectures
> > to get it right unless they are doing the work themselves, and the
> > old strategy of just #defin'ing a variant to iorempa_nocache() which
> > folks tended to just can lead to issues. In your case since you are
> > jumping to the flags implementation this might be knocking two birds
> > with one stone.
> 
> I'm not going to do a general sweep for this as the expectation that
> ioremap silently falls back if a mapping type is not supported is
> buried all over the place.

But it does not. It cannot. The reason, as I noted in a commit now merged
on tip, is that the default wrappers are nested under #ifndef CONFIG_MMU,
whereas really this should have just been used for ioremap() and iounmap().

That is, the ioremap_*() variants should have a definition even for !CONFIG_MMU,
and since we actually don't want to enable sloppy defines of this the sensible
defaults should be to return NULL on variants -- for both !CONFIG_MMU
and for CONFIG_MMU. The way to fix them then is to move the default variant
definitions out from #ifndef CONFIG_MMU and have them return NULL. We may
be able to have them return something not-null by default always but we
first need to beat to death the topic of semantics with all architecture
folks to reach that agreement. Until then the variants should just return
NULL encouraging arch developers to supply a proper implementation.

> That said, new usages and conversions to
> memremap() can be strict about this. For now, I'm only handling
> ioremap_cache() and ioremap_wt() conversions.

OK, if you are not going to do this let me know and I can do it.

> > 3) For the case that architectures have no MMU we currently do a direct
> > mapping such as what you try to describe to do with memremap(). I wonder
> > if its worth it to then enable that code to just map to memremap(). That
> > throws off your usage of CONFIG_ARCH_HAS_MEMREMAP if we want to repurpose
> > that implementation for no the MMU case, unless of course you just have a
> > __memremap() which does the default basic direct mapping implementation.
> 
> Yes, in the next rev of this series I am having it fall back to direct
> mappings where it makes sense.
> 
> > 4) Since we are all blaming semantics on our woes I'd like to ask for
> > some effort on semantics to be well defined. Semantics here sholud cover
> > some form of Documentation but also sparse annotation checks and perhaps
> > Coccinelle grammar rules for how things should be done. This should not only
> > cover general use but also if there are calls which depend on a cache
> > type to have been set. If we used sparse annotations it  may meen a
> > different return type for each cache type.  Not sure if we want this.
> > If we went with grammar rules I'm looking to for instance have in place
> > rules on scripts/coccinelle which would allow developers to use
> 
> memremap() explicitly does not want get into arch semantics debates.

Why? The ioremap() cluster fuck seems like a good example to learn from.

I was also under the impression you are going to provide a new API with flags
to kill / make old ioremap() variants use this new API with the flags passed?
Is that not the case ?

> The pointer returned from memremap() is a "void *" that can be used as
> normal memory.  If it is a normal pointer I don't see the role for
> sparse annotation.

Hrm, do we want to *prevent* certain uses of the memory range when it is direct?

> > make coccicheck M=foo/
> >
> > to find issues. I can perhaps help with this, but we'd need to do a good
> > sweep here to not only cover good territory but also get folks to agree
> > on things.
> >
> > 5) This may be related to 4), not sure. There are aligment requirements we
> > should probably iron out for architectures. How will we annotate these
> > requirements or allow architectures to be pedantic over these requirements?
> 
> What requirements beyond PAGE_SIZE alignment would we need to worry about?

That's a big concern some folks expressed, architecture folks would know more.

One last thing: if you are providing a new API to replace a series of old
symbols that were used before (I thought you were working towards this for all
ioremap_*() variants) we have to take care to ensure that any symbol that is
currently exported with EXPORT_SYMBOL_GPL() does not get an EXPORT_SYMBOL()
wrapper-escape since we do not want proprietary drivers to make use of these
alternatives.

 Luis
Dan Williams July 22, 2015, 11:15 p.m. UTC | #7
On Wed, Jul 22, 2015 at 3:55 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> On Tue, Jul 21, 2015 at 09:04:22PM -0700, Dan Williams wrote:
>> On Tue, Jul 21, 2015 at 4:58 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
>> > On Sun, Jul 19, 2015 at 08:18:23PM -0400, Dan Williams wrote:
>> >> diff --git a/include/linux/io.h b/include/linux/io.h
>> >> index 080a4fbf2ba4..2983b6e63970 100644
>> >> --- a/include/linux/io.h
>> >> +++ b/include/linux/io.h
>> >> @@ -192,4 +192,15 @@ static inline int arch_phys_wc_index(int handle)
>> >>  #endif
>> >>  #endif
>> >>
>> >> +enum {
>> >> +     MEMREMAP_WB = 1 << 0,
>> >> +     MEMREMAP_WT = 1 << 1,
>> >> +     MEMREMAP_WC = 1 << 2,
>> >> +     MEMREMAP_STRICT = 1 << 3,
>> >> +     MEMREMAP_CACHE = MEMREMAP_WB,
>> >> +};
>> >
>> > A few things:
>> >
>> > 1) You'll need MEMREMAP_UC now as well now.
>>
>> Why?  I don't think it fits.  If there are any I/O areas (non-memory)
>> in the range then it simply is not "memory" and should not be using
>> memremap().  In other words it seems like you really do need to heed
>> the __iomem annotation in the return value from ioremap_uc().
>
> One can use a similar argument for some areas of use of write-combining,
> what litmus test are you using to add a flag above? Why would WC be OK
> and not UC? WC is a cache hack on x86...

I should remove WC from the list for now, I'm not converting
ioremap_wc() to memremap at this time.

That said it's not the same argument.  A driver calling ioremap_wc()
is explicitly signing up to handle flushing the write-combine buffer
and the expectation is that there are no I/O ranges in the mapping
that would be affected by write combining.  An ioremap_uc() mapping
says "careful there are I/O sensitive registers in this range".

>> > 2) as you are doing all this sweep over architectures on this please
>> > also address the lack of ioremap_*() variant implemention to return
>> > NULL, ie not supported, because we've decided for now that so long
>> > as the semantics are not well defined we can't expect architectures
>> > to get it right unless they are doing the work themselves, and the
>> > old strategy of just #defin'ing a variant to iorempa_nocache() which
>> > folks tended to just can lead to issues. In your case since you are
>> > jumping to the flags implementation this might be knocking two birds
>> > with one stone.
>>
>> I'm not going to do a general sweep for this as the expectation that
>> ioremap silently falls back if a mapping type is not supported is
>> buried all over the place.
>
> But it does not. It cannot. The reason, as I noted in a commit now merged
> on tip, is that the default wrappers are nested under #ifndef CONFIG_MMU,
> whereas really this should have just been used for ioremap() and iounmap().
>
> That is, the ioremap_*() variants should have a definition even for !CONFIG_MMU,
> and since we actually don't want to enable sloppy defines of this the sensible
> defaults should be to return NULL on variants -- for both !CONFIG_MMU
> and for CONFIG_MMU. The way to fix then then is to move the default variant
> definitions out from #ifndef CONFIG_MMU and have them return NULL. We may
> be able to have them return something not-null by default always but we
> first need to beat to death the topic of semantics with all architecture
> folks to each that agreement. Until then the variants shoudl just return
> NULL encouraging arch developers to supply a proper implementation.

That clean up is orthogonal to memremap.  memremap will return NULL
for unsupported mapping types.

>> That said, new usages and conversions to
>> memremap() can be strict about this. For now, I'm only handling
>> ioremap_cache() and ioremap_wt() conversions.
>
> OK, if you are not going to do this let me know and I can do it.

Yes, go ahead.

>> > 3) For the case that architectures have no MMU we currently do a direct
>> > mapping such as what you try to describe to do with memremap(). I wonder
>> > if its worth it to then enable that code to just map to memremap(). That
>> > throws off your usage of CONFIG_ARCH_HAS_MEMREMAP if we want to repurpose
>> > that implementation for no the MMU case, unless of course you just have a
>> > __memremap() which does the default basic direct mapping implementation.
>>
>> Yes, in the next rev of this series I am having it fall back to direct
>> mappings where it makes sense.
>>
>> > 4) Since we are all blaming semantics on our woes I'd like to ask for
>> > some effort on semantics to be well defined. Semantics here sholud cover
>> > some form of Documentation but also sparse annotation checks and perhaps
>> > Coccinelle grammar rules for how things should be done. This should not only
>> > cover general use but also if there are calls which depend on a cache
>> > type to have been set. If we used sparse annotations it  may meen a
>> > different return type for each cache type.  Not sure if we want this.
>> > If we went with grammar rules I'm looking to for instance have in place
>> > rules on scripts/coccinelle which would allow developers to use
>>
>> memremap() explicitly does not want get into arch semantics debates.
>
> Why? The ioremap() cluster fuck seems like a good example to learn from.
>
> I was also under the impression you are going to provide a new API with flags
> to kill / make old ioremap() varaints use this new API with the flags passed?
> Is that not the case ?

Yes, I'll post the new series shortly once 0day says there are no
build regressions.

>> The pointer returned from memremap() is a "void *" that can be used as
>> normal memory.  If it is a normal pointer I don't see the role for
>> sparse annotation.
>
> Hrm, do we want to *prevent* certain to use the memory range when it is direct?

Can you give an example scenario?

>
>> > make coccicheck M=foo/
>> >
>> > to find issues. I can perhaps help with this, but we'd need to do a good
>> > sweep here to not only cover good territory but also get folks to agree
>> > on things.
>> >
>> > 5) This may be related to 4), not sure. There are aligment requirements we
>> > should probably iron out for architectures. How will we annotate these
>> > requirements or allow architectures to be pedantic over these requirements?
>>
>> What requirements beyond PAGE_SIZE alignment would we need to worry about?
>
> That's a big concern some folks expressed, architecture folks would know more.
>
> One last thing: if you are providing a new API to replace a series of old
> symbols that were used before (I though you were working towards this for all
> ioremap_*() variants) we have to take care to ensure that any symbol that is
> currently exported with EXPORT_SYMBOL_GPL() does not get an EXPORT_SYMBOL()
> wrapper-escape since we do not want proprietary drivers to make use these
> alternatives.

Yes, memremap() is inheriting the export level of ioremap_cache.
Luis Chamberlain July 23, 2015, 12:55 a.m. UTC | #8
On Wed, Jul 22, 2015 at 04:15:41PM -0700, Dan Williams wrote:
> On Wed, Jul 22, 2015 at 3:55 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> > On Tue, Jul 21, 2015 at 09:04:22PM -0700, Dan Williams wrote:
> >> On Tue, Jul 21, 2015 at 4:58 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> >> > On Sun, Jul 19, 2015 at 08:18:23PM -0400, Dan Williams wrote:
> >> >> diff --git a/include/linux/io.h b/include/linux/io.h
> >> >> index 080a4fbf2ba4..2983b6e63970 100644
> >> >> --- a/include/linux/io.h
> >> >> +++ b/include/linux/io.h
> >> >> @@ -192,4 +192,15 @@ static inline int arch_phys_wc_index(int handle)
> >> >>  #endif
> >> >>  #endif
> >> >>
> >> >> +enum {
> >> >> +     MEMREMAP_WB = 1 << 0,
> >> >> +     MEMREMAP_WT = 1 << 1,
> >> >> +     MEMREMAP_WC = 1 << 2,
> >> >> +     MEMREMAP_STRICT = 1 << 3,
> >> >> +     MEMREMAP_CACHE = MEMREMAP_WB,
> >> >> +};
> >> >
> >> > A few things:
> >> >
> >> > 1) You'll need MEMREMAP_UC now as well now.
> >>
> >> Why?  I don't think it fits.  If there are any I/O areas (non-memory)
> >> in the range then it simply is not "memory" and should not be using
> >> memremap().  In other words it seems like you really do need to heed
> >> the __iomem annotation in the return value from ioremap_uc().
> >
> > One can use a similar argument for some areas of use of write-combining,
> > what litmus test are you using to add a flag above? Why would WC be OK
> > and not UC? WC is a cache hack on x86...
> 
> I should remove WC from the list for now, I'm not converting
> ioremap_wc() to memremap at this time.

Do you have plans to do so? If so can you elaborate on such plans ?

> That said it's not the same argument.  A driver calling ioremap_wc()
> is explicitly signing up to handle flushing the write-combine buffer
> and the expectation is that there are no I/O ranges in the mapping
> that would be affected by write combining.

The thing is the semantics even for write-combining need to be discussed
further, write-combining for kernel interfaces are screwed up for at least one
architecture because we didn't get things right or document expectations
correctly amongst other things, so the above is not sufficient to define
write-combining for now.

> An ioremap_uc() mapping says "careful there are I/O sensitive registers in this range".

Yes but we have ioremap_nocache(). I added ioremap_uc() and the goal behind ioremap_uc()
was to enable efficient use of absolutely no cache at all, and this was particularly
aimed at taking advantage of strong UC on PAT. This was also a compromise to help
with the transition over on x86 from UC- to UC in the long term as we also convert
all write-combining variant API users to (old mtrr_add(), now arch_phys_wc_add())
over from ioremap_nocache() to ioremap_wc() and ensure their MMIO registers
keep using ioremap_nocache() -- but for corner cases with old MTRR hacks we need
ioremap_uc() to also avoid regressions when we do the flip on x86 from UC- to
UC. How we want to map this to other architectures remains to be discovered, but
perhaps they don't care.

Anyway so why is WB and WT allowed if you are going to remove WC and not add UC?

> >> > 2) as you are doing all this sweep over architectures on this please
> >> > also address the lack of ioremap_*() variant implemention to return
> >> > NULL, ie not supported, because we've decided for now that so long
> >> > as the semantics are not well defined we can't expect architectures
> >> > to get it right unless they are doing the work themselves, and the
> >> > old strategy of just #defin'ing a variant to iorempa_nocache() which
> >> > folks tended to just can lead to issues. In your case since you are
> >> > jumping to the flags implementation this might be knocking two birds
> >> > with one stone.
> >>
> >> I'm not going to do a general sweep for this as the expectation that
> >> ioremap silently falls back if a mapping type is not supported is
> >> buried all over the place.
> >
> > But it does not. It cannot. The reason, as I noted in a commit now merged
> > on tip, is that the default wrappers are nested under #ifndef CONFIG_MMU,
> > whereas really this should have just been used for ioremap() and iounmap().
> >
> > That is, the ioremap_*() variants should have a definition even for !CONFIG_MMU,
> > and since we actually don't want to enable sloppy defines of this the sensible
> > defaults should be to return NULL on variants -- for both !CONFIG_MMU
> > and for CONFIG_MMU. The way to fix then then is to move the default variant
> > definitions out from #ifndef CONFIG_MMU and have them return NULL. We may
> > be able to have them return something not-null by default always but we
> > first need to beat to death the topic of semantics with all architecture
> > folks to each that agreement. Until then the variants shoudl just return
> > NULL encouraging arch developers to supply a proper implementation.
> 
> That clean up is orthogonal to memremap.  memremap will return NULL
> for unsupported mapping types.

It can be if you are not doing a major long term cleanup...

> >> That said, new usages and conversions to
> >> memremap() can be strict about this. For now, I'm only handling
> >> ioremap_cache() and ioremap_wt() conversions.
> >
> > OK, if you are not going to do this let me know and I can do it.
> 
> Yes, go ahead.

OK..

> >> > 3) For the case that architectures have no MMU we currently do a direct
> >> > mapping such as what you try to describe to do with memremap(). I wonder
> >> > if its worth it to then enable that code to just map to memremap(). That
> >> > throws off your usage of CONFIG_ARCH_HAS_MEMREMAP if we want to repurpose
> >> > that implementation for no the MMU case, unless of course you just have a
> >> > __memremap() which does the default basic direct mapping implementation.
> >>
> >> Yes, in the next rev of this series I am having it fall back to direct
> >> mappings where it makes sense.
> >>
> >> > 4) Since we are all blaming semantics on our woes I'd like to ask for
> >> > some effort on semantics to be well defined. Semantics here sholud cover
> >> > some form of Documentation but also sparse annotation checks and perhaps
> >> > Coccinelle grammar rules for how things should be done. This should not only
> >> > cover general use but also if there are calls which depend on a cache
> >> > type to have been set. If we used sparse annotations it  may meen a
> >> > different return type for each cache type.  Not sure if we want this.
> >> > If we went with grammar rules I'm looking to for instance have in place
> >> > rules on scripts/coccinelle which would allow developers to use
> >>
> >> memremap() explicitly does not want get into arch semantics debates.
> >
> > Why? The ioremap() cluster fuck seems like a good example to learn from.
> >
> > I was also under the impression you are going to provide a new API with flags
> > to kill / make old ioremap() varaints use this new API with the flags passed?
> > Is that not the case ?
> 
> Yes, I'll post the new series shortly once 0day says there are no
> build regressions.

Then what API was it you intend on using for that? My point is that semantics
should matter then for that -- strongly.

> >> The pointer returned from memremap() is a "void *" that can be used as
> >> normal memory.  If it is a normal pointer I don't see the role for
> >> sparse annotation.
> >
> > Hrm, do we want to *prevent* certain to use the memory range when it is direct?
> 
> Can you give an example scenario?

arch_phys_wc_add(), set_memory_wc(), set_memory_*(). There may be others. The
mem region API Toshi just fixed is another.

> >> > make coccicheck M=foo/
> >> >
> >> > to find issues. I can perhaps help with this, but we'd need to do a good
> >> > sweep here to not only cover good territory but also get folks to agree
> >> > on things.
> >> >
> >> > 5) This may be related to 4), not sure. There are aligment requirements we
> >> > should probably iron out for architectures. How will we annotate these
> >> > requirements or allow architectures to be pedantic over these requirements?
> >>
> >> What requirements beyond PAGE_SIZE alignment would we need to worry about?
> >
> > That's a big concern some folks expressed, architecture folks would know more.
> >
> > One last thing: if you are providing a new API to replace a series of old
> > symbols that were used before (I though you were working towards this for all
> > ioremap_*() variants) we have to take care to ensure that any symbol that is
> > currently exported with EXPORT_SYMBOL_GPL() does not get an EXPORT_SYMBOL()
> > wrapper-escape since we do not want proprietary drivers to make use these
> > alternatives.
> 
> Yes, memremap() is inheriting the export level of ioremap_cache.

Like I said we need to take care to ensure EXPORT_SYMBOL_GPL() is not used
then by new wrappers to enable proprietary drivers to make use of those
symbols through the new wrapper. One idea may be to work on symbol namespaces,
if we cannot address this through static inlines for old callers and just
making the new APIs EXPORT_SYMBOL_GPL().

  Luis
Dan Williams July 23, 2015, 1:02 a.m. UTC | #9
On Wed, Jul 22, 2015 at 5:55 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> On Wed, Jul 22, 2015 at 04:15:41PM -0700, Dan Williams wrote:
> Anyway so why is WB and WT allowed if you are going to remove WC and not add UC?
>

Because all existing usages of WB and WT mappings are clearly free of
I/O side effect concerns.  The WC and now UC usages are a muddy mix of
"sometimes there's I/O side effects, but it depends by arch and
driver".
diff mbox

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1c5021002fe4..1d64aae0a226 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -3,6 +3,7 @@  config ARM
 	default y
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_MEMREMAP
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 318175f62c24..305def28385f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -6,6 +6,7 @@  config ARM64
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_MEMREMAP
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 9d4aa18f2a82..ed363a0202b9 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -290,8 +290,7 @@  static int __init arm64_enable_runtime_services(void)
 	pr_info("Remapping and enabling EFI services.\n");
 
 	mapsize = memmap.map_end - memmap.map;
-	memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map,
-						   mapsize);
+	memmap.map = memremap(memmap.phys_map, mapsize, MEMREMAP_CACHE);
 	if (!memmap.map) {
 		pr_err("Failed to remap EFI memory map\n");
 		return -1;
@@ -299,8 +298,8 @@  static int __init arm64_enable_runtime_services(void)
 	memmap.map_end = memmap.map + mapsize;
 	efi.memmap = &memmap;
 
-	efi.systab = (__force void *)ioremap_cache(efi_system_table,
-						   sizeof(efi_system_table_t));
+	efi.systab = memremap(efi_system_table, sizeof(efi_system_table_t),
+			MEMREMAP_CACHE);
 	if (!efi.systab) {
 		pr_err("Failed to remap EFI System Table\n");
 		return -1;
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index aef3605a8c47..b9caf6cd44e6 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -73,19 +73,19 @@  static int smp_spin_table_cpu_init(unsigned int cpu)
 
 static int smp_spin_table_cpu_prepare(unsigned int cpu)
 {
-	__le64 __iomem *release_addr;
+	__le64 *release_addr;
 
 	if (!cpu_release_addr[cpu])
 		return -ENODEV;
 
 	/*
 	 * The cpu-release-addr may or may not be inside the linear mapping.
-	 * As ioremap_cache will either give us a new mapping or reuse the
-	 * existing linear mapping, we can use it to cover both cases. In
-	 * either case the memory will be MT_NORMAL.
+	 * As memremap will either give us a new mapping or reuse the existing
+	 * linear mapping, we can use it to cover both cases. In either case
+	 * the memory will be MT_NORMAL.
 	 */
-	release_addr = ioremap_cache(cpu_release_addr[cpu],
-				     sizeof(*release_addr));
+	release_addr = memremap(cpu_release_addr[cpu], sizeof(*release_addr),
+			MEMREMAP_CACHE);
 	if (!release_addr)
 		return -ENOMEM;
 
@@ -97,15 +97,14 @@  static int smp_spin_table_cpu_prepare(unsigned int cpu)
 	 * the boot protocol.
 	 */
 	writeq_relaxed(__pa(secondary_holding_pen), release_addr);
-	__flush_dcache_area((__force void *)release_addr,
-			    sizeof(*release_addr));
+	__flush_dcache_area(release_addr, sizeof(*release_addr));
 
 	/*
 	 * Send an event to wake up the secondary CPU.
 	 */
 	sev();
 
-	iounmap(release_addr);
+	memunmap(release_addr);
 
 	return 0;
 }
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 34aa19352dc1..2373bf183527 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -14,6 +14,7 @@  config FRV
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
 	select HAVE_DEBUG_STACKOVERFLOW
+	select ARCH_HAS_MEMREMAP
 
 config ZONE_DMA
 	bool
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..831b1be8c43d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -23,6 +23,7 @@  config M68K
 	select MODULES_USE_ELF_RELA
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
+	select ARCH_HAS_MEMREMAP
 
 config RWSEM_GENERIC_SPINLOCK
 	bool
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..5669fe3eb807 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -24,6 +24,7 @@  config METAG
 	select HAVE_PERF_EVENTS
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNDERSCORE_SYMBOL_PREFIX
+	select ARCH_HAS_MEMREMAP
 	select IRQ_DOMAIN
 	select MODULES_USE_ELF_RELA
 	select OF
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index aab7e46cadd5..ac9da90931b9 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -59,6 +59,7 @@  config MIPS
 	select SYSCTL_EXCEPTION_TRACE
 	select HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select HAVE_IRQ_TIME_ACCOUNTING
+	select ARCH_HAS_MEMREMAP
 
 menu "Machine selection"
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5ef27113b898..50889c7b5c81 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -89,6 +89,7 @@  config PPC
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select BINFMT_ELF
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_MEMREMAP
 	select OF
 	select OF_EARLY_FLATTREE
 	select OF_RESERVED_MEM
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..80eb19c8b62d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@  config X86
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_MEMREMAP
 	select ARCH_HAS_PMEM_API
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index afa64adb75ee..b87ca059e5fc 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -31,19 +31,19 @@  ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+	vaddr = memremap(pfn << PAGE_SHIFT, PAGE_SIZE, MEMREMAP_CACHE);
 	if (!vaddr)
 		return -ENOMEM;
 
 	if (userbuf) {
 		if (copy_to_user(buf, vaddr + offset, csize)) {
-			iounmap(vaddr);
+			memunmap(vaddr);
 			return -EFAULT;
 		}
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
 	set_iounmap_nonlazy();
-	iounmap(vaddr);
+	memunmap(vaddr);
 	return csize;
 }
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index dc1404bf8e4b..a2248482c14b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -49,7 +49,7 @@  static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 	pa = node->paddr + sizeof(struct setup_data) + pos;
 	pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
 	if (PageHighMem(pg)) {
-		p = ioremap_cache(pa, count);
+		p = memremap(pa, count, MEMREMAP_CACHE);
 		if (!p)
 			return -ENXIO;
 	} else
@@ -58,7 +58,7 @@  static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 	remain = copy_to_user(user_buf, p, count);
 
 	if (PageHighMem(pg))
-		iounmap(p);
+		memunmap(p);
 
 	if (remain)
 		return -EFAULT;
@@ -128,7 +128,7 @@  static int __init create_setup_data_nodes(struct dentry *parent)
 
 		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
 		if (PageHighMem(pg)) {
-			data = ioremap_cache(pa_data, sizeof(*data));
+			data = memremap(pa_data, sizeof(*data), MEMREMAP_CACHE);
 			if (!data) {
 				kfree(node);
 				error = -ENXIO;
@@ -144,7 +144,7 @@  static int __init create_setup_data_nodes(struct dentry *parent)
 		pa_data = data->next;
 
 		if (PageHighMem(pg))
-			iounmap(data);
+			memunmap(data);
 		if (error)
 			goto err_dir;
 		no++;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index c2bedaea11f7..d2670bb75a08 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@ 
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 
-#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@  static int get_setup_data_paddr(int nr, u64 *paddr)
 			*paddr = pa_data;
 			return 0;
 		}
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_CACHE);
 		if (!data)
 			return -ENOMEM;
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -97,17 +97,17 @@  static int __init get_setup_data_size(int nr, size_t *size)
 	u64 pa_data = boot_params.hdr.setup_data;
 
 	while (pa_data) {
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_CACHE);
 		if (!data)
 			return -ENOMEM;
 		if (nr == i) {
 			*size = data->len;
-			iounmap(data);
+			memunmap(data);
 			return 0;
 		}
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -127,12 +127,12 @@  static ssize_t type_show(struct kobject *kobj,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_CACHE);
 	if (!data)
 		return -ENOMEM;
 
 	ret = sprintf(buf, "0x%x\n", data->type);
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -154,7 +154,7 @@  static ssize_t setup_data_data_read(struct file *fp,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_CACHE);
 	if (!data)
 		return -ENOMEM;
 
@@ -170,15 +170,15 @@  static ssize_t setup_data_data_read(struct file *fp,
 		goto out;
 
 	ret = count;
-	p = ioremap_cache(paddr + sizeof(*data), data->len);
+	p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_CACHE);
 	if (!p) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	memcpy(buf, p + off, count);
-	iounmap(p);
+	memunmap(p);
 out:
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -250,13 +250,13 @@  static int __init get_setup_data_total_num(u64 pa_data, int *nr)
 	*nr = 0;
 	while (pa_data) {
 		*nr += 1;
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_CACHE);
 		if (!data) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 	}
 
 out:
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index ccf5c867b2eb..4920f735f551 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -401,12 +401,10 @@  void *xlate_dev_mem_ptr(phys_addr_t phys)
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	vaddr = ioremap_cache(start, PAGE_SIZE);
-	/* Only add the offset on success and return NULL if the ioremap() failed: */
+	vaddr = memremap(start, PAGE_SIZE, MEMREMAP_CACHE);
 	if (vaddr)
-		vaddr += offset;
-
-	return vaddr;
+		return vaddr + offset;
+	return NULL;
 }
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -414,7 +412,7 @@  void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 	if (page_is_ram(phys >> PAGE_SHIFT))
 		return;
 
-	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+	memunmap((void *)((unsigned long)addr & PAGE_MASK));
 }
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index e5b872ba2484..da1078692c41 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -3,6 +3,7 @@  config ZONE_DMA
 
 config XTENSA
 	def_bool y
+	select ARCH_HAS_MEMREMAP
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index a095d4f858da..d4992fea6994 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -318,7 +318,8 @@  static int __einj_error_trigger(u64 trigger_paddr, u32 type,
 			    sizeof(*trigger_tab) - 1);
 		goto out;
 	}
-	trigger_tab = ioremap_cache(trigger_paddr, sizeof(*trigger_tab));
+	trigger_tab = memremap(trigger_paddr, sizeof(*trigger_tab),
+			MEMREMAP_CACHE);
 	if (!trigger_tab) {
 		pr_err(EINJ_PFX "Failed to map trigger table!\n");
 		goto out_rel_header;
@@ -346,8 +347,8 @@  static int __einj_error_trigger(u64 trigger_paddr, u32 type,
 		       (unsigned long long)trigger_paddr + table_size - 1);
 		goto out_rel_header;
 	}
-	iounmap(trigger_tab);
-	trigger_tab = ioremap_cache(trigger_paddr, table_size);
+	memunmap(trigger_tab);
+	trigger_tab = memremap(trigger_paddr, table_size, MEMREMAP_CACHE);
 	if (!trigger_tab) {
 		pr_err(EINJ_PFX "Failed to map trigger table!\n");
 		goto out_rel_entry;
@@ -409,7 +410,7 @@  out_rel_header:
 	release_mem_region(trigger_paddr, sizeof(*trigger_tab));
 out:
 	if (trigger_tab)
-		iounmap(trigger_tab);
+		memunmap(trigger_tab);
 
 	return rc;
 }
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 3670bbab57a3..dc49b0b42d65 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -77,7 +77,7 @@  static struct acpi_table_erst *erst_tab;
 static struct erst_erange {
 	u64 base;
 	u64 size;
-	void __iomem *vaddr;
+	void *vaddr;
 	u32 attr;
 } erst_erange;
 
@@ -1185,8 +1185,8 @@  static int __init erst_init(void)
 		goto err_unmap_reg;
 	}
 	rc = -ENOMEM;
-	erst_erange.vaddr = ioremap_cache(erst_erange.base,
-					  erst_erange.size);
+	erst_erange.vaddr = memremap(erst_erange.base, erst_erange.size,
+			MEMREMAP_CACHE);
 	if (!erst_erange.vaddr)
 		goto err_release_erange;
 
diff --git a/drivers/firmware/google/memconsole.c b/drivers/firmware/google/memconsole.c
index 2f569aaed4c7..2b8160d12ad6 100644
--- a/drivers/firmware/google/memconsole.c
+++ b/drivers/firmware/google/memconsole.c
@@ -52,14 +52,15 @@  static ssize_t memconsole_read(struct file *filp, struct kobject *kobp,
 	char *memconsole;
 	ssize_t ret;
 
-	memconsole = ioremap_cache(memconsole_baseaddr, memconsole_length);
+	memconsole = memremap(memconsole_baseaddr, memconsole_length,
+			MEMREMAP_CACHE);
 	if (!memconsole) {
-		pr_err("memconsole: ioremap_cache failed\n");
+		pr_err("memconsole: memremap failed\n");
 		return -ENOMEM;
 	}
 	ret = memory_read_from_buffer(buf, count, &pos, memconsole,
 				      memconsole_length);
-	iounmap(memconsole);
+	memunmap(memconsole);
 	return ret;
 }
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 5a31bf3a4024..04e53eab71c5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -668,6 +668,15 @@  extern void devm_free_pages(struct device *dev, unsigned long addr);
 
 void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res);
 
+/*
+ * NOTE: unlike memremap(), which returns NULL on failure, this inherits the
+ * devm_ioremap_resource() convention of returning ERR_PTR() -- callers must
+ * check with IS_ERR(), not a NULL test.
+ */
+static inline void *devm_memremap_resource(struct device *dev, struct resource *res)
+{
+	return (void __force *) devm_ioremap_resource(dev, res);
+}
+
 /* allows to add/remove a custom action to devres stack */
 int devm_add_action(struct device *dev, void (*action)(void *), void *data);
 void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
diff --git a/include/linux/io.h b/include/linux/io.h
index 080a4fbf2ba4..2983b6e63970 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -192,4 +192,15 @@  static inline int arch_phys_wc_index(int handle)
 #endif
 #endif
 
+enum {
+	MEMREMAP_WB = 1 << 0,
+	MEMREMAP_WT = 1 << 1,
+	MEMREMAP_WC = 1 << 2,
+	MEMREMAP_STRICT = 1 << 3,
+	MEMREMAP_CACHE = MEMREMAP_WB,
+};
+
+extern void *memremap(resource_size_t offset, size_t size, unsigned long flags);
+extern void memunmap(void *addr);
+
 #endif /* _LINUX_IO_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index fed052a1bc9f..a96376be513a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,7 +23,7 @@ 
 #include <linux/pfn.h>
 #include <linux/mm.h>
 #include <linux/resource_ext.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 
 struct resource ioport_resource = {
@@ -528,6 +528,55 @@  int region_is_ram(resource_size_t start, unsigned long size)
 	return ret;
 }
 
+#ifdef CONFIG_ARCH_HAS_MEMREMAP
+/*
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ */
+static bool memremap_valid(resource_size_t offset, size_t size)
+{
+	if (region_is_ram(offset, size) != 0) {
+		WARN_ONCE(1, "memremap attempted on ram %pa size: %zu\n",
+				&offset, size);
+		return false;
+	}
+	return true;
+}
+
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+	void __iomem *addr = NULL;
+
+	if (!memremap_valid(offset, size))
+		return NULL;
+
+	/* try CACHE then WT until one maps; other flags (e.g. MEMREMAP_WC) are not handled */
+	if (flags & MEMREMAP_CACHE) {
+		if (flags & MEMREMAP_STRICT)
+			addr = strict_ioremap_cache(offset, size);
+		else
+			addr = ioremap_cache(offset, size);
+	}
+
+	if (!addr && (flags & MEMREMAP_WT)) {
+		if (flags & MEMREMAP_STRICT)
+			addr = strict_ioremap_wt(offset, size);
+		else
+			addr = ioremap_wt(offset, size);
+	}
+
+	return (void __force *) addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+	iounmap((void __iomem __force *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+#endif /* CONFIG_ARCH_HAS_MEMREMAP */
+
 void __weak arch_remove_reservations(struct resource *avail)
 {
 }
diff --git a/lib/Kconfig b/lib/Kconfig
index 3a2ef67db6c7..097b99073924 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -526,7 +526,10 @@  source "lib/fonts/Kconfig"
 #
 
 config ARCH_HAS_SG_CHAIN
-	def_bool n
+	bool
+
+config ARCH_HAS_MEMREMAP
+	bool
 
 config ARCH_HAS_PMEM_API
 	bool