[RFC,v2,08/32] x86: Use PAGE_KERNEL protection for ioremap of memory page

Message ID 148846761276.2349.4899767672892365544.stgit@brijesh-build-machine (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu

Commit Message

Brijesh Singh March 2, 2017, 3:13 p.m. UTC
From: Tom Lendacky <thomas.lendacky@amd.com>

In order for memory pages to be properly mapped when SEV is active, we
need to use the PAGE_KERNEL protection attribute as the base protection.
This will ensure that memory mapping of, e.g. ACPI tables, receives the
proper mapping attributes.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/mm/ioremap.c |    8 ++++++++
 include/linux/mm.h    |    1 +
 kernel/resource.c     |   40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

Comments

Borislav Petkov March 7, 2017, 2:59 p.m. UTC | #1
On Thu, Mar 02, 2017 at 10:13:32AM -0500, Brijesh Singh wrote:
> From: Tom Lendacky <thomas.lendacky@amd.com>
> 
> In order for memory pages to be properly mapped when SEV is active, we
> need to use the PAGE_KERNEL protection attribute as the base protection.
> This will ensure that memory mapping of, e.g. ACPI tables, receives the
> proper mapping attributes.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---

> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index c400ab5..481c999 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -151,7 +151,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
>                 pcm = new_pcm;
>         }
> 
> +       /*
> +        * If the page being mapped is in memory and SEV is active then
> +        * make sure the memory encryption attribute is enabled in the
> +        * resulting mapping.
> +        */
>         prot = PAGE_KERNEL_IO;
> +       if (sev_active() && page_is_mem(pfn))

Hmm, a resource tree walk per ioremap call. This could get expensive for
ioremap-heavy workloads.

__ioremap_caller() gets called here during boot 55 times so not a whole
lot but I wouldn't be surprised if there were some nasty use cases which
ioremap a lot.

...

> diff --git a/kernel/resource.c b/kernel/resource.c
> index 9b5f044..db56ba3 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -518,6 +518,46 @@ int __weak page_is_ram(unsigned long pfn)
>  }
>  EXPORT_SYMBOL_GPL(page_is_ram);
>  
> +/*
> + * This function returns true if the target memory is marked as
> + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
> + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
> + */
> +static int walk_mem_range(unsigned long start_pfn, unsigned long nr_pages)
> +{
> +	struct resource res;
> +	unsigned long pfn, end_pfn;
> +	u64 orig_end;
> +	int ret = -1;
> +
> +	res.start = (u64) start_pfn << PAGE_SHIFT;
> +	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
> +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +	orig_end = res.end;
> +	while ((res.start < res.end) &&
> +		(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
> +		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +		end_pfn = (res.end + 1) >> PAGE_SHIFT;
> +		if (end_pfn > pfn)
> +			ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
> +		if (ret)
> +			break;
> +		res.start = res.end + 1;
> +		res.end = orig_end;
> +	}
> +	return ret;
> +}

So the relevant difference between this one and walk_system_ram_range()
is this:

-			ret = (*func)(pfn, end_pfn - pfn, arg);
+			ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;

so it seems to me you can have your own *func() pointer which does that
IORES_DESC_NONE comparison. And then you can define your own workhorse
__walk_memory_range() which gets called by both walk_mem_range() and
walk_system_ram_range() instead of almost duplicating them.

And looking at walk_system_ram_res(), that one looks similar too except
the pfn computation. But AFAICT the pfn/end_pfn things are computed from
res.start and res.end so it looks to me like all those three functions
are crying for unification...
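
A minimal sketch of the shape being suggested here, assuming the
__walk_memory_range() name from the comment above; the flags parameter
and the check_mem_desc() helper are illustrative additions, not the
code that was eventually merged:

	/*
	 * Sketch only: a shared workhorse that walks the matching iomem
	 * resources and hands each one to a callback.  walk_mem_range()
	 * would pass IORESOURCE_MEM | IORESOURCE_BUSY as flags,
	 * walk_system_ram_range() IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY.
	 */
	static int __walk_memory_range(u64 start, u64 end, unsigned long flags,
				       void *arg,
				       int (*func)(struct resource *, void *))
	{
		struct resource res;
		u64 orig_end;
		int ret = -1;

		res.start = start;
		res.end = end;
		res.flags = flags;
		orig_end = res.end;

		while ((res.start < res.end) &&
		       (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
			ret = (*func)(&res, arg);
			if (ret)
				break;
			res.start = res.end + 1;
			res.end = orig_end;
		}

		return ret;
	}

	/* Callback carrying only the IORES_DESC_NONE comparison. */
	static int check_mem_desc(struct resource *res, void *arg)
	{
		unsigned long pfn = PFN_UP(res->start);
		unsigned long end_pfn = PFN_DOWN(res->end + 1);

		if (end_pfn > pfn)
			return (res->desc != IORES_DESC_NONE) ? 1 : 0;

		return 0;
	}

walk_mem_range() would then reduce to calling __walk_memory_range() with
check_mem_desc(), and walk_system_ram_range() to a callback that invokes
its existing *func() on the computed pfn range.
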
Tom Lendacky March 16, 2017, 8:04 p.m. UTC | #2
On 3/7/2017 8:59 AM, Borislav Petkov wrote:
> On Thu, Mar 02, 2017 at 10:13:32AM -0500, Brijesh Singh wrote:
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> In order for memory pages to be properly mapped when SEV is active, we
>> need to use the PAGE_KERNEL protection attribute as the base protection.
>> This will ensure that memory mapping of, e.g. ACPI tables, receives the
>> proper mapping attributes.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>
>> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
>> index c400ab5..481c999 100644
>> --- a/arch/x86/mm/ioremap.c
>> +++ b/arch/x86/mm/ioremap.c
>> @@ -151,7 +151,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
>>                 pcm = new_pcm;
>>         }
>>
>> +       /*
>> +        * If the page being mapped is in memory and SEV is active then
>> +        * make sure the memory encryption attribute is enabled in the
>> +        * resulting mapping.
>> +        */
>>         prot = PAGE_KERNEL_IO;
>> +       if (sev_active() && page_is_mem(pfn))
>
> Hmm, a resource tree walk per ioremap call. This could get expensive for
> ioremap-heavy workloads.
>
> __ioremap_caller() gets called here during boot 55 times so not a whole
> lot but I wouldn't be surprised if there were some nasty use cases which
> ioremap a lot.
>
> ...
>
>> diff --git a/kernel/resource.c b/kernel/resource.c
>> index 9b5f044..db56ba3 100644
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
>> @@ -518,6 +518,46 @@ int __weak page_is_ram(unsigned long pfn)
>>  }
>>  EXPORT_SYMBOL_GPL(page_is_ram);
>>
>> +/*
>> + * This function returns true if the target memory is marked as
>> + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
>> + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
>> + */
>> +static int walk_mem_range(unsigned long start_pfn, unsigned long nr_pages)
>> +{
>> +	struct resource res;
>> +	unsigned long pfn, end_pfn;
>> +	u64 orig_end;
>> +	int ret = -1;
>> +
>> +	res.start = (u64) start_pfn << PAGE_SHIFT;
>> +	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
>> +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
>> +	orig_end = res.end;
>> +	while ((res.start < res.end) &&
>> +		(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
>> +		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
>> +		end_pfn = (res.end + 1) >> PAGE_SHIFT;
>> +		if (end_pfn > pfn)
>> +			ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>> +		if (ret)
>> +			break;
>> +		res.start = res.end + 1;
>> +		res.end = orig_end;
>> +	}
>> +	return ret;
>> +}
>
> So the relevant difference between this one and walk_system_ram_range()
> is this:
>
> -			ret = (*func)(pfn, end_pfn - pfn, arg);
> +			ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>
> so it seems to me you can have your own *func() pointer which does that
> IORES_DESC_NONE comparison. And then you can define your own workhorse
> __walk_memory_range() which gets called by both walk_mem_range() and
> walk_system_ram_range() instead of almost duplicating them.
>
> And looking at walk_system_ram_res(), that one looks similar too except
> the pfn computation. But AFAICT the pfn/end_pfn things are computed from
> res.start and res.end so it looks to me like all those three functions
> are crying for unification...

I'll take a look at what it takes to consolidate these with a pre-patch. 
Then I'll add the new support.

Thanks,
Tom

>

Tom Lendacky March 17, 2017, 2:32 p.m. UTC | #3
On 3/16/2017 3:04 PM, Tom Lendacky wrote:
> On 3/7/2017 8:59 AM, Borislav Petkov wrote:
>> On Thu, Mar 02, 2017 at 10:13:32AM -0500, Brijesh Singh wrote:
>>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>>
>>> In order for memory pages to be properly mapped when SEV is active, we
>>> need to use the PAGE_KERNEL protection attribute as the base protection.
>>> This will ensure that memory mapping of, e.g. ACPI tables, receives the
>>> proper mapping attributes.
>>>
>>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>>> ---
>>
>>> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
>>> index c400ab5..481c999 100644
>>> --- a/arch/x86/mm/ioremap.c
>>> +++ b/arch/x86/mm/ioremap.c
>>> @@ -151,7 +151,15 @@ static void __iomem
>>> *__ioremap_caller(resource_size_t phys_addr,
>>>                 pcm = new_pcm;
>>>         }
>>>
>>> +       /*
>>> +        * If the page being mapped is in memory and SEV is active then
>>> +        * make sure the memory encryption attribute is enabled in the
>>> +        * resulting mapping.
>>> +        */
>>>         prot = PAGE_KERNEL_IO;
>>> +       if (sev_active() && page_is_mem(pfn))
>>
>> Hmm, a resource tree walk per ioremap call. This could get expensive for
>> ioremap-heavy workloads.
>>
>> __ioremap_caller() gets called here during boot 55 times so not a whole
>> lot but I wouldn't be surprised if there were some nasty use cases which
>> ioremap a lot.
>>
>> ...
>>
>>> diff --git a/kernel/resource.c b/kernel/resource.c
>>> index 9b5f044..db56ba3 100644
>>> --- a/kernel/resource.c
>>> +++ b/kernel/resource.c
>>> @@ -518,6 +518,46 @@ int __weak page_is_ram(unsigned long pfn)
>>>  }
>>>  EXPORT_SYMBOL_GPL(page_is_ram);
>>>
>>> +/*
>>> + * This function returns true if the target memory is marked as
>>> + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
>>> + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
>>> + */
>>> +static int walk_mem_range(unsigned long start_pfn, unsigned long
>>> nr_pages)
>>> +{
>>> +    struct resource res;
>>> +    unsigned long pfn, end_pfn;
>>> +    u64 orig_end;
>>> +    int ret = -1;
>>> +
>>> +    res.start = (u64) start_pfn << PAGE_SHIFT;
>>> +    res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
>>> +    res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
>>> +    orig_end = res.end;
>>> +    while ((res.start < res.end) &&
>>> +        (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
>>> +        pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
>>> +        end_pfn = (res.end + 1) >> PAGE_SHIFT;
>>> +        if (end_pfn > pfn)
>>> +            ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>>> +        if (ret)
>>> +            break;
>>> +        res.start = res.end + 1;
>>> +        res.end = orig_end;
>>> +    }
>>> +    return ret;
>>> +}
>>
>> So the relevant difference between this one and walk_system_ram_range()
>> is this:
>>
>> -            ret = (*func)(pfn, end_pfn - pfn, arg);
>> +            ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>>
>> so it seems to me you can have your own *func() pointer which does that
>> IORES_DESC_NONE comparison. And then you can define your own workhorse
>> __walk_memory_range() which gets called by both walk_mem_range() and
>> walk_system_ram_range() instead of almost duplicating them.
>>
>> And looking at walk_system_ram_res(), that one looks similar too except
>> the pfn computation. But AFAICT the pfn/end_pfn things are computed from
>> res.start and res.end so it looks to me like all those three functions
>> are crying for unification...
>
> I'll take a look at what it takes to consolidate these with a pre-patch.
> Then I'll add the new support.

It looks pretty straightforward to combine walk_iomem_res_desc() and
walk_system_ram_res(). The walk_system_ram_range() function would fit
easily into this, also, except for the fact that the callback function
takes unsigned longs vs the u64s of the other functions.  Is it worth
modifying all of the callers of walk_system_ram_range() (which are only
about 8 locations) to change the callback functions to accept u64s in
order to consolidate the walk_system_ram_range() function, too?

Thanks,
Tom

>
> Thanks,
> Tom
>
>>
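
For reference, the signature mismatch described above looks like this in
the v4.11-era declarations (paraphrased from kernel/resource.c and
include/linux/ioport.h of that period):

	/* PFN-based walker: the callback receives page frame numbers. */
	int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
				  void *arg,
				  int (*func)(unsigned long pfn,
					      unsigned long nr_pages, void *arg));

	/* Range-based walkers: the callbacks receive a u64 physical range. */
	int walk_system_ram_res(u64 start, u64 end, void *arg,
				int (*func)(u64 start, u64 end, void *arg));

	int walk_iomem_res_desc(unsigned long desc, unsigned long flags,
				u64 start, u64 end, void *arg,
				int (*func)(u64 start, u64 end, void *arg));
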
Tom Lendacky March 17, 2017, 2:55 p.m. UTC | #4
On 3/17/2017 9:32 AM, Tom Lendacky wrote:
> On 3/16/2017 3:04 PM, Tom Lendacky wrote:
>> On 3/7/2017 8:59 AM, Borislav Petkov wrote:
>>> On Thu, Mar 02, 2017 at 10:13:32AM -0500, Brijesh Singh wrote:
>>>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>>>
>>>> In order for memory pages to be properly mapped when SEV is active, we
>>>> need to use the PAGE_KERNEL protection attribute as the base
>>>> protection.
>>>> This will ensure that memory mapping of, e.g. ACPI tables, receives the
>>>> proper mapping attributes.
>>>>
>>>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>>>> ---
>>>
>>>> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
>>>> index c400ab5..481c999 100644
>>>> --- a/arch/x86/mm/ioremap.c
>>>> +++ b/arch/x86/mm/ioremap.c
>>>> @@ -151,7 +151,15 @@ static void __iomem
>>>> *__ioremap_caller(resource_size_t phys_addr,
>>>>                 pcm = new_pcm;
>>>>         }
>>>>
>>>> +       /*
>>>> +        * If the page being mapped is in memory and SEV is active then
>>>> +        * make sure the memory encryption attribute is enabled in the
>>>> +        * resulting mapping.
>>>> +        */
>>>>         prot = PAGE_KERNEL_IO;
>>>> +       if (sev_active() && page_is_mem(pfn))
>>>
>>> Hmm, a resource tree walk per ioremap call. This could get expensive for
>>> ioremap-heavy workloads.
>>>
>>> __ioremap_caller() gets called here during boot 55 times so not a whole
>>> lot but I wouldn't be surprised if there were some nasty use cases which
>>> ioremap a lot.
>>>
>>> ...
>>>
>>>> diff --git a/kernel/resource.c b/kernel/resource.c
>>>> index 9b5f044..db56ba3 100644
>>>> --- a/kernel/resource.c
>>>> +++ b/kernel/resource.c
>>>> @@ -518,6 +518,46 @@ int __weak page_is_ram(unsigned long pfn)
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(page_is_ram);
>>>>
>>>> +/*
>>>> + * This function returns true if the target memory is marked as
>>>> + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
>>>> + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
>>>> + */
>>>> +static int walk_mem_range(unsigned long start_pfn, unsigned long
>>>> nr_pages)
>>>> +{
>>>> +    struct resource res;
>>>> +    unsigned long pfn, end_pfn;
>>>> +    u64 orig_end;
>>>> +    int ret = -1;
>>>> +
>>>> +    res.start = (u64) start_pfn << PAGE_SHIFT;
>>>> +    res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
>>>> +    res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
>>>> +    orig_end = res.end;
>>>> +    while ((res.start < res.end) &&
>>>> +        (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
>>>> +        pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
>>>> +        end_pfn = (res.end + 1) >> PAGE_SHIFT;
>>>> +        if (end_pfn > pfn)
>>>> +            ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>>>> +        if (ret)
>>>> +            break;
>>>> +        res.start = res.end + 1;
>>>> +        res.end = orig_end;
>>>> +    }
>>>> +    return ret;
>>>> +}
>>>
>>> So the relevant difference between this one and walk_system_ram_range()
>>> is this:
>>>
>>> -            ret = (*func)(pfn, end_pfn - pfn, arg);
>>> +            ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
>>>
>>> so it seems to me you can have your own *func() pointer which does that
>>> IORES_DESC_NONE comparison. And then you can define your own workhorse
>>> __walk_memory_range() which gets called by both walk_mem_range() and
>>> walk_system_ram_range() instead of almost duplicating them.
>>>
>>> And looking at walk_system_ram_res(), that one looks similar too except
>>> the pfn computation. But AFAICT the pfn/end_pfn things are computed from
>>> res.start and res.end so it looks to me like all those three functions
>>> are crying for unification...
>>
>> I'll take a look at what it takes to consolidate these with a pre-patch.
>> Then I'll add the new support.
>
> It looks pretty straightforward to combine walk_iomem_res_desc() and
> walk_system_ram_res(). The walk_system_ram_range() function would fit
> easily into this, also, except for the fact that the callback function
> takes unsigned longs vs the u64s of the other functions.  Is it worth
> modifying all of the callers of walk_system_ram_range() (which are only
> about 8 locations) to change the callback functions to accept u64s in
> order to consolidate the walk_system_ram_range() function, too?

The more I dig, the more I find that the changes keep expanding. I'll
leave walk_system_ram_range() out of the consolidation for now.

Thanks,
Tom

>
> Thanks,
> Tom
>
>>
>> Thanks,
>> Tom
>>
>>>
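
One way walk_system_ram_range() could have joined the consolidation
without touching its roughly eight callers is a small adapter that
bridges the two callback types. This is a hypothetical sketch (pfn_cb
and pfn_cb_adapter() are invented names, not code posted in this thread):

	struct pfn_cb {
		int (*func)(unsigned long pfn, unsigned long nr_pages, void *arg);
		void *arg;
	};

	/* Adapt a u64 [start, end] physical range to the PFN-based callback. */
	static int pfn_cb_adapter(u64 start, u64 end, void *arg)
	{
		struct pfn_cb *cb = arg;
		unsigned long pfn = PFN_UP(start);		/* round start up */
		unsigned long end_pfn = PFN_DOWN(end + 1);	/* round end down */

		if (end_pfn > pfn)
			return cb->func(pfn, end_pfn - pfn, cb->arg);

		return 0;
	}

walk_system_ram_range() would then wrap its func/arg pair in a struct
pfn_cb and pass pfn_cb_adapter() to the unified walker, at the cost of an
extra indirection per matching resource.
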

Patch

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c400ab5..481c999 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -151,7 +151,15 @@  static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		pcm = new_pcm;
 	}
 
+	/*
+	 * If the page being mapped is in memory and SEV is active then
+	 * make sure the memory encryption attribute is enabled in the
+	 * resulting mapping.
+	 */
 	prot = PAGE_KERNEL_IO;
+	if (sev_active() && page_is_mem(pfn))
+		prot = __pgprot(pgprot_val(prot) | _PAGE_ENC);
+
 	switch (pcm) {
 	case _PAGE_CACHE_MODE_UC:
 	default:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b84615b..825df27 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -445,6 +445,7 @@  static inline int get_page_unless_zero(struct page *page)
 }
 
 extern int page_is_ram(unsigned long pfn);
+extern int page_is_mem(unsigned long pfn);
 
 enum {
 	REGION_INTERSECTS,
diff --git a/kernel/resource.c b/kernel/resource.c
index 9b5f044..db56ba3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -518,6 +518,46 @@  int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+/*
+ * This function returns true if the target memory is marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
+ * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static int walk_mem_range(unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct resource res;
+	unsigned long pfn, end_pfn;
+	u64 orig_end;
+	int ret = -1;
+
+	res.start = (u64) start_pfn << PAGE_SHIFT;
+	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	orig_end = res.end;
+	while ((res.start < res.end) &&
+		(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
+		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		end_pfn = (res.end + 1) >> PAGE_SHIFT;
+		if (end_pfn > pfn)
+			ret = (res.desc != IORES_DESC_NONE) ? 1 : 0;
+		if (ret)
+			break;
+		res.start = res.end + 1;
+		res.end = orig_end;
+	}
+	return ret;
+}
+
+/*
+ * This generic page_is_mem() returns true if the specified address is
+ * registered as memory in the iomem_resource list.
+ */
+int __weak page_is_mem(unsigned long pfn)
+{
+	return walk_mem_range(pfn, 1) == 1;
+}
+EXPORT_SYMBOL_GPL(page_is_mem);
+
 /**
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address