
[v3,12/14] arm64: realm: Support nonsecure ITS emulation shared

Message ID: 20240605093006.145492-13-steven.price@arm.com
State: New, archived
Series: arm64: Support for running as a guest in Arm CCA

Commit Message

Steven Price June 5, 2024, 9:30 a.m. UTC
Within a realm guest the ITS is emulated by the host. This means the
allocations must have been made available to the host by a call to
set_memory_decrypted(). Introduce an allocation function which performs
this extra call.

Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v2:
 * Drop 'shared' from the new its_xxx function names as they are used
   for non-realm guests too.
 * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
   should do the right thing.
 * Drop a pointless (void *) cast.
---
 drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 23 deletions(-)

Comments

Marc Zyngier June 5, 2024, 1:39 p.m. UTC | #1
The subject line is... odd. I'd expect something like:

"irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor"

because nothing here should be CCA specific.

On Wed, 05 Jun 2024 10:30:04 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> Within a realm guest the ITS is emulated by the host. This means the
> allocations must have been made available to the host by a call to
> set_memory_decrypted(). Introduce an allocation function which performs
> this extra call.

This doesn't mention that this patch radically changes the allocation
of some tables.

> 
> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v2:
>  * Drop 'shared' from the new its_xxx function names as they are used
>    for non-realm guests too.
>  * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
>    should do the right thing.
>  * Drop a pointless (void *) cast.
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
>  1 file changed, 67 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 40ebf1726393..ca72f830f4cc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -18,6 +18,7 @@
>  #include <linux/irqdomain.h>
>  #include <linux/list.h>
>  #include <linux/log2.h>
> +#include <linux/mem_encrypt.h>
>  #include <linux/memblock.h>
>  #include <linux/mm.h>
>  #include <linux/msi.h>
> @@ -27,6 +28,7 @@
>  #include <linux/of_pci.h>
>  #include <linux/of_platform.h>
>  #include <linux/percpu.h>
> +#include <linux/set_memory.h>
>  #include <linux/slab.h>
>  #include <linux/syscore_ops.h>
>  
> @@ -163,6 +165,7 @@ struct its_device {
>  	struct its_node		*its;
>  	struct event_lpi_map	event_map;
>  	void			*itt;
> +	u32			itt_order;
>  	u32			nr_ites;
>  	u32			device_id;
>  	bool			shared;
> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
>  #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>  #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
>  
> +static struct page *its_alloc_pages_node(int node, gfp_t gfp,
> +					 unsigned int order)
> +{
> +	struct page *page;
> +
> +	page = alloc_pages_node(node, gfp, order);
> +
> +	if (page)
> +		set_memory_decrypted((unsigned long)page_address(page),
> +				     1 << order);

Please use BIT(order).
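
(For reference, BIT() comes via linux/bits.h and expands to an
unsigned long shift:

	#define BIT(nr)		(UL(1) << (nr))

so BIT(order) also reads better than an open-coded 1 << order.)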

> +	return page;
> +}
> +
> +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order)
> +{
> +	return its_alloc_pages_node(NUMA_NO_NODE, gfp, order);
> +}
> +
> +static void its_free_pages(void *addr, unsigned int order)
> +{
> +	set_memory_encrypted((unsigned long)addr, 1 << order);
> +	free_pages((unsigned long)addr, order);
> +}
> +
>  /*
>   * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
>   * always have vSGIs mapped.
> @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  {
>  	struct page *prop_page;
>  
> -	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
> +	prop_page = its_alloc_pages(gfp_flags,
> +				    get_order(LPI_PROPBASE_SZ));
>  	if (!prop_page)
>  		return NULL;
>  
> @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  
>  static void its_free_prop_table(struct page *prop_page)
>  {
> -	free_pages((unsigned long)page_address(prop_page),
> -		   get_order(LPI_PROPBASE_SZ));
> +	its_free_pages(page_address(prop_page),
> +		       get_order(LPI_PROPBASE_SZ));
>  }
>  
>  static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
> @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>  	}
>  
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
> +	page = its_alloc_pages_node(its->numa_node,
> +				    GFP_KERNEL | __GFP_ZERO, order);
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		/* 52bit PA is supported only when PageSize=64K */
>  		if (psz != SZ_64K) {
>  			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
> -			free_pages((unsigned long)base, order);
> +			its_free_pages(base, order);
>  			return -ENXIO;
>  		}
>  
> @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
>  		       &its->phys_base, its_base_type_string[type],
>  		       val, tmp);
> -		free_pages((unsigned long)base, order);
> +		its_free_pages(base, order);
>  		return -ENXIO;
>  	}
>  
> @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its)
>  
>  	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>  		if (its->tables[i].base) {
> -			free_pages((unsigned long)its->tables[i].base,
> -				   its->tables[i].order);
> +			its_free_pages(its->tables[i].base,
> +				       its->tables[i].order);
>  			its->tables[i].base = NULL;
>  		}
>  	}
> @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>  
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz));
> +		page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO,
> +				       get_order(psz));
>  		if (!page)
>  			return false;
>  
> @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void)
>  
>  	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
>  		 np, npg, psz, epp, esz);
> -	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
> +	page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO,
> +			       get_order(np * PAGE_SIZE));
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>  	struct page *pend_page;
>  
> -	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
> -				get_order(LPI_PENDBASE_SZ));
> +	pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO,
> +				    get_order(LPI_PENDBASE_SZ));
>  	if (!pend_page)
>  		return NULL;
>  
> @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  
>  static void its_free_pending_table(struct page *pt)
>  {
> -	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
> +	its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ));
>  }
>  
>  /*
> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its,
>  
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> -					get_order(baser->psz));
> +		page = its_alloc_pages_node(its->numa_node,
> +					    GFP_KERNEL | __GFP_ZERO,
> +					    get_order(baser->psz));
>  		if (!page)
>  			return false;
>  
> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	unsigned long *lpi_map = NULL;
>  	unsigned long flags;
>  	u16 *col_map = NULL;
> +	struct page *page;
>  	void *itt;
> +	int itt_order;
>  	int lpi_base;
>  	int nr_lpis;
>  	int nr_ites;
> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	if (WARN_ON(!is_power_of_2(nvecs)))
>  		nvecs = roundup_pow_of_two(nvecs);
>  
> -	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
>  	/*
>  	 * Even if the device wants a single LPI, the ITT must be
>  	 * sized as a power of two (and you need at least one bit...).
> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	nr_ites = max(2, nvecs);
>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
> +	itt_order = get_order(sz);
> +	page = its_alloc_pages_node(its->numa_node,
> +				    GFP_KERNEL | __GFP_ZERO,
> +				    itt_order);

So we go from an allocation that was so far measured in *bytes* to
something that is now at least a page. Per device. This seems a bit
excessive to me, especially when it isn't conditioned on anything and
is now imposed on all platforms, including the non-CCA systems (which
are exactly 100% of the machines).

Another thing is that if we go with page alignment, then the 256 byte
alignment can obviously be removed everywhere (hint: MAPD needs to
change).
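
(For context: ITS_ITT_ALIGN is 256 bytes, and the alignment is applied
when the ITT address is programmed via the MAPD command; the current
driver's its_build_mapd_cmd() has:

	itt_addr = virt_to_phys(desc->its_mapd_cmd.dev->itt);
	itt_addr = ALIGN(itt_addr, ITS_ITT_ALIGN);
	...
	its_encode_itt(cmd, itt_addr);

With page-sized, page-aligned ITT allocations the ALIGN() becomes a
no-op, which is why the 256-byte handling could be dropped.)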

Thanks,

	M.
Steven Price June 5, 2024, 3:08 p.m. UTC | #2
Hi Marc,

On 05/06/2024 14:39, Marc Zyngier wrote:
> The subject line is... odd. I'd expect something like:
> 
> "irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor"
> 
> because nothing here should be CCA specific.

Good point - that's a much better subject.

> On Wed, 05 Jun 2024 10:30:04 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> Within a realm guest the ITS is emulated by the host. This means the
>> allocations must have been made available to the host by a call to
>> set_memory_decrypted(). Introduce an allocation function which performs
>> this extra call.
> 
> This doesn't mention that this patch radically changes the allocation
> of some tables.

I guess that depends on your definition of radical, see below.

>>
>> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v2:
>>  * Drop 'shared' from the new its_xxx function names as they are used
>>    for non-realm guests too.
>>  * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
>>    should do the right thing.
>>  * Drop a pointless (void *) cast.
>> ---
>>  drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
>>  1 file changed, 67 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index 40ebf1726393..ca72f830f4cc 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -18,6 +18,7 @@
>>  #include <linux/irqdomain.h>
>>  #include <linux/list.h>
>>  #include <linux/log2.h>
>> +#include <linux/mem_encrypt.h>
>>  #include <linux/memblock.h>
>>  #include <linux/mm.h>
>>  #include <linux/msi.h>
>> @@ -27,6 +28,7 @@
>>  #include <linux/of_pci.h>
>>  #include <linux/of_platform.h>
>>  #include <linux/percpu.h>
>> +#include <linux/set_memory.h>
>>  #include <linux/slab.h>
>>  #include <linux/syscore_ops.h>
>>  
>> @@ -163,6 +165,7 @@ struct its_device {
>>  	struct its_node		*its;
>>  	struct event_lpi_map	event_map;
>>  	void			*itt;
>> +	u32			itt_order;
>>  	u32			nr_ites;
>>  	u32			device_id;
>>  	bool			shared;
>> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
>>  #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>>  #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
>>  
>> +static struct page *its_alloc_pages_node(int node, gfp_t gfp,
>> +					 unsigned int order)
>> +{
>> +	struct page *page;
>> +
>> +	page = alloc_pages_node(node, gfp, order);
>> +
>> +	if (page)
>> +		set_memory_decrypted((unsigned long)page_address(page),
>> +				     1 << order);
> 
> Please use BIT(order).

Sure.

>> +	return page;
>> +}
>> +
>> +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order)
>> +{
>> +	return its_alloc_pages_node(NUMA_NO_NODE, gfp, order);
>> +}
>> +
>> +static void its_free_pages(void *addr, unsigned int order)
>> +{
>> +	set_memory_encrypted((unsigned long)addr, 1 << order);
>> +	free_pages((unsigned long)addr, order);
>> +}
>> +
>>  /*
>>   * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
>>   * always have vSGIs mapped.
>> @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>>  {
>>  	struct page *prop_page;
>>  
>> -	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
>> +	prop_page = its_alloc_pages(gfp_flags,
>> +				    get_order(LPI_PROPBASE_SZ));
>>  	if (!prop_page)
>>  		return NULL;
>>  
>> @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>>  
>>  static void its_free_prop_table(struct page *prop_page)
>>  {
>> -	free_pages((unsigned long)page_address(prop_page),
>> -		   get_order(LPI_PROPBASE_SZ));
>> +	its_free_pages(page_address(prop_page),
>> +		       get_order(LPI_PROPBASE_SZ));
>>  }
>>  
>>  static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
>> @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>>  	}
>>  
>> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
>> +	page = its_alloc_pages_node(its->numa_node,
>> +				    GFP_KERNEL | __GFP_ZERO, order);
>>  	if (!page)
>>  		return -ENOMEM;
>>  
>> @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>>  		/* 52bit PA is supported only when PageSize=64K */
>>  		if (psz != SZ_64K) {
>>  			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
>> -			free_pages((unsigned long)base, order);
>> +			its_free_pages(base, order);
>>  			return -ENXIO;
>>  		}
>>  
>> @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>>  		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
>>  		       &its->phys_base, its_base_type_string[type],
>>  		       val, tmp);
>> -		free_pages((unsigned long)base, order);
>> +		its_free_pages(base, order);
>>  		return -ENXIO;
>>  	}
>>  
>> @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its)
>>  
>>  	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>>  		if (its->tables[i].base) {
>> -			free_pages((unsigned long)its->tables[i].base,
>> -				   its->tables[i].order);
>> +			its_free_pages(its->tables[i].base,
>> +				       its->tables[i].order);
>>  			its->tables[i].base = NULL;
>>  		}
>>  	}
>> @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
>>  
>>  	/* Allocate memory for 2nd level table */
>>  	if (!table[idx]) {
>> -		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz));
>> +		page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO,
>> +				       get_order(psz));
>>  		if (!page)
>>  			return false;
>>  
>> @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void)
>>  
>>  	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
>>  		 np, npg, psz, epp, esz);
>> -	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
>> +	page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO,
>> +			       get_order(np * PAGE_SIZE));
>>  	if (!page)
>>  		return -ENOMEM;
>>  
>> @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>>  {
>>  	struct page *pend_page;
>>  
>> -	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
>> -				get_order(LPI_PENDBASE_SZ));
>> +	pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO,
>> +				    get_order(LPI_PENDBASE_SZ));
>>  	if (!pend_page)
>>  		return NULL;
>>  
>> @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>>  
>>  static void its_free_pending_table(struct page *pt)
>>  {
>> -	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
>> +	its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ));
>>  }
>>  
>>  /*
>> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its,
>>  
>>  	/* Allocate memory for 2nd level table */
>>  	if (!table[idx]) {
>> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
>> -					get_order(baser->psz));
>> +		page = its_alloc_pages_node(its->numa_node,
>> +					    GFP_KERNEL | __GFP_ZERO,
>> +					    get_order(baser->psz));
>>  		if (!page)
>>  			return false;
>>  
>> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>>  	unsigned long *lpi_map = NULL;
>>  	unsigned long flags;
>>  	u16 *col_map = NULL;
>> +	struct page *page;
>>  	void *itt;
>> +	int itt_order;
>>  	int lpi_base;
>>  	int nr_lpis;
>>  	int nr_ites;
>> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>>  	if (WARN_ON(!is_power_of_2(nvecs)))
>>  		nvecs = roundup_pow_of_two(nvecs);
>>  
>> -	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
>>  	/*
>>  	 * Even if the device wants a single LPI, the ITT must be
>>  	 * sized as a power of two (and you need at least one bit...).
>> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>>  	nr_ites = max(2, nvecs);
>>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
>>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
>> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
>> +	itt_order = get_order(sz);
>> +	page = its_alloc_pages_node(its->numa_node,
>> +				    GFP_KERNEL | __GFP_ZERO,
>> +				    itt_order);
> 
> So we go from an allocation that was so far measured in *bytes* to
> something that is now at least a page. Per device. This seems a bit
> excessive to me, especially when it isn't conditioned on anything and
> is now imposed on all platforms, including the non-CCA systems (which
> are exactly 100% of the machines).

Catalin asked about this in v2:
https://lore.kernel.org/lkml/c329ae18-2b61-4851-8d6a-9e691a2007c8@arm.com/

To be honest, I don't have a great handle on how much memory is being
wasted here. Within the realm guest I was testing, this rounds up an
otherwise 511-byte allocation to a 4k page, and there are 3 of them,
which seems reasonable from a realm guest perspective.

I can see two options to improve here:

1. Add a !is_realm_world() check and return to the previous behaviour
when not running in a realm. It's ugly, and doesn't deal with any other
potential future memory encryption. cc_platform_has(CC_ATTR_MEM_ENCRYPT)
might be preferable? But this means no impact to non-realm guests (see
the sketch below).

2. Use a special (global) memory allocator that does the
set_memory_decrypted() dance on the pages that it allocates but allows
packing the allocations. I'm not aware of an existing kernel API for
this, so it's potentially quite a bit of code. The benefit is that it
reduces memory consumption in a realm guest, although fragmentation
still means we're likely to see a (small) growth.

Any thoughts on what you think would be best?
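
For illustration, option 1 could be as small as the sketch below (a
sketch only, keyed off cc_platform_has() rather than a realm-specific
test):

static struct page *its_alloc_pages_node(int node, gfp_t gfp,
					 unsigned int order)
{
	struct page *page = alloc_pages_node(node, gfp, order);

	/* Only share the pages when the memory is otherwise encrypted. */
	if (page && cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		set_memory_decrypted((unsigned long)page_address(page),
				     BIT(order));
	return page;
}

with the matching cc_platform_has() check in its_free_pages().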

> Another thing is that if we go with page alignment, then the 256 byte
> alignment can obviously be removed everywhere (hint: MAPD needs to
> change).

Ah, good point - I'll need to look into that; my GIC-foo isn't quite up
to speed here.

Thanks,

Steve
Marc Zyngier June 6, 2024, 10:17 a.m. UTC | #3
On Wed, 05 Jun 2024 16:08:49 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> Hi Marc,
> 
> On 05/06/2024 14:39, Marc Zyngier wrote:
> > The subject line is... odd. I'd expect something like:
> > 
> > "irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor"
> > 
> > because nothing here should be CCA specific.
> 
> Good point - that's a much better subject.
> 
> > On Wed, 05 Jun 2024 10:30:04 +0100,
> > Steven Price <steven.price@arm.com> wrote:
> >>
> >> Within a realm guest the ITS is emulated by the host. This means the
> >> allocations must have been made available to the host by a call to
> >> set_memory_decrypted(). Introduce an allocation function which performs
> >> this extra call.
> > 
> > This doesn't mention that this patch radically changes the allocation
> > of some tables.
> 
> I guess that depends on your definition of radical, see below.

It's election time; I'm all about making bold statements!

[...]

> >> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its,
> >>  
> >>  	/* Allocate memory for 2nd level table */
> >>  	if (!table[idx]) {
> >> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> >> -					get_order(baser->psz));
> >> +		page = its_alloc_pages_node(its->numa_node,
> >> +					    GFP_KERNEL | __GFP_ZERO,
> >> +					    get_order(baser->psz));
> >>  		if (!page)
> >>  			return false;
> >>  
> >> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> >>  	unsigned long *lpi_map = NULL;
> >>  	unsigned long flags;
> >>  	u16 *col_map = NULL;
> >> +	struct page *page;
> >>  	void *itt;
> >> +	int itt_order;
> >>  	int lpi_base;
> >>  	int nr_lpis;
> >>  	int nr_ites;
> >> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> >>  	if (WARN_ON(!is_power_of_2(nvecs)))
> >>  		nvecs = roundup_pow_of_two(nvecs);
> >>  
> >> -	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> >>  	/*
> >>  	 * Even if the device wants a single LPI, the ITT must be
> >>  	 * sized as a power of two (and you need at least one bit...).
> >> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> >>  	nr_ites = max(2, nvecs);
> >>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
> >>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> >> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
> >> +	itt_order = get_order(sz);
> >> +	page = its_alloc_pages_node(its->numa_node,
> >> +				    GFP_KERNEL | __GFP_ZERO,
> >> +				    itt_order);
> > 
> > So we go from an allocation that was so far measured in *bytes* to
> > something that is now at least a page. Per device. This seems a bit
> > excessive to me, especially when it isn't conditioned on anything and
> > is now imposed on all platforms, including the non-CCA systems (which
> > are exactly 100% of the machines).
> 
> Catalin asked about this in v2:
> https://lore.kernel.org/lkml/c329ae18-2b61-4851-8d6a-9e691a2007c8@arm.com/
> 
> To be honest, I don't have a great handle on how much memory is being
> wasted here. Within the realm guest I was testing, this rounds up an
> otherwise 511-byte allocation to a 4k page, and there are 3 of them,
> which seems reasonable from a realm guest perspective.

And not that reasonable on a smaller system, such as my own router VM
that has a whole lot of devices and very little memory. Not to mention
that while CCA is stuck with 4k pages (duh!), the world is moving
towards larger pages, meaning that this is wasting even more memory.

> 
> I can see two options to improve here:
> 
> 1. Add a !is_realm_world() check and return to the previous behaviour
> when not running in a realm. It's ugly, and doesn't deal with any other
> potential future memory encryption. cc_platform_has(CC_ATTR_MEM_ENCRYPT)
> might be preferable? But this means no impact to non-realm guests.

No, this is way too ugly, and doesn't help with things like pKVM.

> 
> 2. Use a special (global) memory allocator that does the
> set_memory_decrypted() dance on the pages that it allocates but allows
> packing the allocations. I'm not aware of an existing kernel API for
> this, so it's potentially quite a bit of code. The benefit is that it
> reduces memory consumption in a realm guest, although fragmentation
> still means we're likely to see a (small) growth.
> 
> Any thoughts on what you think would be best?

I would expect that something similar to kmem_cache could be of help,
only with the ability to deal with variable object sizes (in this
case: minimum of 256 bytes, in increments defined by the
implementation, and with a 256 byte alignment).

I don't think the ITS is particularly special here, and we should come
up with something that is generic enough to support sharing of
non-page-sized objects.
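
To sketch the shape of such an interface (purely illustrative, these
names don't exist anywhere):

	/*
	 * Hypothetical: a cache of host-shared objects of variable
	 * size, minimum 256 bytes and 256-byte aligned, whose backing
	 * pages are set_memory_decrypted() as they are added.
	 */
	struct shared_cache *shared_cache_create(size_t min, size_t align);
	void *shared_cache_alloc(struct shared_cache *sc, size_t size,
				 gfp_t gfp);
	void shared_cache_free(struct shared_cache *sc, void *obj,
			       size_t size);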

Thanks,

	M.
Catalin Marinas June 6, 2024, 6:38 p.m. UTC | #4
On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote:
> On Wed, 05 Jun 2024 16:08:49 +0100,
> Steven Price <steven.price@arm.com> wrote:
> > 2. Use a special (global) memory allocator that does the
> > set_memory_decrypted() dance on the pages that it allocates but allows
> > packing the allocations. I'm not aware of an existing kernel API for
> > this, so it's potentially quite a bit of code. The benefit is that it
> > reduces memory consumption in a realm guest, although fragmentation
> > still means we're likely to see a (small) growth.
> > 
> > Any thoughts on what you think would be best?
> 
> I would expect that something similar to kmem_cache could be of help,
> only with the ability to deal with variable object sizes (in this
> case: minimum of 256 bytes, in increments defined by the
> implementation, and with a 256 byte alignment).

Hmm, that's doable but not that easy to make generic. We'd need a new
class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only
decrypted pages together with a GFP_DECRYPTED flag or something to get
the slab allocator to go for these pages and avoid merging with other
caches. It would actually be the page allocator parsing this gfp flag,
probably in post_alloc_hook() to set the page decrypted and re-encrypt
it in free_pages_prepare(). A slight problem here is that free_pages()
doesn't get the gfp flag, so we'd need to store some bit in the page
flags. Maybe the flag is not that bad; do we have something like
page_to_phys() to give us the high IPA address for decrypted pages?

Similarly if we go for a kmem_cache (or a few for multiple sizes). One
can specify a constructor which could set the memory decrypted but
there's no destructor (and also the constructor is per object, not per
page, so we'd need some refcounting).

Another approach contained within the driver is to use mempool_create()
with our own _alloc_fn/_free_fn that sets the memory decrypted/encrypted
accordingly, though sub-page allocations need additional tracking. Also
that's fairly similar to kmem_cache, fixed size.
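
A sketch of that mempool variant (function names invented), leaking
pages whose encryption state is left indeterminate:

static void *decrypted_page_alloc(gfp_t gfp, void *pool_data)
{
	struct page *page = alloc_page(gfp | __GFP_ZERO);

	if (!page)
		return NULL;
	/* On failure the page state is unknown: leak it, don't reuse it. */
	if (set_memory_decrypted((unsigned long)page_address(page), 1))
		return NULL;
	return page_address(page);
}

static void decrypted_page_free(void *element, void *pool_data)
{
	if (!set_memory_encrypted((unsigned long)element, 1))
		free_page((unsigned long)element);
}

created with mempool_create(min_nr, decrypted_page_alloc,
decrypted_page_free, NULL); the sub-page packing on top of the
elements is the additional tracking mentioned above.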

Yet another option would be to wire it somehow in the DMA API but the
minimum allocation is already a page size, so we don't gain anything.

What gets somewhat closer to what we need is gen_pool. It can track
different sizes, we just need to populate the chunks as needed. I don't
think this would work as a generic allocator but may be good enough
within the ITS code.
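
For reference, gen_pool (lib/genalloc.c) hands out sub-page
allocations from chunks the caller adds, with a configurable minimum
allocation order; minimal usage looks roughly like:

	struct gen_pool *pool = gen_pool_create(ilog2(SZ_256),
						NUMA_NO_NODE);

	/* feed it a (decrypted) page... */
	gen_pool_add(pool, (unsigned long)page_address(page),
		     PAGE_SIZE, node);

	/* ...then carve out sub-page, 256-byte aligned objects */
	addr = gen_pool_alloc(pool, size);
	gen_pool_free(pool, addr, size);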

If there's a need for such generic allocations in other parts of the
kernel, my preference would be something around kmalloc caches and a new
GFP flag (first option; subject to the selling it to the mm folk). But
that's more of a separate prototyping effort that may or may not
succeed.

Anyway, we could do some hacking around gen_pool as a temporary solution
(maybe as a set of patches on top of this series to be easier to revert)
and start investigating a proper decrypted page allocator in parallel.
We just need to find a victim that has the page allocator fresh in mind
(Ryan or Alexandru ;)).
Steven Price June 7, 2024, 3:45 p.m. UTC | #5
On 06/06/2024 19:38, Catalin Marinas wrote:
> On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote:
>> On Wed, 05 Jun 2024 16:08:49 +0100,
>> Steven Price <steven.price@arm.com> wrote:
>>> 2. Use a special (global) memory allocator that does the
>>> set_memory_decrypted() dance on the pages that it allocates but allows
>>> packing the allocations. I'm not aware of an existing kernel API for
>>> this, so it's potentially quite a bit of code. The benefit is that it
>>> reduces memory consumption in a realm guest, although fragmentation
>>> still means we're likely to see a (small) growth.
>>>
>>> Any thoughts on what you think would be best?
>>
>> I would expect that something similar to kmem_cache could be of help,
>> only with the ability to deal with variable object sizes (in this
>> case: minimum of 256 bytes, in increments defined by the
>> implementation, and with a 256 byte alignment).
> 
> Hmm, that's doable but not that easy to make generic. We'd need a new
> class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only
> decrypted pages together with a GFP_DECRYPTED flag or something to get
> the slab allocator to go for these pages and avoid merging with other
> caches. It would actually be the page allocator parsing this gfp flag,
> probably in post_alloc_hook() to set the page decrypted and re-encrypt
> it in free_pages_prepare(). A slight problem here is that free_pages()
> doesn't get the gfp flag, so we'd need to store some bit in the page
> flags. Maybe the flag is not that bad, do we have something like for
> page_to_phys() to give us the high IPA address for decrypted pages?
> 
> Similarly if we go for a kmem_cache (or a few for multiple sizes). One
> can specify a constructor which could set the memory decrypted but
> there's no destructor (and also the constructor is per object, not per
> page, so we'd need some refcounting).
> 
> Another approach contained within the driver is to use mempool_create()
> with our own _alloc_fn/_free_fn that sets the memory decrypted/encrypted
> accordingly, though sub-page allocations need additional tracking. Also
> that's fairly similar to kmem_cache, fixed size.
> 
> Yet another option would be to wire it somehow in the DMA API but the
> minimum allocation is already a page size, so we don't gain anything.
> 
> What gets somewhat closer to what we need is gen_pool. It can track
> different sizes, we just need to populate the chunks as needed. I don't
> think this would work as a generic allocator but may be good enough
> within the ITS code.
> 
> If there's a need for such generic allocations in other parts of the
> kernel, my preference would be something around kmalloc caches and a new
> GFP flag (first option; subject to the selling it to the mm folk). But
> that's more of a separate prototyping effort that may or may not
> succeed.
> 
> Anyway, we could do some hacking around gen_pool as a temporary solution
> (maybe as a set of patches on top of this series to be easier to revert)
> and start investigating a proper decrypted page allocator in parallel.
> We just need to find a victim that has the page allocator fresh in mind
> (Ryan or Alexandru ;)).

Thanks for the suggestions Catalin. I had a go at implementing something
with gen_pool - the below (very lightly tested) hack seems to work. This
is on top of the current series.

I *think* it should also be safe to drop the whole alignment part with
this custom allocator, which could actually save memory. But I haven't
quite got my head around that yet.

Steve

----8<-----
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ca72f830f4cc..e78634d4d22c 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -12,6 +12,7 @@
 #include <linux/crash_dump.h>
 #include <linux/delay.h>
 #include <linux/efi.h>
+#include <linux/genalloc.h>
 #include <linux/interrupt.h>
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
@@ -165,7 +166,7 @@ struct its_device {
 	struct its_node		*its;
 	struct event_lpi_map	event_map;
 	void			*itt;
-	u32			itt_order;
+	u32			itt_sz;
 	u32			nr_ites;
 	u32			device_id;
 	bool			shared;
@@ -225,6 +226,50 @@ static void its_free_pages(void *addr, unsigned int order)
 	free_pages((unsigned long)addr, order);
 }
 
+static struct gen_pool *itt_pool;
+
+static void *itt_alloc_pool(int node, int size)
+{
+	unsigned long addr;
+	struct page *page;
+
+	if (size >= PAGE_SIZE) {
+		page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+					    get_order(size));
+		if (!page)
+			return NULL;
+		return page_address(page);
+	}
+
+	do {
+		addr = gen_pool_alloc(itt_pool, size);
+		if (addr)
+			break;
+
+		page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page)
+			break;
+
+		gen_pool_add(itt_pool, (unsigned long)page_address(page),
+			     PAGE_SIZE, node);
+	} while (!addr);
+
+	return (void *)addr;
+}
+
+static void itt_free_pool(void *addr, int size)
+{
+	if (!addr)
+		return;
+
+	if (size >= PAGE_SIZE) {
+		its_free_pages(addr, get_order(size));
+		return;
+	}
+
+	gen_pool_free(itt_pool, (unsigned long)addr, size);
+}
+
 /*
  * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
  * always have vSGIs mapped.
@@ -3450,9 +3495,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	unsigned long *lpi_map = NULL;
 	unsigned long flags;
 	u16 *col_map = NULL;
-	struct page *page;
 	void *itt;
-	int itt_order;
 	int lpi_base;
 	int nr_lpis;
 	int nr_ites;
@@ -3471,13 +3514,8 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	nr_ites = max(2, nvecs);
 	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt_order = get_order(sz);
-	page = its_alloc_pages_node(its->numa_node,
-				    GFP_KERNEL | __GFP_ZERO,
-				    itt_order);
-	if (!page)
-		return NULL;
-	itt = page_address(page);
+
+	itt = itt_alloc_pool(its->numa_node, sz);
 
 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 
@@ -3492,9 +3530,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 		lpi_base = 0;
 	}
 
-	if (!dev || !col_map || (!lpi_map && alloc_lpis)) {
+	if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
 		kfree(dev);
-		its_free_pages(itt, itt_order);
+		itt_free_pool(itt, sz);
 		bitmap_free(lpi_map);
 		kfree(col_map);
 		return NULL;
@@ -3504,7 +3542,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 
 	dev->its = its;
 	dev->itt = itt;
-	dev->itt_order = itt_order;
+	dev->itt_sz = sz;
 	dev->nr_ites = nr_ites;
 	dev->event_map.lpi_map = lpi_map;
 	dev->event_map.col_map = col_map;
@@ -3532,7 +3570,7 @@ static void its_free_device(struct its_device *its_dev)
 	list_del(&its_dev->entry);
 	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
 	kfree(its_dev->event_map.col_map);
-	its_free_pages(its_dev->itt, its_dev->itt_order);
+	itt_free_pool(its_dev->itt, its_dev->itt_sz);
 	kfree(its_dev);
 }
 
@@ -5722,6 +5760,10 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
 	bool has_v4_1 = false;
 	int err;
 
+	itt_pool = gen_pool_create(get_order(ITS_ITT_ALIGN), -1);
+	if (!itt_pool)
+		return -ENOMEM;
+
 	gic_rdists = rdists;
 
 	its_parent = parent_domain;
Catalin Marinas June 7, 2024, 4:46 p.m. UTC | #6
On Fri, Jun 07, 2024 at 04:45:14PM +0100, Steven Price wrote:
> On 06/06/2024 19:38, Catalin Marinas wrote:
> > Anyway, we could do some hacking around gen_pool as a temporary solution
> > (maybe as a set of patches on top of this series to be easier to revert)
> > and start investigating a proper decrypted page allocator in parallel.
> > We just need to find a victim that has the page allocator fresh in mind
> > (Ryan or Alexandru ;)).
> 
> Thanks for the suggestions Catalin. I had a go at implementing something
> with gen_pool - the below (very lightly tested) hack seems to work. This
> is on top of the current series.
> 
> I *think* it should also be safe to drop the whole alignment part with
> this custom allocator, which could actually save memory. But I haven't
> quite got my head around that yet.

Thanks Steven. It doesn't look too complex and it solves the memory
waste. We don't actually free the pages from gen_pool, but I don't
think it matters much; the memory would get reused if devices are
removed and re-added.
Catalin Marinas June 7, 2024, 5:55 p.m. UTC | #7
On Thu, Jun 06, 2024 at 07:38:08PM +0100, Catalin Marinas wrote:
> On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote:
> > On Wed, 05 Jun 2024 16:08:49 +0100,
> > Steven Price <steven.price@arm.com> wrote:
> > > 2. Use a special (global) memory allocator that does the
> > > set_memory_decrypted() dance on the pages that it allocates but allows
> > > packing the allocations. I'm not aware of an existing kernel API for
> > > this, so it's potentially quite a bit of code. The benefit is that it
> > > reduces memory consumption in a realm guest, although fragmentation
> > > still means we're likely to see a (small) growth.
> > > 
> > > Any thoughts on what you think would be best?
> > 
> > I would expect that something similar to kmem_cache could be of help,
> > only with the ability to deal with variable object sizes (in this
> > case: minimum of 256 bytes, in increments defined by the
> > implementation, and with a 256 byte alignment).
> 
> Hmm, that's doable but not that easy to make generic. We'd need a new
> class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only
> decrypted pages together with a GFP_DECRYPTED flag or something to get
> the slab allocator to go for these pages and avoid merging with other
> caches. It would actually be the page allocator parsing this gfp flag,
> probably in post_alloc_hook() to set the page decrypted and re-encrypt
> it in free_pages_prepare(). A slight problem here is that free_pages()
> doesn't get the gfp flag, so we'd need to store some bit in the page
> flags. Maybe the flag is not that bad; do we have something like
> page_to_phys() to give us the high IPA address for decrypted pages?

I had a go at this generic approach, Friday afternoon fun. Not tested
with the CCA patches, just my own fake set_memory_*() functionality on
top of 6.10-rc2. I also mindlessly added __GFP_DECRYPTED to the ITS code
following the changes in this patch. I noticed that
allocate_vpe_l2_table() doesn't use shared pages (I either missed it or
it's not needed).

If we want to go this way, I can tidy up the diff below, split it into a
proper series and add what's missing. What I can't tell is how a driver
writer knows when to pass __GFP_DECRYPTED. There's also a theoretical
overlap with __GFP_DMA; the code can't handle both. The DMA flag takes
priority currently, but I realised it probably needs to be the other way
around: we wouldn't have DMA mask restrictions for emulated devices.

--------------------8<----------------------------------
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 40ebf1726393..b6627e839e62 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2212,7 +2212,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 {
 	struct page *prop_page;
 
-	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
+	prop_page = alloc_pages(gfp_flags | __GFP_DECRYPTED,
+				get_order(LPI_PROPBASE_SZ));
 	if (!prop_page)
 		return NULL;
 
@@ -2346,7 +2347,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		order = get_order(GITS_BASER_PAGES_MAX * psz);
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
+	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
+				__GFP_DECRYPTED, order);
 	if (!page)
 		return -ENOMEM;
 
@@ -2940,7 +2942,8 @@ static int allocate_vpe_l1_table(void)
 
 	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
 		 np, npg, psz, epp, esz);
-	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
+	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO | __GFP_DECRYPTED,
+			   get_order(np * PAGE_SIZE));
 	if (!page)
 		return -ENOMEM;
 
@@ -2986,7 +2989,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 {
 	struct page *pend_page;
 
-	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
+	pend_page = alloc_pages(gfp_flags | __GFP_ZERO | __GFP_DECRYPTED,
 				get_order(LPI_PENDBASE_SZ));
 	if (!pend_page)
 		return NULL;
@@ -3334,7 +3337,8 @@ static bool its_alloc_table_entry(struct its_node *its,
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
+		page = alloc_pages_node(its->numa_node, GFP_KERNEL |
+					__GFP_ZERO | __GFP_DECRYPTED,
 					get_order(baser->psz));
 		if (!page)
 			return false;
@@ -3438,7 +3442,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	nr_ites = max(2, nvecs);
 	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
+	itt = kzalloc_node(sz, GFP_KERNEL | __GFP_DECRYPTED, its->numa_node);
 	if (alloc_lpis) {
 		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
 		if (lpi_map)
@@ -5131,8 +5135,8 @@ static int __init its_probe_one(struct its_node *its)
 		}
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-				get_order(ITS_CMD_QUEUE_SZ));
+	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
+				__GFP_DECRYPTED, get_order(ITS_CMD_QUEUE_SZ));
 	if (!page) {
 		err = -ENOMEM;
 		goto out_unmap_sgir;
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 313be4ad79fd..573989664639 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -57,6 +57,9 @@ enum {
 #endif
 #ifdef CONFIG_SLAB_OBJ_EXT
 	___GFP_NO_OBJ_EXT_BIT,
+#endif
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+	___GFP_DECRYPTED_BIT,
 #endif
 	___GFP_LAST_BIT
 };
@@ -103,6 +106,11 @@ enum {
 #else
 #define ___GFP_NO_OBJ_EXT       0
 #endif
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+#define ___GFP_DECRYPTED	BIT(___GFP_DECRYPTED_BIT)
+#else
+#define ___GFP_DECRYPTED	0
+#endif
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -117,6 +125,8 @@ enum {
 #define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
 #define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
 
+#define __GFP_DECRYPTED	((__force gfp_t)___GFP_DECRYPTED)
+
 /**
  * DOC: Page mobility and placement hints
  *
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 104078afe0b1..705707052274 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -127,6 +127,9 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+	PG_decrypted,
+#endif
 #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
 	PG_young,
 	PG_idle,
@@ -626,6 +629,15 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
 #endif
 
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+PAGEFLAG(Decrypted, decrypted, PF_HEAD)
+#define __PG_DECRYPTED	(1UL << PG_decrypted)
+#else
+PAGEFLAG_FALSE(Decrypted, decrypted)
+#define __PG_DECRYPTED	0
+#endif
+
+
 #ifdef CONFIG_PAGE_IDLE_FLAG
 #ifdef CONFIG_64BIT
 FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7247e217e21b..f7a2cf624c35 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -422,6 +422,9 @@ enum kmalloc_cache_type {
 #endif
 #ifdef CONFIG_MEMCG_KMEM
 	KMALLOC_CGROUP,
+#endif
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+	KMALLOC_DECRYPTED,
 #endif
 	NR_KMALLOC_TYPES
 };
@@ -433,7 +436,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
  * Define gfp bits that should not be set for KMALLOC_NORMAL.
  */
 #define KMALLOC_NOT_NORMAL_BITS					\
-	(__GFP_RECLAIMABLE |					\
+	(__GFP_RECLAIMABLE | __GFP_DECRYPTED |			\
 	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
 	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
 
@@ -458,11 +461,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne
 	 * At least one of the flags has to be set. Their priorities in
 	 * decreasing order are:
 	 *  1) __GFP_DMA
-	 *  2) __GFP_RECLAIMABLE
-	 *  3) __GFP_ACCOUNT
+	 *  2) __GFP_DECRYPTED
+	 *  3) __GFP_RECLAIMABLE
+	 *  4) __GFP_ACCOUNT
 	 */
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
 		return KMALLOC_DMA;
+	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (flags & __GFP_DECRYPTED))
+		return KMALLOC_DECRYPTED;
 	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
 		return KMALLOC_RECLAIM;
 	else
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index e46d6e82765e..a0879155f892 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -83,6 +83,12 @@
 #define IF_HAVE_PG_HWPOISON(_name)
 #endif
 
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+#define IF_HAVE_PG_DECRYPTED(_name) ,{1UL << PG_##_name, __stringify(_name)}
+#else
+#define IF_HAVE_PG_DECRYPTED(_name)
+#endif
+
 #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
 #define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)}
 #else
@@ -121,6 +127,7 @@
 IF_HAVE_PG_MLOCK(mlocked)						\
 IF_HAVE_PG_UNCACHED(uncached)						\
 IF_HAVE_PG_HWPOISON(hwpoison)						\
+IF_HAVE_PG_DECRYPTED(decrypted)						\
 IF_HAVE_PG_IDLE(idle)							\
 IF_HAVE_PG_IDLE(young)							\
 IF_HAVE_PG_ARCH_X(arch_2)						\
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e22ce5675ca..c93ae50ec402 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,6 +47,7 @@
 #include <linux/sched/mm.h>
 #include <linux/page_owner.h>
 #include <linux/page_table_check.h>
+#include <linux/set_memory.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
@@ -1051,6 +1052,12 @@ __always_inline bool free_pages_prepare(struct page *page,
 		return false;
 	}
 
+	if (unlikely(PageDecrypted(page))) {
+		set_memory_encrypted((unsigned long)page_address(page),
+				     1 << order);
+		ClearPageDecrypted(page);
+	}
+
 	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
 	/*
@@ -1415,6 +1422,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
 			!should_skip_init(gfp_flags);
 	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+	bool decrypted = gfp_flags & __GFP_DECRYPTED;
 	int i;
 
 	set_page_private(page, 0);
@@ -1465,6 +1473,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	if (init)
 		kernel_init_pages(page, 1 << order);
 
+	if (decrypted) {
+		set_memory_decrypted((unsigned long)page_address(page),
+				     1 << order);
+		SetPageDecrypted(page);
+	}
+
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
 	pgalloc_tag_add(page, current, 1 << order);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1560a1546bb1..de9c8c674aa1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -737,6 +737,12 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
 #define KMALLOC_RCL_NAME(sz)
 #endif
 
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+#define KMALLOC_DECRYPTED_NAME(sz)	.name[KMALLOC_DECRYPTED] = "kmalloc-decrypted-" #sz,
+#else
+#define KMALLOC_DECRYPTED_NAME(sz)
+#endif
+
 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
 #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
 #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
@@ -765,6 +771,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
 	KMALLOC_RCL_NAME(__short_size)				\
 	KMALLOC_CGROUP_NAME(__short_size)			\
 	KMALLOC_DMA_NAME(__short_size)				\
+	KMALLOC_DECRYPTED_NAME(__short_size)			\
 	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
 	.size = __size,						\
 }
@@ -889,6 +896,9 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
 	if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
 		flags |= SLAB_NO_MERGE;
 
+	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (type == KMALLOC_DECRYPTED))
+		flags |= SLAB_NO_MERGE;
+
 	if (minalign > ARCH_KMALLOC_MINALIGN) {
 		aligned_size = ALIGN(aligned_size, minalign);
 		aligned_idx = __kmalloc_index(aligned_size, false);
Michael Kelley June 17, 2024, 3:54 a.m. UTC | #8
From: Steven Price <steven.price@arm.com> Sent: Wednesday, June 5, 2024 2:30 AM
> 
> Within a realm guest the ITS is emulated by the host. This means the
> allocations must have been made available to the host by a call to
> set_memory_decrypted(). Introduce an allocation function which performs
> this extra call.
> 
> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v2:
>  * Drop 'shared' from the new its_xxx function names as they are used
>    for non-realm guests too.
>  * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
>    should do the right thing.
>  * Drop a pointless (void *) cast.
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
>  1 file changed, 67 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 40ebf1726393..ca72f830f4cc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -18,6 +18,7 @@
>  #include <linux/irqdomain.h>
>  #include <linux/list.h>
>  #include <linux/log2.h>
> +#include <linux/mem_encrypt.h>
>  #include <linux/memblock.h>
>  #include <linux/mm.h>
>  #include <linux/msi.h>
> @@ -27,6 +28,7 @@
>  #include <linux/of_pci.h>
>  #include <linux/of_platform.h>
>  #include <linux/percpu.h>
> +#include <linux/set_memory.h>
>  #include <linux/slab.h>
>  #include <linux/syscore_ops.h>
> 
> @@ -163,6 +165,7 @@ struct its_device {
>  	struct its_node		*its;
>  	struct event_lpi_map	event_map;
>  	void			*itt;
> +	u32			itt_order;
>  	u32			nr_ites;
>  	u32			device_id;
>  	bool			shared;
> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
>  #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>  #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
> 
> +static struct page *its_alloc_pages_node(int node, gfp_t gfp,
> +					 unsigned int order)
> +{
> +	struct page *page;
> +
> +	page = alloc_pages_node(node, gfp, order);
> +
> +	if (page)
> +		set_memory_decrypted((unsigned long)page_address(page),
> +				     1 << order);

There's been considerable discussion on the x86 side about
what to do when set_memory_decrypted() or set_memory_encrypted()
fails. The conclusions are:

1) set_memory_decrypted()/encrypted() *could* fail due to a
compromised/malicious host, due to a bug somewhere in the
software stack, or due to resource constraints (x86 might need to
split a large page mapping, and need to allocate additional page
table pages, which could fail).

2) The guest memory that was the target of such a failed call could
be left in an indeterminate state that the guest could not reliably
undo or correct. The guest's view of the page's decrypted/encrypted
state might not match the host's view. Therefore, any such guest
memory must be leaked rather than being used or put back on the
free list.

3) After some discussion, we decided not to panic in such a case.
Instead, set_memory_decrypted()/encrypted() generates a WARN,
as well as returns a failure result. The most security conscious
users could set panic_on_warn=1 in their VMs, and thereby stop
further operation if there any indication that the transition between
encrypted and decrypt is suspect. The caller of these functions
also can take explicit action in the case of a failure.

It seems like the same guidelines should apply here. On the x86
side we've also cleaned up cases where the return value isn't
checked, like here and the use of set_memory_encrypted() below.
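
Concretely, that would mean checking the return values in the new
helpers and leaking any pages left in an indeterminate state, roughly:

static struct page *its_alloc_pages_node(int node, gfp_t gfp,
					 unsigned int order)
{
	struct page *page = alloc_pages_node(node, gfp, order);

	if (!page)
		return NULL;

	if (set_memory_decrypted((unsigned long)page_address(page),
				 1 << order)) {
		/* The pages are in an unknown state: leak them. */
		return NULL;
	}
	return page;
}

static void its_free_pages(void *addr, unsigned int order)
{
	/* If re-encryption fails, leaking is the only safe option. */
	if (set_memory_encrypted((unsigned long)addr, 1 << order))
		return;
	free_pages((unsigned long)addr, order);
}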

Michael

> +	return page;
> +}
> +
> +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order)
> +{
> +	return its_alloc_pages_node(NUMA_NO_NODE, gfp, order);
> +}
> +
> +static void its_free_pages(void *addr, unsigned int order)
> +{
> +	set_memory_encrypted((unsigned long)addr, 1 << order);
> +	free_pages((unsigned long)addr, order);
> +}
> +
>  /*
>   * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
>   * always have vSGIs mapped.
> @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  {
>  	struct page *prop_page;
> 
> -	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
> +	prop_page = its_alloc_pages(gfp_flags,
> +				    get_order(LPI_PROPBASE_SZ));
>  	if (!prop_page)
>  		return NULL;
> 
> @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> 
>  static void its_free_prop_table(struct page *prop_page)
>  {
> -	free_pages((unsigned long)page_address(prop_page),
> -		   get_order(LPI_PROPBASE_SZ));
> +	its_free_pages(page_address(prop_page),
> +		       get_order(LPI_PROPBASE_SZ));
>  }
> 
>  static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
> @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>  	}
> 
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
> +	page = its_alloc_pages_node(its->numa_node,
> +				    GFP_KERNEL | __GFP_ZERO, order);
>  	if (!page)
>  		return -ENOMEM;
> 
> @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		/* 52bit PA is supported only when PageSize=64K */
>  		if (psz != SZ_64K) {
>  			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
> -			free_pages((unsigned long)base, order);
> +			its_free_pages(base, order);
>  			return -ENXIO;
>  		}
> 
> @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
>  		       &its->phys_base, its_base_type_string[type],
>  		       val, tmp);
> -		free_pages((unsigned long)base, order);
> +		its_free_pages(base, order);
>  		return -ENXIO;
>  	}
> 
> @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its)
> 
>  	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>  		if (its->tables[i].base) {
> -			free_pages((unsigned long)its->tables[i].base,
> -				   its->tables[i].order);
> +			its_free_pages(its->tables[i].base,
> +				       its->tables[i].order);
>  			its->tables[i].base = NULL;
>  		}
>  	}
> @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
> 
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz));
> +		page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO,
> +				       get_order(psz));
>  		if (!page)
>  			return false;
> 
> @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void)
> 
>  	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
>  		 np, npg, psz, epp, esz);
> -	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
> +	page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO,
> +			       get_order(np * PAGE_SIZE));
>  	if (!page)
>  		return -ENOMEM;
> 
> @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>  	struct page *pend_page;
> 
> -	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
> -				get_order(LPI_PENDBASE_SZ));
> +	pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO,
> +				    get_order(LPI_PENDBASE_SZ));
>  	if (!pend_page)
>  		return NULL;
> 
> @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
> 
>  static void its_free_pending_table(struct page *pt)
>  {
> -	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
> +	its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ));
>  }
> 
>  /*
> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its,
> 
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> -					get_order(baser->psz));
> +		page = its_alloc_pages_node(its->numa_node,
> +					    GFP_KERNEL | __GFP_ZERO,
> +					    get_order(baser->psz));
>  		if (!page)
>  			return false;
> 
> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	unsigned long *lpi_map = NULL;
>  	unsigned long flags;
>  	u16 *col_map = NULL;
> +	struct page *page;
>  	void *itt;
> +	int itt_order;
>  	int lpi_base;
>  	int nr_lpis;
>  	int nr_ites;
> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	if (WARN_ON(!is_power_of_2(nvecs)))
>  		nvecs = roundup_pow_of_two(nvecs);
> 
> -	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
>  	/*
>  	 * Even if the device wants a single LPI, the ITT must be
>  	 * sized as a power of two (and you need at least one bit...).
> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	nr_ites = max(2, nvecs);
>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
> +	itt_order = get_order(sz);
> +	page = its_alloc_pages_node(its->numa_node,
> +				    GFP_KERNEL | __GFP_ZERO,
> +				    itt_order);
> +	if (!page)
> +		return NULL;
> +	itt = page_address(page);
> +
> +	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +
>  	if (alloc_lpis) {
>  		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
>  		if (lpi_map)
> @@ -3450,9 +3492,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  		lpi_base = 0;
>  	}
> 
> -	if (!dev || !itt ||  !col_map || (!lpi_map && alloc_lpis)) {
> +	if (!dev || !col_map || (!lpi_map && alloc_lpis)) {
>  		kfree(dev);
> -		kfree(itt);
> +		its_free_pages(itt, itt_order);
>  		bitmap_free(lpi_map);
>  		kfree(col_map);
>  		return NULL;
> @@ -3462,6 +3504,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
> 
>  	dev->its = its;
>  	dev->itt = itt;
> +	dev->itt_order = itt_order;
>  	dev->nr_ites = nr_ites;
>  	dev->event_map.lpi_map = lpi_map;
>  	dev->event_map.col_map = col_map;
> @@ -3489,7 +3532,7 @@ static void its_free_device(struct its_device *its_dev)
>  	list_del(&its_dev->entry);
>  	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
>  	kfree(its_dev->event_map.col_map);
> -	kfree(its_dev->itt);
> +	its_free_pages(its_dev->itt, its_dev->itt_order);
>  	kfree(its_dev);
>  }
> 
> @@ -5131,8 +5174,9 @@ static int __init its_probe_one(struct its_node *its)
>  		}
>  	}
> 
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> -				get_order(ITS_CMD_QUEUE_SZ));
> +	page = its_alloc_pages_node(its->numa_node,
> +				    GFP_KERNEL | __GFP_ZERO,
> +				    get_order(ITS_CMD_QUEUE_SZ));
>  	if (!page) {
>  		err = -ENOMEM;
>  		goto out_unmap_sgir;
> @@ -5196,7 +5240,7 @@ static int __init its_probe_one(struct its_node *its)
>  out_free_tables:
>  	its_free_tables(its);
>  out_free_cmd:
> -	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
> +	its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
>  out_unmap_sgir:
>  	if (its->sgir_base)
>  		iounmap(its->sgir_base);
> --
> 2.34.1
>
Michael Kelley June 18, 2024, 4:04 p.m. UTC | #9
From: Catalin Marinas <catalin.marinas@arm.com> Sent: Friday, June 7, 2024 10:55 AM
> 
> On Thu, Jun 06, 2024 at 07:38:08PM +0100, Catalin Marinas wrote:
> > On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote:
> > > On Wed, 05 Jun 2024 16:08:49 +0100,
> > > Steven Price <steven.price@arm.com> wrote:
> > > > 2. Use a special (global) memory allocator that does the
> > > > set_memory_decrypted() dance on the pages that it allocates but allows
> > > > packing the allocations. I'm not aware of an existing kernel API for
> > > > this, so it's potentially quite a bit of code. The benefit is that it
> > > > reduces memory consumption in a realm guest, although fragmentation
> > > > still means we're likely to see a (small) growth.
> > > >
> > > > Any thoughts on what you think would be best?
> > >
> > > I would expect that something similar to kmem_cache could be of help,
> > > only with the ability to deal with variable object sizes (in this
> > > case: minimum of 256 bytes, in increments defined by the
> > > implementation, and with a 256 byte alignment).
> >
> > Hmm, that's doable but not that easy to make generic. We'd need a new
> > class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only
> > decrypted pages together with a GFP_DECRYPTED flag or something to get
> > the slab allocator to go for these pages and avoid merging with other
> > caches. It would actually be the page allocator parsing this gfp flag,
> > probably in post_alloc_hook() to set the page decrypted and re-encrypt
> > it in free_pages_prepare(). A slight problem here is that free_pages()
> > doesn't get the gfp flag, so we'd need to store some bit in the page
> > flags. Maybe the flag is not that bad. Do we have something like
> > page_to_phys() to give us the high IPA address for decrypted pages?
> 
> I had a go at this generic approach, Friday afternoon fun. Not tested
> with the CCA patches, just my own fake set_memory_*() functionality on
> top of 6.10-rc2. I also mindlessly added __GFP_DECRYPTED to the ITS code
> following the changes in this patch. I noticed that
> allocate_vpe_l2_table() doesn't use shared pages (I either missed it or
> it's not needed).
> 
> If we want to go this way, I can tidy up the diff below, split it into a
> proper series and add what's missing. What I can't tell is how a driver
> writer knows when to pass __GFP_DECRYPTED. There's also a theoretical
> overlap with __GFP_DMA; the allocator can't handle both at once. The
> DMA flag takes priority currently, but I realised it probably needs to
> be the other way around, since we wouldn't have DMA mask restrictions
> for emulated devices.

As someone who has spent time in the Linux code on the x86 side that
manages memory shared between the guest and host, I think a GFP_DECRYPTED
flag on the memory allocation calls would be very useful. Here are my
general comments, some of which are probably obvious, but I thought
I'd write them down anyway.

1) The impetus in this case is to efficiently allow sub-page allocations.
But on x86, there's also an issue in that the x86 direct map uses large
page mappings, which must be split if any 4K page in the large page
is decrypted. Ideally, we could cluster such decrypted pages into the
same large page(s) vs. just grabbing any 4K page, and reduce the
number of splits. I haven't looked at how feasible that is to implement
in the context of the existing allocator code, so this is aspirational.

2) GFP_DECRYPTED should work for the three memory allocation
interfaces and their variants: alloc_pages(), kmalloc(), and vmalloc().

3) The current paradigm is to allocate memory and call
set_memory_decrypted(). Then do the reverse when freeing memory.
Using GFP_DECRYPTED on the allocation, and having the re-encryption
done automatically when freeing certainly simplifies the call sites.
The error handling at the call sites is additional messiness that gets
much simpler.

4) GFP_DECRYPTED should be silently ignored when not running
in a CoCo VM. Drivers that need decrypted memory in a CoCo VM
always set this flag, and work normally as if the flag isn't set when
not in a CoCo VM. This is how set_memory_decrypted()/encrypted()
work today. Drivers call them in all cases, and they are no-ops
when not in a CoCo VM.

5) The implementation of GFP_DECRYPTED must correctly handle
errors from set_memory_decrypted()/encrypted() as I've described
separately on this thread.

Michael
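
To make point 3 concrete, here is a minimal before/after sketch of a
call site. The helper names are illustrative only, and __GFP_DECRYPTED
is assumed to behave as proposed in this thread (the page allocator
re-encrypts automatically on free):

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/set_memory.h>

	/* Today: two-step allocate-then-decrypt with awkward error handling. */
	static struct page *shared_alloc_today(int node, unsigned int order)
	{
		struct page *page;

		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, order);
		if (!page)
			return NULL;
		/* On failure the page is in an unknown state and must be leaked. */
		if (set_memory_decrypted((unsigned long)page_address(page),
					 1 << order))
			return NULL;
		return page;
	}

	/* Proposed: one call; free_pages_prepare() re-encrypts on free. */
	static struct page *shared_alloc_proposed(int node, unsigned int order)
	{
		return alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO |
					__GFP_DECRYPTED, order);
	}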

> 
> --------------------8<----------------------------------
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 40ebf1726393..b6627e839e62 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2212,7 +2212,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  {
>  	struct page *prop_page;
> 
> -	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
> +	prop_page = alloc_pages(gfp_flags | __GFP_DECRYPTED,
> +				get_order(LPI_PROPBASE_SZ));
>  	if (!prop_page)
>  		return NULL;
> 
> @@ -2346,7 +2347,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>  	}
> 
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
> +	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
> +				__GFP_DECRYPTED, order);
>  	if (!page)
>  		return -ENOMEM;
> 
> @@ -2940,7 +2942,8 @@ static int allocate_vpe_l1_table(void)
> 
>  	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
>  		 np, npg, psz, epp, esz);
> -	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
> +	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO | __GFP_DECRYPTED,
> +			   get_order(np * PAGE_SIZE));
>  	if (!page)
>  		return -ENOMEM;
> 
> @@ -2986,7 +2989,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>  	struct page *pend_page;
> 
> -	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
> +	pend_page = alloc_pages(gfp_flags | __GFP_ZERO | __GFP_DECRYPTED,
>  				get_order(LPI_PENDBASE_SZ));
>  	if (!pend_page)
>  		return NULL;
> @@ -3334,7 +3337,8 @@ static bool its_alloc_table_entry(struct its_node *its,
> 
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> +		page = alloc_pages_node(its->numa_node, GFP_KERNEL |
> +					__GFP_ZERO | __GFP_DECRYPTED,
>  					get_order(baser->psz));
>  		if (!page)
>  			return false;
> @@ -3438,7 +3442,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	nr_ites = max(2, nvecs);
>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
> +	itt = kzalloc_node(sz, GFP_KERNEL | __GFP_DECRYPTED, its->numa_node);
>  	if (alloc_lpis) {
>  		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
>  		if (lpi_map)
> @@ -5131,8 +5135,8 @@ static int __init its_probe_one(struct its_node *its)
>  		}
>  	}
> 
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> -				get_order(ITS_CMD_QUEUE_SZ));
> +	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
> +				__GFP_DECRYPTED, get_order(ITS_CMD_QUEUE_SZ));
>  	if (!page) {
>  		err = -ENOMEM;
>  		goto out_unmap_sgir;
> diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
> index 313be4ad79fd..573989664639 100644
> --- a/include/linux/gfp_types.h
> +++ b/include/linux/gfp_types.h
> @@ -57,6 +57,9 @@ enum {
>  #endif
>  #ifdef CONFIG_SLAB_OBJ_EXT
>  	___GFP_NO_OBJ_EXT_BIT,
> +#endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	___GFP_DECRYPTED_BIT,
>  #endif
>  	___GFP_LAST_BIT
>  };
> @@ -103,6 +106,11 @@ enum {
>  #else
>  #define ___GFP_NO_OBJ_EXT       0
>  #endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define ___GFP_DECRYPTED	BIT(___GFP_DECRYPTED_BIT)
> +#else
> +#define ___GFP_DECRYPTED	0
> +#endif
> 
>  /*
>   * Physical address zone modifiers (see linux/mmzone.h - low four bits)
> @@ -117,6 +125,8 @@ enum {
>  #define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
>  #define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
> 
> +#define __GFP_DECRYPTED	((__force gfp_t)___GFP_DECRYPTED)
> +
>  /**
>   * DOC: Page mobility and placement hints
>   *
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 104078afe0b1..705707052274 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -127,6 +127,9 @@ enum pageflags {
>  #ifdef CONFIG_MEMORY_FAILURE
>  	PG_hwpoison,		/* hardware poisoned page. Don't touch */
>  #endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	PG_decrypted,
> +#endif
>  #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
>  	PG_young,
>  	PG_idle,
> @@ -626,6 +629,15 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
>  #define __PG_HWPOISON 0
>  #endif
> 
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +PAGEFLAG(Decrypted, decrypted, PF_HEAD)
> +#define __PG_DECRYPTED	(1UL << PG_decrypted)
> +#else
> +PAGEFLAG_FALSE(Decrypted, decrypted)
> +#define __PG_DECRYPTED	0
> +#endif
> +
> +
>  #ifdef CONFIG_PAGE_IDLE_FLAG
>  #ifdef CONFIG_64BIT
>  FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 7247e217e21b..f7a2cf624c35 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -422,6 +422,9 @@ enum kmalloc_cache_type {
>  #endif
>  #ifdef CONFIG_MEMCG_KMEM
>  	KMALLOC_CGROUP,
> +#endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	KMALLOC_DECRYPTED,
>  #endif
>  	NR_KMALLOC_TYPES
>  };
> @@ -433,7 +436,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>   * Define gfp bits that should not be set for KMALLOC_NORMAL.
>   */
>  #define KMALLOC_NOT_NORMAL_BITS					\
> -	(__GFP_RECLAIMABLE |					\
> +	(__GFP_RECLAIMABLE | __GFP_DECRYPTED |			\
>  	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
>  	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
> 
> @@ -458,11 +461,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne
>  	 * At least one of the flags has to be set. Their priorities in
>  	 * decreasing order are:
>  	 *  1) __GFP_DMA
> -	 *  2) __GFP_RECLAIMABLE
> -	 *  3) __GFP_ACCOUNT
> +	 *  2) __GFP_DECRYPTED
> +	 *  3) __GFP_RECLAIMABLE
> +	 *  4) __GFP_ACCOUNT
>  	 */
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
>  		return KMALLOC_DMA;
> +	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (flags & __GFP_DECRYPTED))
> +		return KMALLOC_DECRYPTED;
>  	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
>  		return KMALLOC_RECLAIM;
>  	else
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index e46d6e82765e..a0879155f892 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -83,6 +83,12 @@
>  #define IF_HAVE_PG_HWPOISON(_name)
>  #endif
> 
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define IF_HAVE_PG_DECRYPTED(_name) ,{1UL << PG_##_name, __stringify(_name)}
> +#else
> +#define IF_HAVE_PG_DECRYPTED(_name)
> +#endif
> +
>  #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
>  #define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)}
>  #else
> @@ -121,6 +127,7 @@
>  IF_HAVE_PG_MLOCK(mlocked)						\
>  IF_HAVE_PG_UNCACHED(uncached)						\
>  IF_HAVE_PG_HWPOISON(hwpoison)						\
> +IF_HAVE_PG_DECRYPTED(decrypted)						\
>  IF_HAVE_PG_IDLE(idle)							\
>  IF_HAVE_PG_IDLE(young)							\
>  IF_HAVE_PG_ARCH_X(arch_2)						\
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2e22ce5675ca..c93ae50ec402 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched/mm.h>
>  #include <linux/page_owner.h>
>  #include <linux/page_table_check.h>
> +#include <linux/set_memory.h>
>  #include <linux/memcontrol.h>
>  #include <linux/ftrace.h>
>  #include <linux/lockdep.h>
> @@ -1051,6 +1052,12 @@ __always_inline bool free_pages_prepare(struct page *page,
>  		return false;
>  	}
> 
> +	if (unlikely(PageDecrypted(page))) {
> +		set_memory_encrypted((unsigned long)page_address(page),
> +				     1 << order);
> +		ClearPageDecrypted(page);
> +	}
> +
>  	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
> 
>  	/*
> @@ -1415,6 +1422,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
>  			!should_skip_init(gfp_flags);
>  	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
> +	bool decrypted = true; //gfp_flags & __GFP_DECRYPTED;
>  	int i;
> 
>  	set_page_private(page, 0);
> @@ -1465,6 +1473,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	if (init)
>  		kernel_init_pages(page, 1 << order);
> 
> +	if (decrypted) {
> +		set_memory_decrypted((unsigned long)page_address(page),
> +				     1 << order);
> +		SetPageDecrypted(page);
> +	}
> +
>  	set_page_owner(page, order, gfp_flags);
>  	page_table_check_alloc(page, order);
>  	pgalloc_tag_add(page, current, 1 << order);
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 1560a1546bb1..de9c8c674aa1 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -737,6 +737,12 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
>  #define KMALLOC_RCL_NAME(sz)
>  #endif
> 
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define KMALLOC_DECRYPTED_NAME(sz)	.name[KMALLOC_DECRYPTED] = "kmalloc-decrypted-" #sz,
> +#else
> +#define KMALLOC_DECRYPTED_NAME(sz)
> +#endif
> +
>  #ifdef CONFIG_RANDOM_KMALLOC_CACHES
>  #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
>  #define KMALLOC_RANDOM_NAME(N, sz)	__KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
> @@ -765,6 +771,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
>  	KMALLOC_RCL_NAME(__short_size)				\
>  	KMALLOC_CGROUP_NAME(__short_size)			\
>  	KMALLOC_DMA_NAME(__short_size)				\
> +	KMALLOC_DECRYPTED_NAME(__short_size)			\
>  	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
>  	.size = __size,						\
>  }
> @@ -889,6 +896,9 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
>  	if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
>  		flags |= SLAB_NO_MERGE;
> 
> +	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (type == KMALLOC_DECRYPTED))
> +		flags |= SLAB_NO_MERGE;
> +
>  	if (minalign > ARCH_KMALLOC_MINALIGN) {
>  		aligned_size = ALIGN(aligned_size, minalign);
>  		aligned_idx = __kmalloc_index(aligned_size, false);
Catalin Marinas June 21, 2024, 2:24 p.m. UTC | #10
Hi Michael,

All good points below, thanks.

On Tue, Jun 18, 2024 at 04:04:17PM +0000, Michael Kelley wrote:
> As someone who has spent time in the Linux code on the x86 side that
> manages memory shared between the guest and host, I think a GFP_DECRYPTED
> flag on the memory allocation calls would be very useful. Here are my
> general comments, some of which are probably obvious, but I thought
> I'd write them down anyway.
> 
> 1) The impetus in this case is to efficiently allow sub-page allocations.
> But on x86, there's also an issue in that the x86 direct map uses large
> page mappings, which must be split if any 4K page in the large page
> is decrypted. Ideally, we could cluster such decrypted pages into the
> same large page(s) vs. just grabbing any 4K page, and reduce the
> number of splits. I haven't looked at how feasible that is to implement
> in the context of the existing allocator code, so this is aspirational.

It might be doable, though it's a lot more intrusive than just a GFP
flag. However, such memory cannot be used for anything other than
sharing data with the host.

> 2) GFP_DECRYPTED should work for the three memory allocation
> interfaces and their variants: alloc_pages(), kmalloc(), and vmalloc().

Yes. There are two main aspects: (1) the page allocation + linear map
change and (2) additional mapping attributes like in the vmalloc() case.

> 3) The current paradigm is to allocate memory and call
> set_memory_decrypted(). Then do the reverse when freeing memory.
> Using GFP_DECRYPTED on the allocation, and having the re-encryption
> done automatically when freeing certainly simplifies the call sites.
> The error handling at the call sites is additional messiness that gets
> much simpler.
> 
> 4) GFP_DECRYPTED should be silently ignored when not running
> in a CoCo VM. Drivers that need decrypted memory in a CoCo VM
> always set this flag, and work normally as if the flag isn't set when
> not in a CoCo VM. This is how set_memory_decrypted()/encrypted()
> work today. Drivers call them in all cases, and they are no-ops
> when not in a CoCo VM.

Yes, this should be straightforward. Maybe simplified further to avoid
creating additional kmalloc() caches if these functions are not
supported, though we seem to be missing some generic
arch_has_mem_encrypt() API.

> 5) The implementation of GFP_DECRYPTED must correctly handle
> errors from set_memory_decrypted()/encrypted() as I've described
> separately on this thread.

Good point, I completely missed this aspect.

I'll prepare some patches and post them next week to get feedback from
the mm folk.

Thanks.
Steven Price June 28, 2024, 9:59 a.m. UTC | #11
On 17/06/2024 04:54, Michael Kelley wrote:
> From: Steven Price <steven.price@arm.com> Sent: Wednesday, June 5, 2024 2:30 AM
>>
>> Within a realm guest the ITS is emulated by the host. This means the
>> allocations must have been made available to the host by a call to
>> set_memory_decrypted(). Introduce an allocation function which performs
>> this extra call.
>>
>> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v2:
>>  * Drop 'shared' from the new its_xxx function names as they are used
>>    for non-realm guests too.
>>  * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
>>    should do the right thing.
>>  * Drop a pointless (void *) cast.
>> ---
>>  drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
>>  1 file changed, 67 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index 40ebf1726393..ca72f830f4cc 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -18,6 +18,7 @@
>>  #include <linux/irqdomain.h>
>>  #include <linux/list.h>
>>  #include <linux/log2.h>
>> +#include <linux/mem_encrypt.h>
>>  #include <linux/memblock.h>
>>  #include <linux/mm.h>
>>  #include <linux/msi.h>
>> @@ -27,6 +28,7 @@
>>  #include <linux/of_pci.h>
>>  #include <linux/of_platform.h>
>>  #include <linux/percpu.h>
>> +#include <linux/set_memory.h>
>>  #include <linux/slab.h>
>>  #include <linux/syscore_ops.h>
>>
>> @@ -163,6 +165,7 @@ struct its_device {
>>  	struct its_node		*its;
>>  	struct event_lpi_map	event_map;
>>  	void			*itt;
>> +	u32			itt_order;
>>  	u32			nr_ites;
>>  	u32			device_id;
>>  	bool			shared;
>> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
>>  #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>>  #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
>>
>> +static struct page *its_alloc_pages_node(int node, gfp_t gfp,
>> +					 unsigned int order)
>> +{
>> +	struct page *page;
>> +
>> +	page = alloc_pages_node(node, gfp, order);
>> +
>> +	if (page)
>> +		set_memory_decrypted((unsigned long)page_address(page),
>> +				     1 << order);
> 
> There's been considerable discussion on the x86 side about
> what to do when set_memory_decrypted() or set_memory_encrypted()
> fails. The conclusions are:
> 
> 1) set_memory_decrypted()/encrypted() *could* fail due to a
> compromised/malicious host, due to a bug somewhere in the
> software stack, or due to resource constraints (x86 might need to
> split a large page mapping, and need to allocate additional page
> table pages, which could fail).
> 
> 2) The guest memory that was the target of such a failed call could
> be left in an indeterminate state that the guest could not reliably
> undo or correct. The guest's view of the page's decrypted/encrypted
> state might not match the host's view. Therefore, any such guest
> memory must be leaked rather than being used or put back on the
> free list.
> 
> 3) After some discussion, we decided not to panic in such a case.
> Instead, set_memory_decrypted()/encrypted() generates a WARN,
> as well as returns a failure result. The most security conscious
> users could set panic_on_warn=1 in their VMs, and thereby stop
> further operation if there any indication that the transition between
> encrypted and decrypt is suspect. The caller of these functions
> also can take explicit action in the case of a failure.
> 
> It seems like the same guidelines should apply here. On the x86
> side we've also cleaned up cases where the return value isn't
> checked, like here and the use of set_memory_encrypted() below.

Very good points - this code was lacking error handling. I think you are
also right that the correct response when set_memory_{en,de}crypted()
fails is to WARN() and leak the page. Such a failure shouldn't happen
with a well-behaved host, and it's unclear how to safely recover the
page - so leaking it is the safest outcome. The WARN() approach also
lets the user decide, via panic_on_warn, whether this is fatal.

Thanks,
Steve
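
Putting the two together, the allocation helpers from the patch below
combined with the WARN()-and-leak handling discussed above might look
roughly like this (a sketch, not the final code):

	static struct page *its_alloc_pages_node(int node, gfp_t gfp,
						 unsigned int order)
	{
		struct page *page;
		int ret;

		page = alloc_pages_node(node, gfp, order);
		if (!page)
			return NULL;

		ret = set_memory_decrypted((unsigned long)page_address(page),
					   1 << order);
		/*
		 * A failed transition leaves the page in an indeterminate
		 * encryption state, so it can be neither reused nor freed:
		 * WARN and leak it. panic_on_warn=1 makes this fatal.
		 */
		if (WARN_ON(ret))
			return NULL;

		return page;
	}

	static void its_free_pages(void *addr, unsigned int order)
	{
		/*
		 * If the memory cannot be re-encrypted it must not be
		 * returned to the free list, so leak it instead.
		 */
		if (WARN_ON(set_memory_encrypted((unsigned long)addr,
						 1 << order)))
			return;
		free_pages((unsigned long)addr, order);
	}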

Patch

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 40ebf1726393..ca72f830f4cc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -18,6 +18,7 @@ 
 #include <linux/irqdomain.h>
 #include <linux/list.h>
 #include <linux/log2.h>
+#include <linux/mem_encrypt.h>
 #include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/msi.h>
@@ -27,6 +28,7 @@ 
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/percpu.h>
+#include <linux/set_memory.h>
 #include <linux/slab.h>
 #include <linux/syscore_ops.h>
 
@@ -163,6 +165,7 @@  struct its_device {
 	struct its_node		*its;
 	struct event_lpi_map	event_map;
 	void			*itt;
+	u32			itt_order;
 	u32			nr_ites;
 	u32			device_id;
 	bool			shared;
@@ -198,6 +201,30 @@  static DEFINE_IDA(its_vpeid_ida);
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
 #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
 
+static struct page *its_alloc_pages_node(int node, gfp_t gfp,
+					 unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp, order);
+
+	if (page)
+		set_memory_decrypted((unsigned long)page_address(page),
+				     1 << order);
+	return page;
+}
+
+static struct page *its_alloc_pages(gfp_t gfp, unsigned int order)
+{
+	return its_alloc_pages_node(NUMA_NO_NODE, gfp, order);
+}
+
+static void its_free_pages(void *addr, unsigned int order)
+{
+	set_memory_encrypted((unsigned long)addr, 1 << order);
+	free_pages((unsigned long)addr, order);
+}
+
 /*
  * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
  * always have vSGIs mapped.
@@ -2212,7 +2239,8 @@  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 {
 	struct page *prop_page;
 
-	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
+	prop_page = its_alloc_pages(gfp_flags,
+				    get_order(LPI_PROPBASE_SZ));
 	if (!prop_page)
 		return NULL;
 
@@ -2223,8 +2251,8 @@  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 
 static void its_free_prop_table(struct page *prop_page)
 {
-	free_pages((unsigned long)page_address(prop_page),
-		   get_order(LPI_PROPBASE_SZ));
+	its_free_pages(page_address(prop_page),
+		       get_order(LPI_PROPBASE_SZ));
 }
 
 static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
@@ -2346,7 +2374,8 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		order = get_order(GITS_BASER_PAGES_MAX * psz);
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO, order);
 	if (!page)
 		return -ENOMEM;
 
@@ -2359,7 +2388,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		/* 52bit PA is supported only when PageSize=64K */
 		if (psz != SZ_64K) {
 			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
-			free_pages((unsigned long)base, order);
+			its_free_pages(base, order);
 			return -ENXIO;
 		}
 
@@ -2415,7 +2444,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
 		       &its->phys_base, its_base_type_string[type],
 		       val, tmp);
-		free_pages((unsigned long)base, order);
+		its_free_pages(base, order);
 		return -ENXIO;
 	}
 
@@ -2554,8 +2583,8 @@  static void its_free_tables(struct its_node *its)
 
 	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
 		if (its->tables[i].base) {
-			free_pages((unsigned long)its->tables[i].base,
-				   its->tables[i].order);
+			its_free_pages(its->tables[i].base,
+				       its->tables[i].order);
 			its->tables[i].base = NULL;
 		}
 	}
@@ -2821,7 +2850,8 @@  static bool allocate_vpe_l2_table(int cpu, u32 id)
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz));
+		page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO,
+				       get_order(psz));
 		if (!page)
 			return false;
 
@@ -2940,7 +2970,8 @@  static int allocate_vpe_l1_table(void)
 
 	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
 		 np, npg, psz, epp, esz);
-	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
+	page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO,
+			       get_order(np * PAGE_SIZE));
 	if (!page)
 		return -ENOMEM;
 
@@ -2986,8 +3017,8 @@  static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 {
 	struct page *pend_page;
 
-	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
-				get_order(LPI_PENDBASE_SZ));
+	pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO,
+				    get_order(LPI_PENDBASE_SZ));
 	if (!pend_page)
 		return NULL;
 
@@ -2999,7 +3030,7 @@  static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 
 static void its_free_pending_table(struct page *pt)
 {
-	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
+	its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ));
 }
 
 /*
@@ -3334,8 +3365,9 @@  static bool its_alloc_table_entry(struct its_node *its,
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-					get_order(baser->psz));
+		page = its_alloc_pages_node(its->numa_node,
+					    GFP_KERNEL | __GFP_ZERO,
+					    get_order(baser->psz));
 		if (!page)
 			return false;
 
@@ -3418,7 +3450,9 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	unsigned long *lpi_map = NULL;
 	unsigned long flags;
 	u16 *col_map = NULL;
+	struct page *page;
 	void *itt;
+	int itt_order;
 	int lpi_base;
 	int nr_lpis;
 	int nr_ites;
@@ -3430,7 +3464,6 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	if (WARN_ON(!is_power_of_2(nvecs)))
 		nvecs = roundup_pow_of_two(nvecs);
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	/*
 	 * Even if the device wants a single LPI, the ITT must be
 	 * sized as a power of two (and you need at least one bit...).
@@ -3438,7 +3471,16 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	nr_ites = max(2, nvecs);
 	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
+	itt_order = get_order(sz);
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO,
+				    itt_order);
+	if (!page)
+		return NULL;
+	itt = page_address(page);
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
 	if (alloc_lpis) {
 		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
 		if (lpi_map)
@@ -3450,9 +3492,9 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 		lpi_base = 0;
 	}
 
-	if (!dev || !itt ||  !col_map || (!lpi_map && alloc_lpis)) {
+	if (!dev || !col_map || (!lpi_map && alloc_lpis)) {
 		kfree(dev);
-		kfree(itt);
+		its_free_pages(itt, itt_order);
 		bitmap_free(lpi_map);
 		kfree(col_map);
 		return NULL;
@@ -3462,6 +3504,7 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 
 	dev->its = its;
 	dev->itt = itt;
+	dev->itt_order = itt_order;
 	dev->nr_ites = nr_ites;
 	dev->event_map.lpi_map = lpi_map;
 	dev->event_map.col_map = col_map;
@@ -3489,7 +3532,7 @@  static void its_free_device(struct its_device *its_dev)
 	list_del(&its_dev->entry);
 	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
 	kfree(its_dev->event_map.col_map);
-	kfree(its_dev->itt);
+	its_free_pages(its_dev->itt, its_dev->itt_order);
 	kfree(its_dev);
 }
 
@@ -5131,8 +5174,9 @@  static int __init its_probe_one(struct its_node *its)
 		}
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-				get_order(ITS_CMD_QUEUE_SZ));
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO,
+				    get_order(ITS_CMD_QUEUE_SZ));
 	if (!page) {
 		err = -ENOMEM;
 		goto out_unmap_sgir;
@@ -5196,7 +5240,7 @@  static int __init its_probe_one(struct its_node *its)
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
-	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
+	its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
 out_unmap_sgir:
 	if (its->sgir_base)
 		iounmap(its->sgir_base);