Message ID | 20240605093006.145492-13-steven.price@arm.com (mailing list archive)
---|---
State | New, archived
Series | arm64: Support for running as a guest in Arm CCA
The subject line is... odd. I'd expect something like: "irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor" because nothing here should be CCA specific. On Wed, 05 Jun 2024 10:30:04 +0100, Steven Price <steven.price@arm.com> wrote: > > Within a realm guest the ITS is emulated by the host. This means the > allocations must have been made available to the host by a call to > set_memory_decrypted(). Introduce an allocation function which performs > this extra call. This doesn't mention that this patch radically changes the allocation of some tables. > > Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com> > Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com> > Signed-off-by: Steven Price <steven.price@arm.com> > --- > Changes since v2: > * Drop 'shared' from the new its_xxx function names as they are used > for non-realm guests too. > * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node() > should do the right thing. > * Drop a pointless (void *) cast. > --- > drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++-------- > 1 file changed, 67 insertions(+), 23 deletions(-) > > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c > index 40ebf1726393..ca72f830f4cc 100644 > --- a/drivers/irqchip/irq-gic-v3-its.c > +++ b/drivers/irqchip/irq-gic-v3-its.c > @@ -18,6 +18,7 @@ > #include <linux/irqdomain.h> > #include <linux/list.h> > #include <linux/log2.h> > +#include <linux/mem_encrypt.h> > #include <linux/memblock.h> > #include <linux/mm.h> > #include <linux/msi.h> > @@ -27,6 +28,7 @@ > #include <linux/of_pci.h> > #include <linux/of_platform.h> > #include <linux/percpu.h> > +#include <linux/set_memory.h> > #include <linux/slab.h> > #include <linux/syscore_ops.h> > > @@ -163,6 +165,7 @@ struct its_device { > struct its_node *its; > struct event_lpi_map event_map; > void *itt; > + u32 itt_order; > u32 nr_ites; > u32 device_id; > bool shared; > @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida); > #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) > #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) > > +static struct page *its_alloc_pages_node(int node, gfp_t gfp, > + unsigned int order) > +{ > + struct page *page; > + > + page = alloc_pages_node(node, gfp, order); > + > + if (page) > + set_memory_decrypted((unsigned long)page_address(page), > + 1 << order); Please use BIT(order). > + return page; > +} > + > +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order) > +{ > + return its_alloc_pages_node(NUMA_NO_NODE, gfp, order); > +} > + > +static void its_free_pages(void *addr, unsigned int order) > +{ > + set_memory_encrypted((unsigned long)addr, 1 << order); > + free_pages((unsigned long)addr, order); > +} > + > /* > * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we > * always have vSGIs mapped. 
> @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) > { > struct page *prop_page; > > - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); > + prop_page = its_alloc_pages(gfp_flags, > + get_order(LPI_PROPBASE_SZ)); > if (!prop_page) > return NULL; > > @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) > > static void its_free_prop_table(struct page *prop_page) > { > - free_pages((unsigned long)page_address(prop_page), > - get_order(LPI_PROPBASE_SZ)); > + its_free_pages(page_address(prop_page), > + get_order(LPI_PROPBASE_SZ)); > } > > static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) > @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > order = get_order(GITS_BASER_PAGES_MAX * psz); > } > > - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, order); > if (!page) > return -ENOMEM; > > @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > /* 52bit PA is supported only when PageSize=64K */ > if (psz != SZ_64K) { > pr_err("ITS: no 52bit PA support when psz=%d\n", psz); > - free_pages((unsigned long)base, order); > + its_free_pages(base, order); > return -ENXIO; > } > > @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", > &its->phys_base, its_base_type_string[type], > val, tmp); > - free_pages((unsigned long)base, order); > + its_free_pages(base, order); > return -ENXIO; > } > > @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its) > > for (i = 0; i < GITS_BASER_NR_REGS; i++) { > if (its->tables[i].base) { > - free_pages((unsigned long)its->tables[i].base, > - its->tables[i].order); > + its_free_pages(its->tables[i].base, > + its->tables[i].order); > its->tables[i].base = NULL; > } > } > @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) > > /* Allocate memory for 2nd level table */ > if (!table[idx]) { > - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); > + page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO, > + get_order(psz)); > if (!page) > return false; > > @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void) > > pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", > np, npg, psz, epp, esz); > - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); > + page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO, > + get_order(np * PAGE_SIZE)); > if (!page) > return -ENOMEM; > > @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) > { > struct page *pend_page; > > - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, > - get_order(LPI_PENDBASE_SZ)); > + pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO, > + get_order(LPI_PENDBASE_SZ)); > if (!pend_page) > return NULL; > > @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) > > static void its_free_pending_table(struct page *pt) > { > - free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ)); > + its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ)); > } > > /* > @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its, > > /* Allocate memory for 2nd level table */ > if (!table[idx]) { > - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > - get_order(baser->psz)); > + 
page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, > + get_order(baser->psz)); > if (!page) > return false; > > @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > unsigned long *lpi_map = NULL; > unsigned long flags; > u16 *col_map = NULL; > + struct page *page; > void *itt; > + int itt_order; > int lpi_base; > int nr_lpis; > int nr_ites; > @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > if (WARN_ON(!is_power_of_2(nvecs))) > nvecs = roundup_pow_of_two(nvecs); > > - dev = kzalloc(sizeof(*dev), GFP_KERNEL); > /* > * Even if the device wants a single LPI, the ITT must be > * sized as a power of two (and you need at least one bit...). > @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > nr_ites = max(2, nvecs); > sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); > sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; > - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); > + itt_order = get_order(sz); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, > + itt_order); So we go from an allocation that was so far measured in *bytes* to something that is now at least a page. Per device. This seems a bit excessive to me, specially when it isn't conditioned on anything and is now imposed on all platforms, including the non-CCA systems (which are exactly 100% of the machines). Another thing is that if we go with page alignment, then the 256 byte alignment can obviously be removed everywhere (hint: MAPD needs to change). Thanks, M.
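For reference, a minimal sketch of the allocation helper with the BIT(order) suggestion folded in (a sketch against the posted patch, not code from the series; set_memory_decrypted() takes a page count, which for an order-N block is BIT(order) pages):

static struct page *its_alloc_pages_node(int node, gfp_t gfp,
					 unsigned int order)
{
	struct page *page;

	page = alloc_pages_node(node, gfp, order);

	/* set_memory_decrypted() wants a number of pages, i.e. BIT(order) */
	if (page)
		set_memory_decrypted((unsigned long)page_address(page),
				     BIT(order));
	return page;
}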
Hi Marc, On 05/06/2024 14:39, Marc Zyngier wrote: > The subject line is... odd. I'd expect something like: > > "irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor" > > because nothing here should be CCA specific. Good point - that's a much better subject. > On Wed, 05 Jun 2024 10:30:04 +0100, > Steven Price <steven.price@arm.com> wrote: >> >> Within a realm guest the ITS is emulated by the host. This means the >> allocations must have been made available to the host by a call to >> set_memory_decrypted(). Introduce an allocation function which performs >> this extra call. > > This doesn't mention that this patch radically changes the allocation > of some tables. I guess that depends on your definition of radical, see below. >> >> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com> >> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com> >> Signed-off-by: Steven Price <steven.price@arm.com> >> --- >> Changes since v2: >> * Drop 'shared' from the new its_xxx function names as they are used >> for non-realm guests too. >> * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node() >> should do the right thing. >> * Drop a pointless (void *) cast. >> --- >> drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++-------- >> 1 file changed, 67 insertions(+), 23 deletions(-) >> >> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c >> index 40ebf1726393..ca72f830f4cc 100644 >> --- a/drivers/irqchip/irq-gic-v3-its.c >> +++ b/drivers/irqchip/irq-gic-v3-its.c >> @@ -18,6 +18,7 @@ >> #include <linux/irqdomain.h> >> #include <linux/list.h> >> #include <linux/log2.h> >> +#include <linux/mem_encrypt.h> >> #include <linux/memblock.h> >> #include <linux/mm.h> >> #include <linux/msi.h> >> @@ -27,6 +28,7 @@ >> #include <linux/of_pci.h> >> #include <linux/of_platform.h> >> #include <linux/percpu.h> >> +#include <linux/set_memory.h> >> #include <linux/slab.h> >> #include <linux/syscore_ops.h> >> >> @@ -163,6 +165,7 @@ struct its_device { >> struct its_node *its; >> struct event_lpi_map event_map; >> void *itt; >> + u32 itt_order; >> u32 nr_ites; >> u32 device_id; >> bool shared; >> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida); >> #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) >> #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) >> >> +static struct page *its_alloc_pages_node(int node, gfp_t gfp, >> + unsigned int order) >> +{ >> + struct page *page; >> + >> + page = alloc_pages_node(node, gfp, order); >> + >> + if (page) >> + set_memory_decrypted((unsigned long)page_address(page), >> + 1 << order); > > Please use BIT(order). Sure. >> + return page; >> +} >> + >> +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order) >> +{ >> + return its_alloc_pages_node(NUMA_NO_NODE, gfp, order); >> +} >> + >> +static void its_free_pages(void *addr, unsigned int order) >> +{ >> + set_memory_encrypted((unsigned long)addr, 1 << order); >> + free_pages((unsigned long)addr, order); >> +} >> + >> /* >> * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we >> * always have vSGIs mapped. 
>> @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) >> { >> struct page *prop_page; >> >> - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); >> + prop_page = its_alloc_pages(gfp_flags, >> + get_order(LPI_PROPBASE_SZ)); >> if (!prop_page) >> return NULL; >> >> @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) >> >> static void its_free_prop_table(struct page *prop_page) >> { >> - free_pages((unsigned long)page_address(prop_page), >> - get_order(LPI_PROPBASE_SZ)); >> + its_free_pages(page_address(prop_page), >> + get_order(LPI_PROPBASE_SZ)); >> } >> >> static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) >> @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> order = get_order(GITS_BASER_PAGES_MAX * psz); >> } >> >> - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); >> + page = its_alloc_pages_node(its->numa_node, >> + GFP_KERNEL | __GFP_ZERO, order); >> if (!page) >> return -ENOMEM; >> >> @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> /* 52bit PA is supported only when PageSize=64K */ >> if (psz != SZ_64K) { >> pr_err("ITS: no 52bit PA support when psz=%d\n", psz); >> - free_pages((unsigned long)base, order); >> + its_free_pages(base, order); >> return -ENXIO; >> } >> >> @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", >> &its->phys_base, its_base_type_string[type], >> val, tmp); >> - free_pages((unsigned long)base, order); >> + its_free_pages(base, order); >> return -ENXIO; >> } >> >> @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its) >> >> for (i = 0; i < GITS_BASER_NR_REGS; i++) { >> if (its->tables[i].base) { >> - free_pages((unsigned long)its->tables[i].base, >> - its->tables[i].order); >> + its_free_pages(its->tables[i].base, >> + its->tables[i].order); >> its->tables[i].base = NULL; >> } >> } >> @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) >> >> /* Allocate memory for 2nd level table */ >> if (!table[idx]) { >> - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); >> + page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO, >> + get_order(psz)); >> if (!page) >> return false; >> >> @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void) >> >> pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", >> np, npg, psz, epp, esz); >> - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); >> + page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO, >> + get_order(np * PAGE_SIZE)); >> if (!page) >> return -ENOMEM; >> >> @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) >> { >> struct page *pend_page; >> >> - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, >> - get_order(LPI_PENDBASE_SZ)); >> + pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO, >> + get_order(LPI_PENDBASE_SZ)); >> if (!pend_page) >> return NULL; >> >> @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) >> >> static void its_free_pending_table(struct page *pt) >> { >> - free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ)); >> + its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ)); >> } >> >> /* >> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its, >> >> /* Allocate memory for 2nd level table */ >> if 
(!table[idx]) { >> - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >> - get_order(baser->psz)); >> + page = its_alloc_pages_node(its->numa_node, >> + GFP_KERNEL | __GFP_ZERO, >> + get_order(baser->psz)); >> if (!page) >> return false; >> >> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >> unsigned long *lpi_map = NULL; >> unsigned long flags; >> u16 *col_map = NULL; >> + struct page *page; >> void *itt; >> + int itt_order; >> int lpi_base; >> int nr_lpis; >> int nr_ites; >> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >> if (WARN_ON(!is_power_of_2(nvecs))) >> nvecs = roundup_pow_of_two(nvecs); >> >> - dev = kzalloc(sizeof(*dev), GFP_KERNEL); >> /* >> * Even if the device wants a single LPI, the ITT must be >> * sized as a power of two (and you need at least one bit...). >> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >> nr_ites = max(2, nvecs); >> sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); >> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; >> - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); >> + itt_order = get_order(sz); >> + page = its_alloc_pages_node(its->numa_node, >> + GFP_KERNEL | __GFP_ZERO, >> + itt_order); > > So we go from an allocation that was so far measured in *bytes* to > something that is now at least a page. Per device. This seems a bit > excessive to me, specially when it isn't conditioned on anything and > is now imposed on all platforms, including the non-CCA systems (which > are exactly 100% of the machines). Catalin asked about this in v2: https://lore.kernel.org/lkml/c329ae18-2b61-4851-8d6a-9e691a2007c8@arm.com/ To be honest, I don't have a great handle on how much memory is being wasted here. Within the realm guest I was testing this is rounding up an otherwise 511 byte allocation to a 4k page, and there are 3 of them. Which seems reasonable from a realm guest perspective. I can see two options to improve here: 1. Add a !is_realm_world() check and return to the previous behaviour when not running in a realm. It's ugly, and doesn't deal with any other potential future memory encryption. cc_platform_has(CC_ATTR_MEM_ENCRYPT) might be preferable? But this means no impact to non-realm guests. 2. Use a special (global) memory allocator that does the set_memory_decrypted() dance on the pages that it allocates but allows packing the allocations. I'm not aware of an existing kernel API for this, so it's potentially quite a bit of code. The benefit is that it reduces memory consumption in a realm guest, although fragmentation still means we're likely to see a (small) growth. Any thoughts on what you think would be best? > Another thing is that if we go with page alignment, then the 256 byte > alignment can obviously be removed everywhere (hint: MAPD needs to > change). Ah, good point - I'll need to look into that, my GIC-foo isn't quite up to speed on that. Thanks, Steve
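To make option 1 concrete, a rough sketch of what the conditional ITT allocation could look like; this is hypothetical and assumes cc_platform_has(CC_ATTR_MEM_ENCRYPT) (from <linux/cc_platform.h>) returns true inside a realm guest:

	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
		/* Shared with the host: must be page-granular. */
		page = its_alloc_pages_node(its->numa_node,
					    GFP_KERNEL | __GFP_ZERO,
					    get_order(sz));
		itt = page ? page_address(page) : NULL;
	} else {
		/* Nothing to share: keep the byte-granular allocation. */
		itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
	}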
On Wed, 05 Jun 2024 16:08:49 +0100, Steven Price <steven.price@arm.com> wrote: > > Hi Marc, > > On 05/06/2024 14:39, Marc Zyngier wrote: > > The subject line is... odd. I'd expect something like: > > > > "irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor" > > > > because nothing here should be CCA specific. > > Good point - that's a much better subject. > > > On Wed, 05 Jun 2024 10:30:04 +0100, > > Steven Price <steven.price@arm.com> wrote: > >> > >> Within a realm guest the ITS is emulated by the host. This means the > >> allocations must have been made available to the host by a call to > >> set_memory_decrypted(). Introduce an allocation function which performs > >> this extra call. > > > > This doesn't mention that this patch radically changes the allocation > > of some tables. > > I guess that depends on your definition of radical, see below. It's election time, I'm all about making bold statements! [...] > >> @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its, > >> > >> /* Allocate memory for 2nd level table */ > >> if (!table[idx]) { > >> - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > >> - get_order(baser->psz)); > >> + page = its_alloc_pages_node(its->numa_node, > >> + GFP_KERNEL | __GFP_ZERO, > >> + get_order(baser->psz)); > >> if (!page) > >> return false; > >> > >> @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > >> unsigned long *lpi_map = NULL; > >> unsigned long flags; > >> u16 *col_map = NULL; > >> + struct page *page; > >> void *itt; > >> + int itt_order; > >> int lpi_base; > >> int nr_lpis; > >> int nr_ites; > >> @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > >> if (WARN_ON(!is_power_of_2(nvecs))) > >> nvecs = roundup_pow_of_two(nvecs); > >> > >> - dev = kzalloc(sizeof(*dev), GFP_KERNEL); > >> /* > >> * Even if the device wants a single LPI, the ITT must be > >> * sized as a power of two (and you need at least one bit...). > >> @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > >> nr_ites = max(2, nvecs); > >> sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); > >> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; > >> - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); > >> + itt_order = get_order(sz); > >> + page = its_alloc_pages_node(its->numa_node, > >> + GFP_KERNEL | __GFP_ZERO, > >> + itt_order); > > > > So we go from an allocation that was so far measured in *bytes* to > > something that is now at least a page. Per device. This seems a bit > > excessive to me, specially when it isn't conditioned on anything and > > is now imposed on all platforms, including the non-CCA systems (which > > are exactly 100% of the machines). > > Catalin asked about this in v2: > https://lore.kernel.org/lkml/c329ae18-2b61-4851-8d6a-9e691a2007c8@arm.com/ > > To be honest, I don't have a great handle on how much memory is being > wasted here. Within the realm guest I was testing this is rounding up an > otherwise 511 byte allocation to a 4k page, and there are 3 of them. > Which seems reasonable from a realm guest perspective. And not that reasonable on a smaller system, such as my own router VM that has a whole lot of devices and very little memory. Not to mention that while CCA is stuck with 4k pages (duh!), the world is moving towards larger pages, meaning that this is wasting even more memory. > > I can see two options to improve here: > > 1. 
Add a !is_realm_world() check and return to the previous behaviour > when not running in a realm. It's ugly, and doesn't deal with any other > potential future memory encryption. cc_platform_has(CC_ATTR_MEM_ENCRYPT) > might be preferable? But this means no impact to non-realm guests. No, this is way too ugly, and doesn't help with things like pKVM. > > 2. Use a special (global) memory allocator that does the > set_memory_decrypted() dance on the pages that it allocates but allows > packing the allocations. I'm not aware of an existing kernel API for > this, so it's potentially quite a bit of code. The benefit is that it > reduces memory consumption in a realm guest, although fragmentation > still means we're likely to see a (small) growth. > > Any thoughts on what you think would be best? I would expect that something similar to kmem_cache could be of help, only with the ability to deal with variable object sizes (in this case: minimum of 256 bytes, in increments defined by the implementation, and with a 256 byte alignment). I don't think the ITS is particularly special here, and we should come up with something that is generic enough to support sharing of non-page-sized objects. Thanks, M.
On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote: > On Wed, 05 Jun 2024 16:08:49 +0100, > Steven Price <steven.price@arm.com> wrote: > > 2. Use a special (global) memory allocator that does the > > set_memory_decrypted() dance on the pages that it allocates but allows > > packing the allocations. I'm not aware of an existing kernel API for > > this, so it's potentially quite a bit of code. The benefit is that it > > reduces memory consumption in a realm guest, although fragmentation > > still means we're likely to see a (small) growth. > > > > Any thoughts on what you think would be best? > > I would expect that something similar to kmem_cache could be of help, > only with the ability to deal with variable object sizes (in this > case: minimum of 256 bytes, in increments defined by the > implementation, and with a 256 byte alignment). Hmm, that's doable but not that easy to make generic. We'd need a new class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only decrypted pages together with a GFP_DECRYPTED flag or something to get the slab allocator to go for these pages and avoid merging with other caches. It would actually be the page allocator parsing this gfp flag, probably in post_alloc_hook() to set the page decrypted and re-encrypt it in free_pages_prepare(). A slight problem here is that free_pages() doesn't get the gfp flag, so we'd need to store some bit in the page flags. Maybe the flag is not that bad; do we have something like page_to_phys() to give us the high IPA address for decrypted pages? Similarly if we go for a kmem_cache (or a few for multiple sizes). One can specify a constructor which could set the memory decrypted but there's no destructor (and also the constructor is per object, not per page, so we'd need some refcounting). Another approach contained within the driver is to use mempool_create() with our own _alloc_fn/_free_fn that sets the memory decrypted/encrypted accordingly, though sub-page allocations need additional tracking. Also that's fairly similar to kmem_cache, fixed size. Yet another option would be to wire it somehow in the DMA API but the minimum allocation is already a page size, so we don't gain anything. What gets somewhat closer to what we need is gen_pool. It can track different sizes, we just need to populate the chunks as needed. I don't think this would work as a generic allocator but may be good enough within the ITS code. If there's a need for such generic allocations in other parts of the kernel, my preference would be something around kmalloc caches and a new GFP flag (first option; subject to selling it to the mm folk). But that's more of a separate prototyping effort that may or may not succeed. Anyway, we could do some hacking around gen_pool as a temporary solution (maybe as a set of patches on top of this series to be easier to revert) and start investigating a proper decrypted page allocator in parallel. We just need to find a victim that has the page allocator fresh in mind (Ryan or Alexandru ;)).
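For comparison, a rough sketch of the mempool_create() route mentioned above, restricted to page-sized elements (the function names are invented for illustration, and sub-page packing would still need tracking on top of this):

#include <linux/mempool.h>
#include <linux/set_memory.h>

static void *decrypted_pool_alloc(gfp_t gfp, void *pool_data)
{
	struct page *page = alloc_pages(gfp | __GFP_ZERO, 0);

	if (!page)
		return NULL;
	/* On failure the page state is unknown: leak it rather than reuse it. */
	if (set_memory_decrypted((unsigned long)page_address(page), 1))
		return NULL;
	return page_address(page);
}

static void decrypted_pool_free(void *element, void *pool_data)
{
	/* Only hand the page back if re-encryption definitely succeeded. */
	if (!set_memory_encrypted((unsigned long)element, 1))
		free_page((unsigned long)element);
}

/* e.g.: pool = mempool_create(4, decrypted_pool_alloc, decrypted_pool_free, NULL); */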
On 06/06/2024 19:38, Catalin Marinas wrote: > On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote: >> On Wed, 05 Jun 2024 16:08:49 +0100, >> Steven Price <steven.price@arm.com> wrote: >>> 2. Use a special (global) memory allocator that does the >>> set_memory_decrypted() dance on the pages that it allocates but allows >>> packing the allocations. I'm not aware of an existing kernel API for >>> this, so it's potentially quite a bit of code. The benefit is that it >>> reduces memory consumption in a realm guest, although fragmentation >>> still means we're likely to see a (small) growth. >>> >>> Any thoughts on what you think would be best? >> >> I would expect that something similar to kmem_cache could be of help, >> only with the ability to deal with variable object sizes (in this >> case: minimum of 256 bytes, in increments defined by the >> implementation, and with a 256 byte alignment). > > Hmm, that's doable but not that easy to make generic. We'd need a new > class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only > decrypted pages together with a GFP_DECRYPTED flag or something to get > the slab allocator to go for these pages and avoid merging with other > caches. It would actually be the page allocator parsing this gfp flag, > probably in post_alloc_hook() to set the page decrypted and re-encrypt > it in free_pages_prepare(). A slight problem here is that free_pages() > doesn't get the gfp flag, so we'd need to store some bit in the page > flags. Maybe the flag is not that bad, do we have something like for > page_to_phys() to give us the high IPA address for decrypted pages? > > Similarly if we go for a kmem_cache (or a few for multiple sizes). One > can specify a constructor which could set the memory decrypted but > there's no destructor (and also the constructor is per object, not per > page, so we'd need some refcounting). > > Another approach contained within the driver is to use mempool_create() > with our own _alloc_fn/_free_fn that sets the memory decrypted/encrypted > accordingly, though sub-page allocations need additional tracking. Also > that's fairly similar to kmem_cache, fixed size. > > Yet another option would be to wire it somehow in the DMA API but the > minimum allocation is already a page size, so we don't gain anything. > > What gets somewhat closer to what we need is gen_pool. It can track > different sizes, we just need to populate the chunks as needed. I don't > think this would work as a generic allocator but may be good enough > within the ITS code. > > If there's a need for such generic allocations in other parts of the > kernel, my preference would be something around kmalloc caches and a new > GFP flag (first option; subject to the selling it to the mm folk). But > that's more of a separate prototyping effort that may or may not > succeed. > > Anyway, we could do some hacking around gen_pool as a temporary solution > (maybe as a set of patches on top of this series to be easier to revert) > and start investigating a proper decrypted page allocator in parallel. > We just need to find a victim that has the page allocator fresh in mind > (Ryan or Alexandru ;)). Thanks for the suggestions Catalin. I had a go at implementing something with gen_pool - the below (very lightly tested) hack seems to work. This is on top of the current series. I *think* it should also be safe to drop the whole alignment part with this custom allocator, which could actually save memory. But I haven't quite got my head around that yet. 
Steve ----8<----- diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index ca72f830f4cc..e78634d4d22c 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -12,6 +12,7 @@ #include <linux/crash_dump.h> #include <linux/delay.h> #include <linux/efi.h> +#include <linux/genalloc.h> #include <linux/interrupt.h> #include <linux/iommu.h> #include <linux/iopoll.h> @@ -165,7 +166,7 @@ struct its_device { struct its_node *its; struct event_lpi_map event_map; void *itt; - u32 itt_order; + u32 itt_sz; u32 nr_ites; u32 device_id; bool shared; @@ -225,6 +226,50 @@ static void its_free_pages(void *addr, unsigned int order) free_pages((unsigned long)addr, order); } +static struct gen_pool *itt_pool; + +static void *itt_alloc_pool(int node, int size) +{ + unsigned long addr; + struct page *page; + + if (size >= PAGE_SIZE) { + page = its_alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO, + get_order(size)); + + return page_address(page); + } + + do { + addr = gen_pool_alloc(itt_pool, size); + if (addr) + break; + + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 1); + if (!page) + break; + + gen_pool_add(itt_pool, (unsigned long)page_address(page), + PAGE_SIZE, node); + } while (!addr); + + return (void*)addr; +} + +static void itt_free_pool(void *addr, int size) +{ + if (!addr) + return; + + if (size >= PAGE_SIZE) { + its_free_pages(addr, get_order(size)); + return; + } + + gen_pool_free(itt_pool, (unsigned long)addr, size); +} + /* * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we * always have vSGIs mapped. @@ -3450,9 +3495,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, unsigned long *lpi_map = NULL; unsigned long flags; u16 *col_map = NULL; - struct page *page; void *itt; - int itt_order; int lpi_base; int nr_lpis; int nr_ites; @@ -3471,13 +3514,8 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, nr_ites = max(2, nvecs); sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; - itt_order = get_order(sz); - page = its_alloc_pages_node(its->numa_node, - GFP_KERNEL | __GFP_ZERO, - itt_order); - if (!page) - return NULL; - itt = page_address(page); + + itt = itt_alloc_pool(its->numa_node, sz); dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -3492,9 +3530,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, lpi_base = 0; } - if (!dev || !col_map || (!lpi_map && alloc_lpis)) { + if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { kfree(dev); - its_free_pages(itt, itt_order); + itt_free_pool(itt, sz); bitmap_free(lpi_map); kfree(col_map); return NULL; @@ -3504,7 +3542,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, dev->its = its; dev->itt = itt; - dev->itt_order = itt_order; + dev->itt_sz = sz; dev->nr_ites = nr_ites; dev->event_map.lpi_map = lpi_map; dev->event_map.col_map = col_map; @@ -3532,7 +3570,7 @@ static void its_free_device(struct its_device *its_dev) list_del(&its_dev->entry); raw_spin_unlock_irqrestore(&its_dev->its->lock, flags); kfree(its_dev->event_map.col_map); - its_free_pages(its_dev->itt, its_dev->itt_order); + itt_free_pool(its_dev->itt, its_dev->itt_sz); kfree(its_dev); } @@ -5722,6 +5760,10 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, bool has_v4_1 = false; int err; + itt_pool = gen_pool_create(get_order(ITS_ITT_ALIGN), -1); + if (!itt_pool) + return -ENOMEM; + 
gic_rdists = rdists; its_parent = parent_domain;
On Fri, Jun 07, 2024 at 04:45:14PM +0100, Steven Price wrote: > On 06/06/2024 19:38, Catalin Marinas wrote: > > Anyway, we could do some hacking around gen_pool as a temporary solution > > (maybe as a set of patches on top of this series to be easier to revert) > > and start investigating a proper decrypted page allocator in parallel. > > We just need to find a victim that has the page allocator fresh in mind > > (Ryan or Alexandru ;)). > > Thanks for the suggestions Catalin. I had a go at implementing something > with gen_pool - the below (very lightly tested) hack seems to work. This > is on top of the current series. > > I *think* it should also be safe to drop the whole alignment part with > this custom allocator, which could actually save memory. But I haven't > quite got my head around that yet. Thanks Steven. It doesn't look too complex and it solves the memory waste. We don't actually free the pages from gen_pool but I don't think it matters much; the memory would get reused if devices are removed and re-added.
On Thu, Jun 06, 2024 at 07:38:08PM +0100, Catalin Marinas wrote: > On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote: > > On Wed, 05 Jun 2024 16:08:49 +0100, > > Steven Price <steven.price@arm.com> wrote: > > > 2. Use a special (global) memory allocator that does the > > > set_memory_decrypted() dance on the pages that it allocates but allows > > > packing the allocations. I'm not aware of an existing kernel API for > > > this, so it's potentially quite a bit of code. The benefit is that it > > > reduces memory consumption in a realm guest, although fragmentation > > > still means we're likely to see a (small) growth. > > > > > > Any thoughts on what you think would be best? > > > > I would expect that something similar to kmem_cache could be of help, > > only with the ability to deal with variable object sizes (in this > > case: minimum of 256 bytes, in increments defined by the > > implementation, and with a 256 byte alignment). > > Hmm, that's doable but not that easy to make generic. We'd need a new > class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only > decrypted pages together with a GFP_DECRYPTED flag or something to get > the slab allocator to go for these pages and avoid merging with other > caches. It would actually be the page allocator parsing this gfp flag, > probably in post_alloc_hook() to set the page decrypted and re-encrypt > it in free_pages_prepare(). A slight problem here is that free_pages() > doesn't get the gfp flag, so we'd need to store some bit in the page > flags. Maybe the flag is not that bad, do we have something like for > page_to_phys() to give us the high IPA address for decrypted pages? I had a go at this generic approach, Friday afternoon fun. Not tested with the CCA patches, just my own fake set_memory_*() functionality on top of 6.10-rc2. I also mindlessly added __GFP_DECRYPTED to the ITS code following the changes in this patch. I noticed that allocate_vpe_l2_table() doesn't use shared pages (I either missed it or it's not needed). If we want to go this way, I can tidy up the diff below, split it into a proper series and add what's missing. What I can't tell is how a driver writer knows when to pass __GFP_DECRYPTED. There's also a theoretical overlap with __GFP_DMA, it can't handle both. The DMA flag takes priority currently but I realised it probably needs to be the other way around, we wouldn't have dma mask restrictions for emulated devices. 
--------------------8<---------------------------------- diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 40ebf1726393..b6627e839e62 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2212,7 +2212,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) { struct page *prop_page; - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); + prop_page = alloc_pages(gfp_flags | __GFP_DECRYPTED, + get_order(LPI_PROPBASE_SZ)); if (!prop_page) return NULL; @@ -2346,7 +2347,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, order = get_order(GITS_BASER_PAGES_MAX * psz); } - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO | + __GFP_DECRYPTED, order); if (!page) return -ENOMEM; @@ -2940,7 +2942,8 @@ static int allocate_vpe_l1_table(void) pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", np, npg, psz, epp, esz); - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); + page = alloc_pages(GFP_ATOMIC | __GFP_ZERO | __GFP_DECRYPTED, + get_order(np * PAGE_SIZE)); if (!page) return -ENOMEM; @@ -2986,7 +2989,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) { struct page *pend_page; - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, + pend_page = alloc_pages(gfp_flags | __GFP_ZERO | __GFP_DECRYPTED, get_order(LPI_PENDBASE_SZ)); if (!pend_page) return NULL; @@ -3334,7 +3337,8 @@ static bool its_alloc_table_entry(struct its_node *its, /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(its->numa_node, GFP_KERNEL | + __GFP_ZERO | __GFP_DECRYPTED, get_order(baser->psz)); if (!page) return false; @@ -3438,7 +3442,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, nr_ites = max(2, nvecs); sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); + itt = kzalloc_node(sz, GFP_KERNEL | __GFP_DECRYPTED, its->numa_node); if (alloc_lpis) { lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis); if (lpi_map) @@ -5131,8 +5135,8 @@ static int __init its_probe_one(struct its_node *its) } } - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, - get_order(ITS_CMD_QUEUE_SZ)); + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO | + __GFP_DECRYPTED, get_order(ITS_CMD_QUEUE_SZ)); if (!page) { err = -ENOMEM; goto out_unmap_sgir; diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 313be4ad79fd..573989664639 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -57,6 +57,9 @@ enum { #endif #ifdef CONFIG_SLAB_OBJ_EXT ___GFP_NO_OBJ_EXT_BIT, +#endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + ___GFP_DECRYPTED_BIT, #endif ___GFP_LAST_BIT }; @@ -103,6 +106,11 @@ enum { #else #define ___GFP_NO_OBJ_EXT 0 #endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT +#define ___GFP_DECRYPTED BIT(___GFP_DECRYPTED_BIT) +#else +#define ___GFP_DECRYPTED 0 +#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -117,6 +125,8 @@ enum { #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) +#define __GFP_DECRYPTED ((__force gfp_t)___GFP_DECRYPTED) + /** * DOC: Page mobility and 
placement hints * diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 104078afe0b1..705707052274 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -127,6 +127,9 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + PG_decrypted, +#endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, @@ -626,6 +629,15 @@ PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT +PAGEFLAG(Decrypted, decrypted, PF_HEAD) +#define __PG_DECRYPTED (1UL << PG_decrypted) +#else +PAGEFLAG_FALSE(Decrypted, decrypted) +#define __PG_DECRYPTED 0 +#endif + + #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..f7a2cf624c35 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -422,6 +422,9 @@ enum kmalloc_cache_type { #endif #ifdef CONFIG_MEMCG_KMEM KMALLOC_CGROUP, +#endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + KMALLOC_DECRYPTED, #endif NR_KMALLOC_TYPES }; @@ -433,7 +436,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; * Define gfp bits that should not be set for KMALLOC_NORMAL. */ #define KMALLOC_NOT_NORMAL_BITS \ - (__GFP_RECLAIMABLE | \ + (__GFP_RECLAIMABLE | __GFP_DECRYPTED | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) @@ -458,11 +461,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne * At least one of the flags has to be set. Their priorities in * decreasing order are: * 1) __GFP_DMA - * 2) __GFP_RECLAIMABLE - * 3) __GFP_ACCOUNT + * 2) __GFP_DECRYPTED + * 3) __GFP_RECLAIMABLE + * 4) __GFP_ACCOUNT */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; + if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (flags & __GFP_DECRYPTED)) + return KMALLOC_DECRYPTED; if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e46d6e82765e..a0879155f892 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -83,6 +83,12 @@ #define IF_HAVE_PG_HWPOISON(_name) #endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT +#define IF_HAVE_PG_DECRYPTED(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_DECRYPTED(_name) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -121,6 +127,7 @@ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ +IF_HAVE_PG_DECRYPTED(decrypted) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ IF_HAVE_PG_ARCH_X(arch_2) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e22ce5675ca..c93ae50ec402 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -47,6 +47,7 @@ #include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/page_table_check.h> +#include <linux/set_memory.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/lockdep.h> @@ -1051,6 +1052,12 @@ __always_inline bool free_pages_prepare(struct page *page, return false; } + if (unlikely(PageDecrypted(page))) { + set_memory_encrypted((unsigned long)page_address(page), + 1 << order); + ClearPageDecrypted(page); + } + VM_BUG_ON_PAGE(compound && 
compound_order(page) != order, page); /* @@ -1415,6 +1422,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool decrypted = true; //gfp_flags & __GFP_DECRYPTED; int i; set_page_private(page, 0); @@ -1465,6 +1473,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order, if (init) kernel_init_pages(page, 1 << order); + if (decrypted) { + set_memory_decrypted((unsigned long)page_address(page), + 1 << order); + SetPageDecrypted(page); + } + set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); pgalloc_tag_add(page, current, 1 << order); diff --git a/mm/slab_common.c b/mm/slab_common.c index 1560a1546bb1..de9c8c674aa1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -737,6 +737,12 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_RCL_NAME(sz) #endif +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT +#define KMALLOC_DECRYPTED_NAME(sz) .name[KMALLOC_DECRYPTED] = "kmalloc-decrypted-" #sz, +#else +#define KMALLOC_DECRYPTED_NAME(sz) +#endif + #ifdef CONFIG_RANDOM_KMALLOC_CACHES #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz) @@ -765,6 +771,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ + KMALLOC_DECRYPTED_NAME(__short_size) \ KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \ .size = __size, \ } @@ -889,6 +896,9 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL)) flags |= SLAB_NO_MERGE; + if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (type == KMALLOC_DECRYPTED)) + flags |= SLAB_NO_MERGE; + if (minalign > ARCH_KMALLOC_MINALIGN) { aligned_size = ALIGN(aligned_size, minalign); aligned_idx = __kmalloc_index(aligned_size, false);
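If something like the prototype above lands, the driver-facing pattern collapses to a single flag: the kmalloc-decrypted-* caches keep flagged objects on dedicated decrypted pages, and re-encryption happens in free_pages_prepare() via PG_decrypted rather than in the driver. A hedged illustration of the intended usage:

	/* Object is host-visible for its whole lifetime. */
	itt = kzalloc_node(sz, GFP_KERNEL | __GFP_DECRYPTED, its->numa_node);
	if (!itt)
		return NULL;
	/* ... use itt ... */
	kfree(itt);	/* page re-encrypted when the slab page is eventually freed */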
From: Steven Price <steven.price@arm.com> Sent: Wednesday, June 5, 2024 2:30 AM > > Within a realm guest the ITS is emulated by the host. This means the > allocations must have been made available to the host by a call to > set_memory_decrypted(). Introduce an allocation function which performs > this extra call. > > Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com> > Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com> > Signed-off-by: Steven Price <steven.price@arm.com> > --- > Changes since v2: > * Drop 'shared' from the new its_xxx function names as they are used > for non-realm guests too. > * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node() > should do the right thing. > * Drop a pointless (void *) cast. > --- > drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++-------- > 1 file changed, 67 insertions(+), 23 deletions(-) > > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c > index 40ebf1726393..ca72f830f4cc 100644 > --- a/drivers/irqchip/irq-gic-v3-its.c > +++ b/drivers/irqchip/irq-gic-v3-its.c > @@ -18,6 +18,7 @@ > #include <linux/irqdomain.h> > #include <linux/list.h> > #include <linux/log2.h> > +#include <linux/mem_encrypt.h> > #include <linux/memblock.h> > #include <linux/mm.h> > #include <linux/msi.h> > @@ -27,6 +28,7 @@ > #include <linux/of_pci.h> > #include <linux/of_platform.h> > #include <linux/percpu.h> > +#include <linux/set_memory.h> > #include <linux/slab.h> > #include <linux/syscore_ops.h> > > @@ -163,6 +165,7 @@ struct its_device { > struct its_node *its; > struct event_lpi_map event_map; > void *itt; > + u32 itt_order; > u32 nr_ites; > u32 device_id; > bool shared; > @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida); > #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) > #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) > > +static struct page *its_alloc_pages_node(int node, gfp_t gfp, > + unsigned int order) > +{ > + struct page *page; > + > + page = alloc_pages_node(node, gfp, order); > + > + if (page) > + set_memory_decrypted((unsigned long)page_address(page), > + 1 << order); There's been considerable discussion on the x86 side about what to do when set_memory_decrypted() or set_memory_encrypted() fails. The conclusions are: 1) set_memory_decrypted()/encrypted() *could* fail due to a compromised/malicious host, due to a bug somewhere in the software stack, or due to resource constraints (x86 might need to split a large page mapping, and need to allocate additional page table pages, which could fail). 2) The guest memory that was the target of such a failed call could be left in an indeterminate state that the guest could not reliably undo or correct. The guest's view of the page's decrypted/encrypted state might not match the host's view. Therefore, any such guest memory must be leaked rather than being used or put back on the free list. 3) After some discussion, we decided not to panic in such a case. Instead, set_memory_decrypted()/encrypted() generates a WARN, as well as returns a failure result. The most security conscious users could set panic_on_warn=1 in their VMs, and thereby stop further operation if there any indication that the transition between encrypted and decrypt is suspect. The caller of these functions also can take explicit action in the case of a failure. It seems like the same guidelines should apply here. 
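Translated into this patch's helper, those guidelines would look roughly like the sketch below (a sketch, not the posted code): check the return value, warn, and deliberately leak pages whose encryption state is indeterminate rather than freeing them:

static struct page *its_alloc_pages_node(int node, gfp_t gfp,
					 unsigned int order)
{
	struct page *page;
	int ret;

	page = alloc_pages_node(node, gfp, order);
	if (!page)
		return NULL;

	ret = set_memory_decrypted((unsigned long)page_address(page),
				   BIT(order));
	/*
	 * The page may or may not still be visible to the host; it must
	 * not go back on the free list, so leak it and warn.
	 */
	if (WARN_ON(ret))
		return NULL;

	return page;
}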
On the x86 side we've also cleaned up cases where the return value isn't checked, like here and the use of set_memory_encrypted() below. Michael > + return page; > +} > + > +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order) > +{ > + return its_alloc_pages_node(NUMA_NO_NODE, gfp, order); > +} > + > +static void its_free_pages(void *addr, unsigned int order) > +{ > + set_memory_encrypted((unsigned long)addr, 1 << order); > + free_pages((unsigned long)addr, order); > +} > + > /* > * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we > * always have vSGIs mapped. > @@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) > { > struct page *prop_page; > > - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); > + prop_page = its_alloc_pages(gfp_flags, > + get_order(LPI_PROPBASE_SZ)); > if (!prop_page) > return NULL; > > @@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) > > static void its_free_prop_table(struct page *prop_page) > { > - free_pages((unsigned long)page_address(prop_page), > - get_order(LPI_PROPBASE_SZ)); > + its_free_pages(page_address(prop_page), > + get_order(LPI_PROPBASE_SZ)); > } > > static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) > @@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct > its_baser *baser, > order = get_order(GITS_BASER_PAGES_MAX * psz); > } > > - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, order); > if (!page) > return -ENOMEM; > > @@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct > its_baser *baser, > /* 52bit PA is supported only when PageSize=64K */ > if (psz != SZ_64K) { > pr_err("ITS: no 52bit PA support when psz=%d\n", psz); > - free_pages((unsigned long)base, order); > + its_free_pages(base, order); > return -ENXIO; > } > > @@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct > its_baser *baser, > pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", > &its->phys_base, its_base_type_string[type], > val, tmp); > - free_pages((unsigned long)base, order); > + its_free_pages(base, order); > return -ENXIO; > } > > @@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its) > > for (i = 0; i < GITS_BASER_NR_REGS; i++) { > if (its->tables[i].base) { > - free_pages((unsigned long)its->tables[i].base, > - its->tables[i].order); > + its_free_pages(its->tables[i].base, > + its->tables[i].order); > its->tables[i].base = NULL; > } > } > @@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) > > /* Allocate memory for 2nd level table */ > if (!table[idx]) { > - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); > + page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO, > + get_order(psz)); > if (!page) > return false; > > @@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void) > > pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", > np, npg, psz, epp, esz); > - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); > + page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO, > + get_order(np * PAGE_SIZE)); > if (!page) > return -ENOMEM; > > @@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t > gfp_flags) > { > struct page *pend_page; > > - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, > - get_order(LPI_PENDBASE_SZ)); > + pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO, > + 
get_order(LPI_PENDBASE_SZ)); > if (!pend_page) > return NULL; > > @@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t > gfp_flags) > > static void its_free_pending_table(struct page *pt) > { > - free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ)); > + its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ)); > } > > /* > @@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its, > > /* Allocate memory for 2nd level table */ > if (!table[idx]) { > - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > - get_order(baser->psz)); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, > + get_order(baser->psz)); > if (!page) > return false; > > @@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node > *its, u32 dev_id, > unsigned long *lpi_map = NULL; > unsigned long flags; > u16 *col_map = NULL; > + struct page *page; > void *itt; > + int itt_order; > int lpi_base; > int nr_lpis; > int nr_ites; > @@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node > *its, u32 dev_id, > if (WARN_ON(!is_power_of_2(nvecs))) > nvecs = roundup_pow_of_two(nvecs); > > - dev = kzalloc(sizeof(*dev), GFP_KERNEL); > /* > * Even if the device wants a single LPI, the ITT must be > * sized as a power of two (and you need at least one bit...). > @@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node > *its, u32 dev_id, > nr_ites = max(2, nvecs); > sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); > sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; > - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); > + itt_order = get_order(sz); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, > + itt_order); > + if (!page) > + return NULL; > + itt = page_address(page); > + > + dev = kzalloc(sizeof(*dev), GFP_KERNEL); > + > if (alloc_lpis) { > lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis); > if (lpi_map) > @@ -3450,9 +3492,9 @@ static struct its_device *its_create_device(struct its_node > *its, u32 dev_id, > lpi_base = 0; > } > > - if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { > + if (!dev || !col_map || (!lpi_map && alloc_lpis)) { > kfree(dev); > - kfree(itt); > + its_free_pages(itt, itt_order); > bitmap_free(lpi_map); > kfree(col_map); > return NULL; > @@ -3462,6 +3504,7 @@ static struct its_device *its_create_device(struct its_node > *its, u32 dev_id, > > dev->its = its; > dev->itt = itt; > + dev->itt_order = itt_order; > dev->nr_ites = nr_ites; > dev->event_map.lpi_map = lpi_map; > dev->event_map.col_map = col_map; > @@ -3489,7 +3532,7 @@ static void its_free_device(struct its_device *its_dev) > list_del(&its_dev->entry); > raw_spin_unlock_irqrestore(&its_dev->its->lock, flags); > kfree(its_dev->event_map.col_map); > - kfree(its_dev->itt); > + its_free_pages(its_dev->itt, its_dev->itt_order); > kfree(its_dev); > } > > @@ -5131,8 +5174,9 @@ static int __init its_probe_one(struct its_node *its) > } > } > > - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > - get_order(ITS_CMD_QUEUE_SZ)); > + page = its_alloc_pages_node(its->numa_node, > + GFP_KERNEL | __GFP_ZERO, > + get_order(ITS_CMD_QUEUE_SZ)); > if (!page) { > err = -ENOMEM; > goto out_unmap_sgir; > @@ -5196,7 +5240,7 @@ static int __init its_probe_one(struct its_node *its) > out_free_tables: > its_free_tables(its); > out_free_cmd: > - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); > + 
its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); > out_unmap_sgir: > if (its->sgir_base) > iounmap(its->sgir_base); > -- > 2.34.1 >
From: Catalin Marinas <catalin.marinas@arm.com> Sent: Friday, June 7, 2024 10:55 AM > > On Thu, Jun 06, 2024 at 07:38:08PM +0100, Catalin Marinas wrote: > > On Thu, Jun 06, 2024 at 11:17:36AM +0100, Marc Zyngier wrote: > > > On Wed, 05 Jun 2024 16:08:49 +0100, > > > Steven Price <steven.price@arm.com> wrote: > > > > 2. Use a special (global) memory allocator that does the > > > > set_memory_decrypted() dance on the pages that it allocates but allows > > > > packing the allocations. I'm not aware of an existing kernel API for > > > > this, so it's potentially quite a bit of code. The benefit is that it > > > > reduces memory consumption in a realm guest, although fragmentation > > > > still means we're likely to see a (small) growth. > > > > > > > > Any thoughts on what you think would be best? > > > > > > I would expect that something similar to kmem_cache could be of help, > > > only with the ability to deal with variable object sizes (in this > > > case: minimum of 256 bytes, in increments defined by the > > > implementation, and with a 256 byte alignment). > > > > Hmm, that's doable but not that easy to make generic. We'd need a new > > class of kmalloc-* caches (e.g. kmalloc-decrypted-*) which use only > > decrypted pages together with a GFP_DECRYPTED flag or something to get > > the slab allocator to go for these pages and avoid merging with other > > caches. It would actually be the page allocator parsing this gfp flag, > > probably in post_alloc_hook() to set the page decrypted and re-encrypt > > it in free_pages_prepare(). A slight problem here is that free_pages() > > doesn't get the gfp flag, so we'd need to store some bit in the page > > flags. Maybe the flag is not that bad, do we have something like for > > page_to_phys() to give us the high IPA address for decrypted pages? > > I had a go at this generic approach, Friday afternoon fun. Not tested > with the CCA patches, just my own fake set_memory_*() functionality on > top of 6.10-rc2. I also mindlessly added __GFP_DECRYPTED to the ITS code > following the changes in this patch. I noticed that > allocate_vpe_l2_table() doesn't use shared pages (I either missed it or > it's not needed). > > If we want to go this way, I can tidy up the diff below, split it into a > proper series and add what's missing. What I can't tell is how a driver > writer knows when to pass __GFP_DECRYPTED. There's also a theoretical > overlap with __GFP_DMA, it can't handle both. The DMA flag takes > priority currently but I realised it probably needs to be the other way > around, we wouldn't have dma mask restrictions for emulated devices. As someone who has spent time in Linux code on the x86 side that manages memory shared between the guest and host, a GFP_DECRYPTED flag on the memory allocation calls seems very useful. Here are my general comments, some of which are probably obvious, but I thought I'd write them down anyway. 1) The impetus in this case is to efficiently allow sub-page allocations. But on x86, there's also an issue in that the x86 direct map uses large page mappings, which must be split if any 4K page in the large page is decrypted. Ideally, we could cluster such decrypted pages into the same large page(s) vs. just grabbing any 4K page, and reduce the number of splits. I haven't looked at how feasible that is to implement in the context of the existing allocator code, so this is aspirational. 2) GFP_DECRYPTED should work for the three memory allocation interfaces and their variants: alloc_pages(), kmalloc(), and vmalloc(). 
3) The current paradigm is to allocate memory and call
set_memory_decrypted(), then do the reverse when freeing memory.
Using GFP_DECRYPTED on the allocation, and having the re-encryption
done automatically when freeing, certainly simplifies the call sites
(see the usage sketch after the diff at the end of this message).
The error handling at the call sites is additional messiness that gets
much simpler.

4) GFP_DECRYPTED should be silently ignored when not running
in a CoCo VM. Drivers that need decrypted memory in a CoCo VM
always set this flag, and work normally as if the flag isn't set when
not in a CoCo VM. This is how set_memory_decrypted()/encrypted()
work today. Drivers call them in all cases, and they are no-ops
when not in a CoCo VM.

5) The implementation of GFP_DECRYPTED must correctly handle
errors from set_memory_decrypted()/encrypted() as I've described
separately on this thread.

Michael

>
> --------------------8<----------------------------------
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 40ebf1726393..b6627e839e62 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -2212,7 +2212,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  {
>  	struct page *prop_page;
>
> -	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
> +	prop_page = alloc_pages(gfp_flags | __GFP_DECRYPTED,
> +				get_order(LPI_PROPBASE_SZ));
>  	if (!prop_page)
>  		return NULL;
>
> @@ -2346,7 +2347,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>  	}
>
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
> +	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
> +				__GFP_DECRYPTED, order);
>  	if (!page)
>  		return -ENOMEM;
>
> @@ -2940,7 +2942,8 @@ static int allocate_vpe_l1_table(void)
>
>  	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
>  		 np, npg, psz, epp, esz);
> -	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
> +	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO | __GFP_DECRYPTED,
> +			   get_order(np * PAGE_SIZE));
>  	if (!page)
>  		return -ENOMEM;
>
> @@ -2986,7 +2989,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>  	struct page *pend_page;
>
> -	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
> +	pend_page = alloc_pages(gfp_flags | __GFP_ZERO | __GFP_DECRYPTED,
>  				get_order(LPI_PENDBASE_SZ));
>  	if (!pend_page)
>  		return NULL;
> @@ -3334,7 +3337,8 @@ static bool its_alloc_table_entry(struct its_node *its,
>
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> -		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> +		page = alloc_pages_node(its->numa_node, GFP_KERNEL |
> +					__GFP_ZERO | __GFP_DECRYPTED,
>  					get_order(baser->psz));
>  		if (!page)
>  			return false;
> @@ -3438,7 +3442,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>  	nr_ites = max(2, nvecs);
>  	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
>  	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> -	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
> +	itt = kzalloc_node(sz, GFP_KERNEL | __GFP_DECRYPTED, its->numa_node);
>  	if (alloc_lpis) {
>  		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
>  		if (lpi_map)
> @@ -5131,8 +5135,8 @@ static int __init its_probe_one(struct its_node *its)
>  		}
>  	}
>
> -	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> -				get_order(ITS_CMD_QUEUE_SZ));
> +	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO |
> +				__GFP_DECRYPTED, get_order(ITS_CMD_QUEUE_SZ));
>  	if (!page) {
>  		err = -ENOMEM;
>  		goto out_unmap_sgir;
> diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
> index 313be4ad79fd..573989664639 100644
> --- a/include/linux/gfp_types.h
> +++ b/include/linux/gfp_types.h
> @@ -57,6 +57,9 @@ enum {
>  #endif
>  #ifdef CONFIG_SLAB_OBJ_EXT
>  	___GFP_NO_OBJ_EXT_BIT,
> +#endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	___GFP_DECRYPTED_BIT,
>  #endif
>  	___GFP_LAST_BIT
>  };
> @@ -103,6 +106,11 @@ enum {
>  #else
>  #define ___GFP_NO_OBJ_EXT	0
>  #endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define ___GFP_DECRYPTED	BIT(___GFP_DECRYPTED_BIT)
> +#else
> +#define ___GFP_DECRYPTED	0
> +#endif
>
>  /*
>   * Physical address zone modifiers (see linux/mmzone.h - low four bits)
> @@ -117,6 +125,8 @@ enum {
>  #define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
>  #define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
>
> +#define __GFP_DECRYPTED	((__force gfp_t)___GFP_DECRYPTED)
> +
>  /**
>   * DOC: Page mobility and placement hints
>   *
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 104078afe0b1..705707052274 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -127,6 +127,9 @@ enum pageflags {
>  #ifdef CONFIG_MEMORY_FAILURE
>  	PG_hwpoison,		/* hardware poisoned page. Don't touch */
>  #endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	PG_decrypted,
> +#endif
>  #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
>  	PG_young,
>  	PG_idle,
> @@ -626,6 +629,15 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
>  #define __PG_HWPOISON 0
>  #endif
>
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +PAGEFLAG(Decrypted, decrypted, PF_HEAD)
> +#define __PG_DECRYPTED	(1UL << PG_decrypted)
> +#else
> +PAGEFLAG_FALSE(Decrypted, decrypted)
> +#define __PG_DECRYPTED	0
> +#endif
> +
>
>  #ifdef CONFIG_PAGE_IDLE_FLAG
>  #ifdef CONFIG_64BIT
>  FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 7247e217e21b..f7a2cf624c35 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -422,6 +422,9 @@ enum kmalloc_cache_type {
>  #endif
>  #ifdef CONFIG_MEMCG_KMEM
>  	KMALLOC_CGROUP,
> +#endif
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +	KMALLOC_DECRYPTED,
>  #endif
>  	NR_KMALLOC_TYPES
>  };
> @@ -433,7 +436,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>   * Define gfp bits that should not be set for KMALLOC_NORMAL.
>   */
>  #define KMALLOC_NOT_NORMAL_BITS					\
> -	(__GFP_RECLAIMABLE |					\
> +	(__GFP_RECLAIMABLE | __GFP_DECRYPTED |			\
>  	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
>  	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
>
> @@ -458,11 +461,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne
>  	 * At least one of the flags has to be set. Their priorities in
>  	 * decreasing order are:
>  	 *  1) __GFP_DMA
> -	 *  2) __GFP_RECLAIMABLE
> -	 *  3) __GFP_ACCOUNT
> +	 *  2) __GFP_DECRYPTED
> +	 *  3) __GFP_RECLAIMABLE
> +	 *  4) __GFP_ACCOUNT
>  	 */
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
>  		return KMALLOC_DMA;
> +	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (flags & __GFP_DECRYPTED))
> +		return KMALLOC_DECRYPTED;
>  	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
>  		return KMALLOC_RECLAIM;
>  	else
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index e46d6e82765e..a0879155f892 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -83,6 +83,12 @@
>  #define IF_HAVE_PG_HWPOISON(_name)
>  #endif
>
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define IF_HAVE_PG_DECRYPTED(_name) ,{1UL << PG_##_name, __stringify(_name)}
> +#else
> +#define IF_HAVE_PG_DECRYPTED(_name)
> +#endif
> +
>  #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
>  #define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)}
>  #else
> @@ -121,6 +127,7 @@
>  IF_HAVE_PG_MLOCK(mlocked)					\
>  IF_HAVE_PG_UNCACHED(uncached)					\
>  IF_HAVE_PG_HWPOISON(hwpoison)					\
> +IF_HAVE_PG_DECRYPTED(decrypted)					\
>  IF_HAVE_PG_IDLE(idle)						\
>  IF_HAVE_PG_IDLE(young)						\
>  IF_HAVE_PG_ARCH_X(arch_2)					\
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2e22ce5675ca..c93ae50ec402 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched/mm.h>
>  #include <linux/page_owner.h>
>  #include <linux/page_table_check.h>
> +#include <linux/set_memory.h>
>  #include <linux/memcontrol.h>
>  #include <linux/ftrace.h>
>  #include <linux/lockdep.h>
> @@ -1051,6 +1052,12 @@ __always_inline bool free_pages_prepare(struct page *page,
>  		return false;
>  	}
>
> +	if (unlikely(PageDecrypted(page))) {
> +		set_memory_encrypted((unsigned long)page_address(page),
> +				     1 << order);
> +		ClearPageDecrypted(page);
> +	}
> +
>  	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
>
>  	/*
> @@ -1415,6 +1422,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
>  			!should_skip_init(gfp_flags);
>  	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
> +	bool decrypted = true; //gfp_flags & __GFP_DECRYPTED;
>  	int i;
>
>  	set_page_private(page, 0);
> @@ -1465,6 +1473,12 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	if (init)
>  		kernel_init_pages(page, 1 << order);
>
> +	if (decrypted) {
> +		set_memory_decrypted((unsigned long)page_address(page),
> +				     1 << order);
> +		SetPageDecrypted(page);
> +	}
> +
>  	set_page_owner(page, order, gfp_flags);
>  	page_table_check_alloc(page, order);
>  	pgalloc_tag_add(page, current, 1 << order);
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 1560a1546bb1..de9c8c674aa1 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -737,6 +737,12 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
>  #define KMALLOC_RCL_NAME(sz)
>  #endif
>
> +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
> +#define KMALLOC_DECRYPTED_NAME(sz)	.name[KMALLOC_DECRYPTED] = "kmalloc-decrypted-" #sz,
> +#else
> +#define KMALLOC_DECRYPTED_NAME(sz)
> +#endif
> +
>  #ifdef CONFIG_RANDOM_KMALLOC_CACHES
>  #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
>  #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
> @@ -765,6 +771,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
>  	KMALLOC_RCL_NAME(__short_size)				\
>  	KMALLOC_CGROUP_NAME(__short_size)			\
>  	KMALLOC_DMA_NAME(__short_size)				\
> +	KMALLOC_DECRYPTED_NAME(__short_size)			\
>  	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
>  	.size = __size,						\
>  }
> @@ -889,6 +896,9 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
>  	if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
>  		flags |= SLAB_NO_MERGE;
>
> +	if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && (type == KMALLOC_DECRYPTED))
> +		flags |= SLAB_NO_MERGE;
> +
>  	if (minalign > ARCH_KMALLOC_MINALIGN) {
>  		aligned_size = ALIGN(aligned_size, minalign);
>  		aligned_idx = __kmalloc_index(aligned_size, false);
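To make point 3) above concrete, here is a hypothetical driver sketch against the prototype diff (the my_dev_ring names are invented for illustration; nothing like this appears in the thread). The call site loses both the set_memory_decrypted() call and its failure/cleanup handling, because post_alloc_hook() performs the transition and free_pages_prepare() undoes it via PG_decrypted:

	#include <linux/gfp.h>
	#include <linux/slab.h>

	struct my_dev_ring {
		void *buf;
		size_t len;
	};

	static int my_dev_ring_alloc(struct my_dev_ring *ring, size_t len)
	{
		/* Returned memory is already decrypted (shared with the host) */
		ring->buf = kzalloc(len, GFP_KERNEL | __GFP_DECRYPTED);
		if (!ring->buf)
			return -ENOMEM;
		ring->len = len;
		return 0;
	}

	static void my_dev_ring_free(struct my_dev_ring *ring)
	{
		/* Re-encrypted automatically when the backing page is freed */
		kfree(ring->buf);
		ring->buf = NULL;
	}

Note that re-encryption only happens when the backing page returns to the page allocator, so a kmalloc-decrypted-* slab keeps its pages decrypted for the cache's lifetime - which is precisely the sub-page packing this subthread is after.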
Hi Michael,

All good points below, thanks.

On Tue, Jun 18, 2024 at 04:04:17PM +0000, Michael Kelley wrote:
> As someone who has spent time in Linux code on the x86 side that
> manages memory shared between the guest and host, a GFP_DECRYPTED
> flag on the memory allocation calls seems very useful. Here are my
> general comments, some of which are probably obvious, but I thought
> I'd write them down anyway.
>
> 1) The impetus in this case is to efficiently allow sub-page allocations.
> But on x86, there's also an issue in that the x86 direct map uses large
> page mappings, which must be split if any 4K page in the large page
> is decrypted. Ideally, we could cluster such decrypted pages into the
> same large page(s) vs. just grabbing any 4K page, and reduce the
> number of splits. I haven't looked at how feasible that is to implement
> in the context of the existing allocator code, so this is aspirational.

It might be doable, though it's a lot more intrusive than just a GFP
flag. However, such memory cannot be used for anything other than
sharing data with the host.

> 2) GFP_DECRYPTED should work for the three memory allocation
> interfaces and their variants: alloc_pages(), kmalloc(), and vmalloc().

Yes. There are two main aspects: (1) the page allocation plus the linear
map change and (2) additional mapping attributes, as in the vmalloc()
case.

> 3) The current paradigm is to allocate memory and call
> set_memory_decrypted(), then do the reverse when freeing memory.
> Using GFP_DECRYPTED on the allocation, and having the re-encryption
> done automatically when freeing, certainly simplifies the call sites.
> The error handling at the call sites is additional messiness that gets
> much simpler.
>
> 4) GFP_DECRYPTED should be silently ignored when not running
> in a CoCo VM. Drivers that need decrypted memory in a CoCo VM
> always set this flag, and work normally as if the flag isn't set when
> not in a CoCo VM. This is how set_memory_decrypted()/encrypted()
> work today. Drivers call them in all cases, and they are no-ops
> when not in a CoCo VM.

Yes, this should be straightforward. Maybe it can be simplified further
to avoid creating the additional kmalloc() caches if these functions are
not supported, though we seem to be missing a generic
arch_has_mem_encrypt() API.

> 5) The implementation of GFP_DECRYPTED must correctly handle
> errors from set_memory_decrypted()/encrypted() as I've described
> separately on this thread.

Good point, I completely missed this aspect. I'll prepare some patches
and post them next week to get feedback from the mm folk.

Thanks.
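A sketch of that gating, for illustration (arch_has_mem_encrypt() is the missing helper named above, and cc_platform_has() is only one plausible backend - both are assumptions, not code from this thread):

	#include <linux/cc_platform.h>

	/* Hypothetical: no generic helper like this exists yet. */
	static inline bool arch_has_mem_encrypt(void)
	{
		return cc_platform_has(CC_ATTR_MEM_ENCRYPT);
	}

	/*
	 * Used in kmalloc_type(), this would make __GFP_DECRYPTED silently
	 * degrade to a normal allocation outside CoCo VMs, mirroring the
	 * no-op behaviour of set_memory_decrypted()/encrypted():
	 */
	if (arch_has_mem_encrypt() && (flags & __GFP_DECRYPTED))
		return KMALLOC_DECRYPTED;

The cache-creation side could apply the same test so that the kmalloc-decrypted-* caches are not created at all on such systems.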
On 17/06/2024 04:54, Michael Kelley wrote:
> From: Steven Price <steven.price@arm.com> Sent: Wednesday, June 5, 2024 2:30 AM
>>
>> Within a realm guest the ITS is emulated by the host. This means the
>> allocations must have been made available to the host by a call to
>> set_memory_decrypted(). Introduce an allocation function which performs
>> this extra call.
>>
>> Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v2:
>>  * Drop 'shared' from the new its_xxx function names as they are used
>>    for non-realm guests too.
>>  * Don't handle the NUMA_NO_NODE case specially - alloc_pages_node()
>>    should do the right thing.
>>  * Drop a pointless (void *) cast.
>> ---
>>  drivers/irqchip/irq-gic-v3-its.c | 90 ++++++++++++++++++++++++--------
>>  1 file changed, 67 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index 40ebf1726393..ca72f830f4cc 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -18,6 +18,7 @@
>>  #include <linux/irqdomain.h>
>>  #include <linux/list.h>
>>  #include <linux/log2.h>
>> +#include <linux/mem_encrypt.h>
>>  #include <linux/memblock.h>
>>  #include <linux/mm.h>
>>  #include <linux/msi.h>
>> @@ -27,6 +28,7 @@
>>  #include <linux/of_pci.h>
>>  #include <linux/of_platform.h>
>>  #include <linux/percpu.h>
>> +#include <linux/set_memory.h>
>>  #include <linux/slab.h>
>>  #include <linux/syscore_ops.h>
>>
>> @@ -163,6 +165,7 @@ struct its_device {
>>  	struct its_node	*its;
>>  	struct event_lpi_map	event_map;
>>  	void		*itt;
>> +	u32		itt_order;
>>  	u32		nr_ites;
>>  	u32		device_id;
>>  	bool		shared;
>> @@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
>>  #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>>  #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
>>
>> +static struct page *its_alloc_pages_node(int node, gfp_t gfp,
>> +					 unsigned int order)
>> +{
>> +	struct page *page;
>> +
>> +	page = alloc_pages_node(node, gfp, order);
>> +
>> +	if (page)
>> +		set_memory_decrypted((unsigned long)page_address(page),
>> +				     1 << order);
>
> There's been considerable discussion on the x86 side about
> what to do when set_memory_decrypted() or set_memory_encrypted()
> fails. The conclusions are:
>
> 1) set_memory_decrypted()/encrypted() *could* fail due to a
> compromised/malicious host, due to a bug somewhere in the
> software stack, or due to resource constraints (x86 might need to
> split a large page mapping, and need to allocate additional page
> table pages, which could fail).
>
> 2) The guest memory that was the target of such a failed call could
> be left in an indeterminate state that the guest could not reliably
> undo or correct. The guest's view of the page's decrypted/encrypted
> state might not match the host's view. Therefore, any such guest
> memory must be leaked rather than being used or put back on the
> free list.
>
> 3) After some discussion, we decided not to panic in such a case.
> Instead, set_memory_decrypted()/encrypted() generates a WARN,
> as well as returns a failure result. The most security-conscious
> users could set panic_on_warn=1 in their VMs, and thereby stop
> further operation if there is any indication that the transition between
> encrypted and decrypted is suspect. The caller of these functions
> also can take explicit action in the case of a failure.
>
> It seems like the same guidelines should apply here. On the x86
> side we've also cleaned up cases where the return value isn't
> checked, like here and the use of set_memory_encrypted() below.

Very good points - this code was lacking error handling. I think you are
also right that the correct approach when set_memory_{en,de}crypted()
fails is to WARN() and leak the page. It's something that shouldn't
happen with a well-behaved host, and it's unclear how to safely recover
the page - so leaking it is the safest option. And the WARN() approach
gives the user the choice of making this fatal via panic_on_warn.

Thanks,
Steve
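For concreteness, a sketch of the allocation side with the WARN-and-leak behaviour folded in (illustrative only; the exact shape of the error path was still being settled at this point in the thread):

	static struct page *its_alloc_pages_node(int node, gfp_t gfp,
						 unsigned int order)
	{
		struct page *page;
		int ret;

		page = alloc_pages_node(node, gfp, order);
		if (!page)
			return NULL;

		ret = set_memory_decrypted((unsigned long)page_address(page),
					   1 << order);
		/*
		 * If set_memory_decrypted() failed, the encrypted/decrypted
		 * state of the pages is unknown: they may or may not be
		 * visible to the host, so they cannot safely be freed or
		 * reused. Deliberately leak them; per the x86 behaviour
		 * described above, the WARN comes from
		 * set_memory_decrypted() itself.
		 */
		if (ret)
			return NULL;

		return page;
	}

The free path needs the same treatment: if set_memory_encrypted() fails in its_free_pages(), leak the pages rather than calling free_pages() on memory in an indeterminate state.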
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 40ebf1726393..ca72f830f4cc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -18,6 +18,7 @@
 #include <linux/irqdomain.h>
 #include <linux/list.h>
 #include <linux/log2.h>
+#include <linux/mem_encrypt.h>
 #include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/msi.h>
@@ -27,6 +28,7 @@
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/percpu.h>
+#include <linux/set_memory.h>
 #include <linux/slab.h>
 #include <linux/syscore_ops.h>
 
@@ -163,6 +165,7 @@ struct its_device {
 	struct its_node		*its;
 	struct event_lpi_map	event_map;
 	void			*itt;
+	u32			itt_order;
 	u32			nr_ites;
 	u32			device_id;
 	bool			shared;
@@ -198,6 +201,30 @@ static DEFINE_IDA(its_vpeid_ida);
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
 #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
 
+static struct page *its_alloc_pages_node(int node, gfp_t gfp,
+					 unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp, order);
+
+	if (page)
+		set_memory_decrypted((unsigned long)page_address(page),
+				     1 << order);
+	return page;
+}
+
+static struct page *its_alloc_pages(gfp_t gfp, unsigned int order)
+{
+	return its_alloc_pages_node(NUMA_NO_NODE, gfp, order);
+}
+
+static void its_free_pages(void *addr, unsigned int order)
+{
+	set_memory_encrypted((unsigned long)addr, 1 << order);
+	free_pages((unsigned long)addr, order);
+}
+
 /*
  * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we
  * always have vSGIs mapped.
@@ -2212,7 +2239,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 {
 	struct page *prop_page;
 
-	prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ));
+	prop_page = its_alloc_pages(gfp_flags,
+				    get_order(LPI_PROPBASE_SZ));
 	if (!prop_page)
 		return NULL;
 
@@ -2223,8 +2251,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 
 static void its_free_prop_table(struct page *prop_page)
 {
-	free_pages((unsigned long)page_address(prop_page),
-		   get_order(LPI_PROPBASE_SZ));
+	its_free_pages(page_address(prop_page),
+		       get_order(LPI_PROPBASE_SZ));
 }
 
 static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
@@ -2346,7 +2374,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		order = get_order(GITS_BASER_PAGES_MAX * psz);
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO, order);
 	if (!page)
 		return -ENOMEM;
 
@@ -2359,7 +2388,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		/* 52bit PA is supported only when PageSize=64K */
 		if (psz != SZ_64K) {
 			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
-			free_pages((unsigned long)base, order);
+			its_free_pages(base, order);
 			return -ENXIO;
 		}
 
@@ -2415,7 +2444,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
 		       &its->phys_base, its_base_type_string[type],
 		       val, tmp);
-		free_pages((unsigned long)base, order);
+		its_free_pages(base, order);
 		return -ENXIO;
 	}
 
@@ -2554,8 +2583,8 @@ static void its_free_tables(struct its_node *its)
 
 	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
 		if (its->tables[i].base) {
-			free_pages((unsigned long)its->tables[i].base,
-				   its->tables[i].order);
+			its_free_pages(its->tables[i].base,
+				       its->tables[i].order);
 			its->tables[i].base = NULL;
 		}
 	}
@@ -2821,7 +2850,8 @@ static bool allocate_vpe_l2_table(int cpu, u32 id)
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz));
+		page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO,
+				       get_order(psz));
 		if (!page)
 			return false;
 
@@ -2940,7 +2970,8 @@ static int allocate_vpe_l1_table(void)
 
 	pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n",
 		 np, npg, psz, epp, esz);
-	page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE));
+	page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO,
+			       get_order(np * PAGE_SIZE));
 	if (!page)
 		return -ENOMEM;
 
@@ -2986,8 +3017,8 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 {
 	struct page *pend_page;
 
-	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
-				get_order(LPI_PENDBASE_SZ));
+	pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO,
+				    get_order(LPI_PENDBASE_SZ));
 	if (!pend_page)
 		return NULL;
 
@@ -2999,7 +3030,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 
 static void its_free_pending_table(struct page *pt)
 {
-	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
+	its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ));
 }
 
 /*
@@ -3334,8 +3365,9 @@ static bool its_alloc_table_entry(struct its_node *its,
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-					get_order(baser->psz));
+		page = its_alloc_pages_node(its->numa_node,
+					    GFP_KERNEL | __GFP_ZERO,
+					    get_order(baser->psz));
 		if (!page)
 			return false;
 
@@ -3418,7 +3450,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	unsigned long *lpi_map = NULL;
 	unsigned long flags;
 	u16 *col_map = NULL;
+	struct page *page;
 	void *itt;
+	int itt_order;
 	int lpi_base;
 	int nr_lpis;
 	int nr_ites;
@@ -3430,7 +3464,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	if (WARN_ON(!is_power_of_2(nvecs)))
 		nvecs = roundup_pow_of_two(nvecs);
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	/*
 	 * Even if the device wants a single LPI, the ITT must be
 	 * sized as a power of two (and you need at least one bit...).
@@ -3438,7 +3471,16 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	nr_ites = max(2, nvecs);
 	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
+	itt_order = get_order(sz);
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO,
+				    itt_order);
+	if (!page)
+		return NULL;
+	itt = page_address(page);
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
 	if (alloc_lpis) {
 		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
 		if (lpi_map)
@@ -3450,9 +3492,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 		lpi_base = 0;
 	}
 
-	if (!dev || !itt ||  !col_map || (!lpi_map && alloc_lpis)) {
+	if (!dev || !col_map || (!lpi_map && alloc_lpis)) {
 		kfree(dev);
-		kfree(itt);
+		its_free_pages(itt, itt_order);
 		bitmap_free(lpi_map);
 		kfree(col_map);
 		return NULL;
@@ -3462,6 +3504,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 
 	dev->its = its;
 	dev->itt = itt;
+	dev->itt_order = itt_order;
 	dev->nr_ites = nr_ites;
 	dev->event_map.lpi_map = lpi_map;
 	dev->event_map.col_map = col_map;
@@ -3489,7 +3532,7 @@ static void its_free_device(struct its_device *its_dev)
 	list_del(&its_dev->entry);
 	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
 	kfree(its_dev->event_map.col_map);
-	kfree(its_dev->itt);
+	its_free_pages(its_dev->itt, its_dev->itt_order);
 	kfree(its_dev);
 }
 
@@ -5131,8 +5174,9 @@ static int __init its_probe_one(struct its_node *its)
 		}
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-				get_order(ITS_CMD_QUEUE_SZ));
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO,
+				    get_order(ITS_CMD_QUEUE_SZ));
 	if (!page) {
 		err = -ENOMEM;
 		goto out_unmap_sgir;
@@ -5196,7 +5240,7 @@ static int __init its_probe_one(struct its_node *its)
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
-	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
+	its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
 out_unmap_sgir:
 	if (its->sgir_base)
 		iounmap(its->sgir_base);
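To put a number on the ITT change in the its_create_device() hunk above - a back-of-the-envelope sketch assuming an 8-byte ITT entry (GITS_TYPER.ITT_entry_size is implementation defined) and 4K pages:

	#include <stdio.h>

	/* Reproduces the ITT sizing arithmetic from its_create_device()
	 * for a single-MSI device. */
	int main(void)
	{
		unsigned int nvecs = 1;                   /* one MSI */
		unsigned int nr_ites = nvecs < 2 ? 2 : nvecs;
		unsigned int esz = 8;                     /* assumed entry size */
		unsigned int sz = nr_ites * esz;          /* 16 bytes of ITT */

		if (sz < 256)                             /* ITS_ITT_ALIGN */
			sz = 256;
		sz += 256 - 1;                            /* alignment slack: 511 */

		/* get_order(511) == 0: one whole 4096-byte page per device,
		 * where kzalloc_node(511, ...) used a kmalloc-512 object. */
		printf("ITT size %u -> one order-0 page (4096 bytes)\n", sz);
		return 0;
	}

Multiplied across every MSI-capable device, this roughly 8x per-device growth is what the sub-page packing ideas earlier in the thread (a kmem_cache-style shared allocator, or the kmalloc-decrypted-* caches) aim to claw back.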