Message ID | 1498405569-463-1-git-send-email-shankerd@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Shanker, On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni <shankerd@codeaurora.org> wrote: > The NUMA node information is visible to ITS driver but not being used > other than handling errata. This patch allocates the memory for ITS > tables from the corresponding NUMA node using the appropriate NUMA > aware functions. > > Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> > --- > drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- > 1 file changed, 20 insertions(+), 16 deletions(-) > > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c > index fed99c5..e45add2 100644 > --- a/drivers/irqchip/irq-gic-v3-its.c > +++ b/drivers/irqchip/irq-gic-v3-its.c > @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > u64 val = its_read_baser(its, baser); > u64 esz = GITS_BASER_ENTRY_SIZE(val); > u64 type = GITS_BASER_TYPE(val); > + struct page *page; > u32 alloc_pages; > - void *base; > u64 tmp; > > retry_alloc_baser: > @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > order = get_order(GITS_BASER_PAGES_MAX * psz); > } > > - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); > - if (!base) > + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); the changes would have been minimal, if you converted page to base address after allocation here itself? page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); base = page_address(page); > + if (!page) > return -ENOMEM; > > retry_baser: > - val = (virt_to_phys(base) | > + val = (page_to_phys(page) no change required. 
> (type << GITS_BASER_TYPE_SHIFT) | > ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | > ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | > @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > shr = tmp & GITS_BASER_SHAREABILITY_MASK; > if (!shr) { > cache = GITS_BASER_nC; > - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); > + gic_flush_dcache_to_poc(page_to_virt(page), > + PAGE_ORDER_TO_SIZE(order)); no change needed, if base is used still. > } > goto retry_baser; > } > @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > * size and retry. If we reach 4K, then > * something is horribly wrong... > */ > - free_pages((unsigned long)base, order); > + __free_pages(page, order); no change required. > baser->base = NULL; > > switch (psz) { > @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, > pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", > &its->phys_base, its_base_type_string[type], > val, tmp); > - free_pages((unsigned long)base, order); > + __free_pages(page, order); no change required. > return -ENXIO; > } > > baser->order = order; > - baser->base = base; > + baser->base = page_to_virt(page); no change required. > baser->psz = psz; > tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; > > pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", > &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), > its_base_type_string[type], > - (unsigned long)virt_to_phys(base), > + (unsigned long)page_to_phys(page), no change required. > indirect ? 
"indirect" : "flat", (int)esz, > psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); > > @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) > > for (i = 0; i < GITS_BASER_NR_REGS; i++) { > if (its->tables[i].base) { > - free_pages((unsigned long)its->tables[i].base, > + __free_pages(virt_to_page(its->tables[i].base), ditto > its->tables[i].order); > its->tables[i].base = NULL; > } > @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) > > /* Allocate memory for 2nd level table */ > if (!table[idx]) { > - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); > + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > + get_order(baser->psz)); > if (!page) > return false; > > @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, > nr_ites = max(2UL, roundup_pow_of_two(nvecs)); > sz = nr_ites * its->ite_size; > sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; > - itt = kzalloc(sz, GFP_KERNEL); > + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); > lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); > if (lpi_map) > col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); > @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res, > { > struct its_node *its; > void __iomem *its_base; > + struct page *page; > u32 val; > u64 baser, tmp; > int err; > @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, > its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; > its->numa_node = numa_node; > > - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, > - get_order(ITS_CMD_QUEUE_SZ)); > - if (!its->cmd_base) { > + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, > + get_order(ITS_CMD_QUEUE_SZ)); > + if (!page) { > err = -ENOMEM; > goto out_free_its; > } > + its->cmd_base = page_to_virt(page); same here, this would have been, its->cmd_base = 
page_address(page); > its->cmd_write = its->cmd_base; > > its_enable_quirks(its); > @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, > out_free_tables: > its_free_tables(its); > out_free_cmd: > - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); > + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); no change required. > out_free_its: > kfree(its); > out_unmap: > -- > Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. > thanks Ganapat
On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni <gpkulkarni@gmail.com> wrote: > Hi Shanker, > > On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni > <shankerd@codeaurora.org> wrote: >> The NUMA node information is visible to ITS driver but not being used >> other than handling errata. This patch allocates the memory for ITS >> tables from the corresponding NUMA node using the appropriate NUMA >> aware functions. IMHO, the description would have been more constructive? "All ITS tables are mapped by default to NODE 0 memory. Adding changes to allocate memory from respective NUMA NODES of ITS devices. This will optimize tables access and avoids unnecessary inter-node traffic." >> >> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> >> --- >> drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- >> 1 file changed, 20 insertions(+), 16 deletions(-) >> >> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c >> index fed99c5..e45add2 100644 >> --- a/drivers/irqchip/irq-gic-v3-its.c >> +++ b/drivers/irqchip/irq-gic-v3-its.c >> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> u64 val = its_read_baser(its, baser); >> u64 esz = GITS_BASER_ENTRY_SIZE(val); >> u64 type = GITS_BASER_TYPE(val); >> + struct page *page; >> u32 alloc_pages; >> - void *base; >> u64 tmp; >> >> retry_alloc_baser: >> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> order = get_order(GITS_BASER_PAGES_MAX * psz); >> } >> >> - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); >> - if (!base) >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > > the changes would have been minimal, if you converted page to base > address after allocation here itself? 
> > page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > base = page_address(page); > >> + if (!page) >> return -ENOMEM; >> >> retry_baser: >> - val = (virt_to_phys(base) | >> + val = (page_to_phys(page) > > no change required. > >> (type << GITS_BASER_TYPE_SHIFT) | >> ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | >> ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | >> @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> shr = tmp & GITS_BASER_SHAREABILITY_MASK; >> if (!shr) { >> cache = GITS_BASER_nC; >> - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); >> + gic_flush_dcache_to_poc(page_to_virt(page), >> + PAGE_ORDER_TO_SIZE(order)); > > no change needed, if base is used still. >> } >> goto retry_baser; >> } >> @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> * size and retry. If we reach 4K, then >> * something is horribly wrong... >> */ >> - free_pages((unsigned long)base, order); >> + __free_pages(page, order); > > no change required. >> baser->base = NULL; >> >> switch (psz) { >> @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", >> &its->phys_base, its_base_type_string[type], >> val, tmp); >> - free_pages((unsigned long)base, order); >> + __free_pages(page, order); > > no change required. > >> return -ENXIO; >> } >> >> baser->order = order; >> - baser->base = base; >> + baser->base = page_to_virt(page); > > no change required. >> baser->psz = psz; >> tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; >> >> pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", >> &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), >> its_base_type_string[type], >> - (unsigned long)virt_to_phys(base), >> + (unsigned long)page_to_phys(page), > > no change required. >> indirect ? 
"indirect" : "flat", (int)esz, >> psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); >> >> @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) >> >> for (i = 0; i < GITS_BASER_NR_REGS; i++) { >> if (its->tables[i].base) { >> - free_pages((unsigned long)its->tables[i].base, >> + __free_pages(virt_to_page(its->tables[i].base), > > ditto >> its->tables[i].order); >> its->tables[i].base = NULL; >> } >> @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) >> >> /* Allocate memory for 2nd level table */ >> if (!table[idx]) { >> - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >> + get_order(baser->psz)); >> if (!page) >> return false; >> >> @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >> nr_ites = max(2UL, roundup_pow_of_two(nvecs)); >> sz = nr_ites * its->ite_size; >> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; >> - itt = kzalloc(sz, GFP_KERNEL); >> + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); >> lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); >> if (lpi_map) >> col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); >> @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res, >> { >> struct its_node *its; >> void __iomem *its_base; >> + struct page *page; >> u32 val; >> u64 baser, tmp; >> int err; >> @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, >> its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; >> its->numa_node = numa_node; >> >> - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, >> - get_order(ITS_CMD_QUEUE_SZ)); >> - if (!its->cmd_base) { >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >> + get_order(ITS_CMD_QUEUE_SZ)); >> + if (!page) { >> err = -ENOMEM; >> goto out_free_its; >> } >> + its->cmd_base = 
page_to_virt(page); > > same here, this would have been, > its->cmd_base = page_address(page); > >> its->cmd_write = its->cmd_base; >> >> its_enable_quirks(its); >> @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, >> out_free_tables: >> its_free_tables(its); >> out_free_cmd: >> - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); >> + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); > > no change required. >> out_free_its: >> kfree(its); >> out_unmap: >> -- >> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc. >> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. >> > > thanks > Ganapat thanks Ganapat
On 30/06/17 04:01, Ganapatrao Kulkarni wrote: > On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni > <gpkulkarni@gmail.com> wrote: >> Hi Shanker, >> >> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >> <shankerd@codeaurora.org> wrote: >>> The NUMA node information is visible to ITS driver but not being used >>> other than handling errata. This patch allocates the memory for ITS >>> tables from the corresponding NUMA node using the appropriate NUMA >>> aware functions. > > IMHO, the description would have been more constructive? > > "All ITS tables are mapped by default to NODE 0 memory. > Adding changes to allocate memory from respective NUMA NODES of ITS devices. > This will optimize tables access and avoids unnecessary inter-node traffic." But more importantly, I'd like to see figures showing the actual benefit of this per-node allocation. Given that both of you guys have access to such platforms, please show me the numbers! Thanks, M.
Hi Ganapatrao, On 06/29/2017 09:34 PM, Ganapatrao Kulkarni wrote: > Hi Shanker, > > On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni > <shankerd@codeaurora.org> wrote: >> The NUMA node information is visible to ITS driver but not being used >> other than handling errata. This patch allocates the memory for ITS >> tables from the corresponding NUMA node using the appropriate NUMA >> aware functions. >> >> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> >> --- >> drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- >> 1 file changed, 20 insertions(+), 16 deletions(-) >> >> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c >> index fed99c5..e45add2 100644 >> --- a/drivers/irqchip/irq-gic-v3-its.c >> +++ b/drivers/irqchip/irq-gic-v3-its.c >> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> u64 val = its_read_baser(its, baser); >> u64 esz = GITS_BASER_ENTRY_SIZE(val); >> u64 type = GITS_BASER_TYPE(val); >> + struct page *page; >> u32 alloc_pages; >> - void *base; >> u64 tmp; >> >> retry_alloc_baser: >> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> order = get_order(GITS_BASER_PAGES_MAX * psz); >> } >> >> - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); >> - if (!base) >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > > the changes would have been minimal, if you converted page to base > address after allocation here itself? > Yes, but it doesn't make any sense to keep the two variables to server the same purpose and covert the page pointer to (unsigned long) when freeing it. > page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); > base = page_address(page); > >> + if (!page) >> return -ENOMEM; >> >> retry_baser: >> - val = (virt_to_phys(base) | >> + val = (page_to_phys(page) > > no change required. 
> >> (type << GITS_BASER_TYPE_SHIFT) | >> ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | >> ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | >> @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> shr = tmp & GITS_BASER_SHAREABILITY_MASK; >> if (!shr) { >> cache = GITS_BASER_nC; >> - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); >> + gic_flush_dcache_to_poc(page_to_virt(page), >> + PAGE_ORDER_TO_SIZE(order)); > > no change needed, if base is used still. >> } >> goto retry_baser; >> } >> @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> * size and retry. If we reach 4K, then >> * something is horribly wrong... >> */ >> - free_pages((unsigned long)base, order); >> + __free_pages(page, order); > > no change required. >> baser->base = NULL; >> >> switch (psz) { >> @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >> pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", >> &its->phys_base, its_base_type_string[type], >> val, tmp); >> - free_pages((unsigned long)base, order); >> + __free_pages(page, order); > > no change required. > >> return -ENXIO; >> } >> >> baser->order = order; >> - baser->base = base; >> + baser->base = page_to_virt(page); > > no change required. >> baser->psz = psz; >> tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; >> >> pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", >> &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), >> its_base_type_string[type], >> - (unsigned long)virt_to_phys(base), >> + (unsigned long)page_to_phys(page), > > no change required. >> indirect ? 
"indirect" : "flat", (int)esz, >> psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); >> >> @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) >> >> for (i = 0; i < GITS_BASER_NR_REGS; i++) { >> if (its->tables[i].base) { >> - free_pages((unsigned long)its->tables[i].base, >> + __free_pages(virt_to_page(its->tables[i].base), > > ditto >> its->tables[i].order); >> its->tables[i].base = NULL; >> } >> @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) >> >> /* Allocate memory for 2nd level table */ >> if (!table[idx]) { >> - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >> + get_order(baser->psz)); >> if (!page) >> return false; >> >> @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >> nr_ites = max(2UL, roundup_pow_of_two(nvecs)); >> sz = nr_ites * its->ite_size; >> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; >> - itt = kzalloc(sz, GFP_KERNEL); >> + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); >> lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); >> if (lpi_map) >> col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); >> @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res, >> { >> struct its_node *its; >> void __iomem *its_base; >> + struct page *page; >> u32 val; >> u64 baser, tmp; >> int err; >> @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, >> its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; >> its->numa_node = numa_node; >> >> - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, >> - get_order(ITS_CMD_QUEUE_SZ)); >> - if (!its->cmd_base) { >> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >> + get_order(ITS_CMD_QUEUE_SZ)); >> + if (!page) { >> err = -ENOMEM; >> goto out_free_its; >> } >> + its->cmd_base = 
page_to_virt(page); > > same here, this would have been, > its->cmd_base = page_address(page); > >> its->cmd_write = its->cmd_base; >> >> its_enable_quirks(its); >> @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, >> out_free_tables: >> its_free_tables(its); >> out_free_cmd: >> - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); >> + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); > > no change required. >> out_free_its: >> kfree(its); >> out_unmap: >> -- >> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc. >> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. >> > > thanks > Ganapat > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
Hi Ganapatrao, On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote: > On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni > <gpkulkarni@gmail.com> wrote: >> Hi Shanker, >> >> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >> <shankerd@codeaurora.org> wrote: >>> The NUMA node information is visible to ITS driver but not being used >>> other than handling errata. This patch allocates the memory for ITS >>> tables from the corresponding NUMA node using the appropriate NUMA >>> aware functions. > > IMHO, the description would have been more constructive? > > "All ITS tables are mapped by default to NODE 0 memory. > Adding changes to allocate memory from respective NUMA NODES of ITS devices. > This will optimize tables access and avoids unnecessary inter-node traffic." > I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'? Kernel runtime memory allocation comes from the corresponding NUMA node of the CPU that is being called. Please explain if you don't agree on this. 
>>> >>> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> >>> --- >>> drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- >>> 1 file changed, 20 insertions(+), 16 deletions(-) >>> >>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c >>> index fed99c5..e45add2 100644 >>> --- a/drivers/irqchip/irq-gic-v3-its.c >>> +++ b/drivers/irqchip/irq-gic-v3-its.c >>> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>> u64 val = its_read_baser(its, baser); >>> u64 esz = GITS_BASER_ENTRY_SIZE(val); >>> u64 type = GITS_BASER_TYPE(val); >>> + struct page *page; >>> u32 alloc_pages; >>> - void *base; >>> u64 tmp; >>> >>> retry_alloc_baser: >>> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>> order = get_order(GITS_BASER_PAGES_MAX * psz); >>> } >>> >>> - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); >>> - if (!base) >>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); >> >> the changes would have been minimal, if you converted page to base >> address after allocation here itself? >> >> page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); >> base = page_address(page); >> >>> + if (!page) >>> return -ENOMEM; >>> >>> retry_baser: >>> - val = (virt_to_phys(base) | >>> + val = (page_to_phys(page) >> >> no change required. >> >>> (type << GITS_BASER_TYPE_SHIFT) | >>> ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | >>> ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | >>> @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>> shr = tmp & GITS_BASER_SHAREABILITY_MASK; >>> if (!shr) { >>> cache = GITS_BASER_nC; >>> - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); >>> + gic_flush_dcache_to_poc(page_to_virt(page), >>> + PAGE_ORDER_TO_SIZE(order)); >> >> no change needed, if base is used still. 
>>> } >>> goto retry_baser; >>> } >>> @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>> * size and retry. If we reach 4K, then >>> * something is horribly wrong... >>> */ >>> - free_pages((unsigned long)base, order); >>> + __free_pages(page, order); >> >> no change required. >>> baser->base = NULL; >>> >>> switch (psz) { >>> @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>> pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", >>> &its->phys_base, its_base_type_string[type], >>> val, tmp); >>> - free_pages((unsigned long)base, order); >>> + __free_pages(page, order); >> >> no change required. >> >>> return -ENXIO; >>> } >>> >>> baser->order = order; >>> - baser->base = base; >>> + baser->base = page_to_virt(page); >> >> no change required. >>> baser->psz = psz; >>> tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; >>> >>> pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", >>> &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), >>> its_base_type_string[type], >>> - (unsigned long)virt_to_phys(base), >>> + (unsigned long)page_to_phys(page), >> >> no change required. >>> indirect ? 
"indirect" : "flat", (int)esz, >>> psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); >>> >>> @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) >>> >>> for (i = 0; i < GITS_BASER_NR_REGS; i++) { >>> if (its->tables[i].base) { >>> - free_pages((unsigned long)its->tables[i].base, >>> + __free_pages(virt_to_page(its->tables[i].base), >> >> ditto >>> its->tables[i].order); >>> its->tables[i].base = NULL; >>> } >>> @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) >>> >>> /* Allocate memory for 2nd level table */ >>> if (!table[idx]) { >>> - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); >>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >>> + get_order(baser->psz)); >>> if (!page) >>> return false; >>> >>> @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >>> nr_ites = max(2UL, roundup_pow_of_two(nvecs)); >>> sz = nr_ites * its->ite_size; >>> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; >>> - itt = kzalloc(sz, GFP_KERNEL); >>> + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); >>> lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); >>> if (lpi_map) >>> col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); >>> @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res, >>> { >>> struct its_node *its; >>> void __iomem *its_base; >>> + struct page *page; >>> u32 val; >>> u64 baser, tmp; >>> int err; >>> @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, >>> its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; >>> its->numa_node = numa_node; >>> >>> - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, >>> - get_order(ITS_CMD_QUEUE_SZ)); >>> - if (!its->cmd_base) { >>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >>> + get_order(ITS_CMD_QUEUE_SZ)); >>> + if (!page) { >>> err = -ENOMEM; >>> goto 
out_free_its; >>> } >>> + its->cmd_base = page_to_virt(page); >> >> same here, this would have been, >> its->cmd_base = page_address(page); >> >>> its->cmd_write = its->cmd_base; >>> >>> its_enable_quirks(its); >>> @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, >>> out_free_tables: >>> its_free_tables(its); >>> out_free_cmd: >>> - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); >>> + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); >> >> no change required. >>> out_free_its: >>> kfree(its); >>> out_unmap: >>> -- >>> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc. >>> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. >>> >> >> thanks >> Ganapat > > thanks > Ganapat > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
Hi Marc, On 06/30/2017 03:51 AM, Marc Zyngier wrote: > On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >> <gpkulkarni@gmail.com> wrote: >>> Hi Shanker, >>> >>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>> <shankerd@codeaurora.org> wrote: >>>> The NUMA node information is visible to ITS driver but not being used >>>> other than handling errata. This patch allocates the memory for ITS >>>> tables from the corresponding NUMA node using the appropriate NUMA >>>> aware functions. >> >> IMHO, the description would have been more constructive? >> >> "All ITS tables are mapped by default to NODE 0 memory. >> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >> This will optimize tables access and avoids unnecessary inter-node traffic." > > But more importantly, I'd like to see figures showing the actual benefit > of this per-node allocation. Given that both of you guys have access to > such platforms, please show me the numbers! > I'll share the actual results which show the improvement whenever available on our next chips. The current version of Qualcomm qdf2400 doesn't support a multi-socket configuration to capture results and share with you. Do you see any other issues with this patch apart from the performance improvements? I strongly believe this brings a noticeable improvement in numbers on systems that have a multi-node memory/CPU configuration. > Thanks, > > M. >
Hi Shanker, On 03/07/17 15:24, Shanker Donthineni wrote: > Hi Marc, > > On 06/30/2017 03:51 AM, Marc Zyngier wrote: >> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>> <gpkulkarni@gmail.com> wrote: >>>> Hi Shanker, >>>> >>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>> <shankerd@codeaurora.org> wrote: >>>>> The NUMA node information is visible to ITS driver but not being used >>>>> other than handling errata. This patch allocates the memory for ITS >>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>> aware functions. >>> >>> IMHO, the description would have been more constructive? >>> >>> "All ITS tables are mapped by default to NODE 0 memory. >>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>> This will optimize tables access and avoids unnecessary inter-node traffic." >> >> But more importantly, I'd like to see figures showing the actual benefit >> of this per-node allocation. Given that both of you guys have access to >> such platforms, please show me the numbers! >> > > I'll share the actual results which shows the improvement whenever > available on our next chips. Current version of Qualcomm qdf2400 doesn't > support multi socket configuration to capture results and share with you. > > Do you see any other issues with this patch apart from the performance > improvements. I strongly believe this brings the noticeable improvement > in numbers on systems where it has multi node memory/CPU configuration. I agree that it *could* show an improvement, but it very much depends on how often the ITS misses in its caches. For this kind of patches, I want to see two things: 1) It brings a measurable benefit on NUMA platforms 2) it doesn't adversely impact non-NUMA systems I can deal with (2), but I have no way of evaluating (1), mostly for the lack of an infrastructure exercising multiple ITSs at the same time. Thanks, M.
Hi Marc, On 07/03/2017 09:53 AM, Marc Zyngier wrote: > Hi Shanker, > > On 03/07/17 15:24, Shanker Donthineni wrote: >> Hi Marc, >> >> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>> <gpkulkarni@gmail.com> wrote: >>>>> Hi Shanker, >>>>> >>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>> <shankerd@codeaurora.org> wrote: >>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>> aware functions. >>>> >>>> IMHO, the description would have been more constructive? >>>> >>>> "All ITS tables are mapped by default to NODE 0 memory. >>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>> >>> But more importantly, I'd like to see figures showing the actual benefit >>> of this per-node allocation. Given that both of you guys have access to >>> such platforms, please show me the numbers! >>> >> >> I'll share the actual results which shows the improvement whenever >> available on our next chips. Current version of Qualcomm qdf2400 doesn't >> support multi socket configuration to capture results and share with you. >> >> Do you see any other issues with this patch apart from the performance >> improvements. I strongly believe this brings the noticeable improvement >> in numbers on systems where it has multi node memory/CPU configuration. > > I agree that it *could* show an improvement, but it very much depends on > how often the ITS misses in its caches. For this kind of patches, I want > to see two things: > Just imagine systems with hundreds of PCI-SRIOV virtual functions and assigning some of them to virtual machines, and systems with GICv4 feature. 
There should be a lot of cache misses on ITS VCPU, DEVICE and COLLECTION lookups. And also the VLPI patches that you've posted for comments are forcing the use of the VLPI feature for each VM irrespective of pass-through device assignment. > 1) It brings a measurable benefit on NUMA platforms > 2) it doesn't adversely impact non-NUMA systems > It should not affect the ITS hardware behavior on non-NUMA based systems since software always allocates memory from a single (default) NUMA node. > I can deal with (2), but I have no way of evaluating (1), mostly for the > lack of an infrastructure exercising multiple ITSs at the same time. > Agree with you, but it takes some time for me to provide the test results. > Thanks, > > M. >
On Mon, Jul 3, 2017 at 7:07 PM, Shanker Donthineni <shankerd@codeaurora.org> wrote: > Hi Ganapatraoo, > > On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote: >> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >> <gpkulkarni@gmail.com> wrote: >>> Hi Shanker, >>> >>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>> <shankerd@codeaurora.org> wrote: >>>> The NUMA node information is visible to ITS driver but not being used >>>> other than handling errata. This patch allocates the memory for ITS >>>> tables from the corresponding NUMA node using the appropriate NUMA >>>> aware functions. >> >> IMHO, the description would have been more constructive? >> >> "All ITS tables are mapped by default to NODE 0 memory. >> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >> This will optimize tables access and avoids unnecessary inter-node traffic." >> > > I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'? Kernel runtime memory allocation comes from the corresponding NUMA node of the CPU that being called. IIRC, ITT tables can be on any cpu, if drivers are not built-in and msi-x vectors are not initialised in init/probe, etc however Device table and Collection table are allocated from node 0 memory during ITS driver init by boot-cpu/cpu0. > > Please explain if you don't agree on this. 
> >>>> >>>> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> >>>> --- >>>> drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- >>>> 1 file changed, 20 insertions(+), 16 deletions(-) >>>> >>>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c >>>> index fed99c5..e45add2 100644 >>>> --- a/drivers/irqchip/irq-gic-v3-its.c >>>> +++ b/drivers/irqchip/irq-gic-v3-its.c >>>> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>>> u64 val = its_read_baser(its, baser); >>>> u64 esz = GITS_BASER_ENTRY_SIZE(val); >>>> u64 type = GITS_BASER_TYPE(val); >>>> + struct page *page; >>>> u32 alloc_pages; >>>> - void *base; >>>> u64 tmp; >>>> >>>> retry_alloc_baser: >>>> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>>> order = get_order(GITS_BASER_PAGES_MAX * psz); >>>> } >>>> >>>> - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); >>>> - if (!base) >>>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); >>> >>> the changes would have been minimal, if you converted page to base >>> address after allocation here itself? >>> >>> page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); >>> base = page_address(page); >>> >>>> + if (!page) >>>> return -ENOMEM; >>>> >>>> retry_baser: >>>> - val = (virt_to_phys(base) | >>>> + val = (page_to_phys(page) >>> >>> no change required. 
>>> >>>> (type << GITS_BASER_TYPE_SHIFT) | >>>> ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | >>>> ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | >>>> @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>>> shr = tmp & GITS_BASER_SHAREABILITY_MASK; >>>> if (!shr) { >>>> cache = GITS_BASER_nC; >>>> - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); >>>> + gic_flush_dcache_to_poc(page_to_virt(page), >>>> + PAGE_ORDER_TO_SIZE(order)); >>> >>> no change needed, if base is used still. >>>> } >>>> goto retry_baser; >>>> } >>>> @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>>> * size and retry. If we reach 4K, then >>>> * something is horribly wrong... >>>> */ >>>> - free_pages((unsigned long)base, order); >>>> + __free_pages(page, order); >>> >>> no change required. >>>> baser->base = NULL; >>>> >>>> switch (psz) { >>>> @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, >>>> pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", >>>> &its->phys_base, its_base_type_string[type], >>>> val, tmp); >>>> - free_pages((unsigned long)base, order); >>>> + __free_pages(page, order); >>> >>> no change required. >>> >>>> return -ENXIO; >>>> } >>>> >>>> baser->order = order; >>>> - baser->base = base; >>>> + baser->base = page_to_virt(page); >>> >>> no change required. >>>> baser->psz = psz; >>>> tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; >>>> >>>> pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", >>>> &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), >>>> its_base_type_string[type], >>>> - (unsigned long)virt_to_phys(base), >>>> + (unsigned long)page_to_phys(page), >>> >>> no change required. >>>> indirect ? 
"indirect" : "flat", (int)esz, >>>> psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); >>>> >>>> @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) >>>> >>>> for (i = 0; i < GITS_BASER_NR_REGS; i++) { >>>> if (its->tables[i].base) { >>>> - free_pages((unsigned long)its->tables[i].base, >>>> + __free_pages(virt_to_page(its->tables[i].base), >>> >>> ditto >>>> its->tables[i].order); >>>> its->tables[i].base = NULL; >>>> } >>>> @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) >>>> >>>> /* Allocate memory for 2nd level table */ >>>> if (!table[idx]) { >>>> - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); >>>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >>>> + get_order(baser->psz)); >>>> if (!page) >>>> return false; >>>> >>>> @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, >>>> nr_ites = max(2UL, roundup_pow_of_two(nvecs)); >>>> sz = nr_ites * its->ite_size; >>>> sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; >>>> - itt = kzalloc(sz, GFP_KERNEL); >>>> + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); >>>> lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); >>>> if (lpi_map) >>>> col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); >>>> @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res, >>>> { >>>> struct its_node *its; >>>> void __iomem *its_base; >>>> + struct page *page; >>>> u32 val; >>>> u64 baser, tmp; >>>> int err; >>>> @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, >>>> its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; >>>> its->numa_node = numa_node; >>>> >>>> - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, >>>> - get_order(ITS_CMD_QUEUE_SZ)); >>>> - if (!its->cmd_base) { >>>> + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, >>>> + get_order(ITS_CMD_QUEUE_SZ)); 
>>>> + if (!page) { >>>> err = -ENOMEM; >>>> goto out_free_its; >>>> } >>>> + its->cmd_base = page_to_virt(page); >>> >>> same here, this would have been, >>> its->cmd_base = page_address(page); >>> >>>> its->cmd_write = its->cmd_base; >>>> >>>> its_enable_quirks(its); >>>> @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, >>>> out_free_tables: >>>> its_free_tables(its); >>>> out_free_cmd: >>>> - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); >>>> + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); >>> >>> no change required. >>>> out_free_its: >>>> kfree(its); >>>> out_unmap: >>>> -- >>>> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc. >>>> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. >>>> >>> >>> thanks >>> Ganapat >> >> thanks >> Ganapat >> >> _______________________________________________ >> linux-arm-kernel mailing list >> linux-arm-kernel@lists.infradead.org >> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >> > > -- > Shanker Donthineni > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel thanks Ganapat
Hi Ganapatrao, On 07/03/2017 11:51 AM, Ganapatrao Kulkarni wrote: > On Mon, Jul 3, 2017 at 7:07 PM, Shanker Donthineni > <shankerd@codeaurora.org> wrote: >> Hi Ganapatraoo, >> >> On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote: >>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>> <gpkulkarni@gmail.com> wrote: >>>> Hi Shanker, >>>> >>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>> <shankerd@codeaurora.org> wrote: >>>>> The NUMA node information is visible to ITS driver but not being used >>>>> other than handling errata. This patch allocates the memory for ITS >>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>> aware functions. >>> >>> IMHO, the description would have been more constructive? >>> >>> "All ITS tables are mapped by default to NODE 0 memory. >>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>> This will optimize tables access and avoids unnecessary inter-node traffic." >>> >> >> I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'? Kernel runtime memory allocation comes from the corresponding NUMA node of the CPU that being called. > > IIRC, ITT tables can be on any cpu, if drivers are not built-in and > msi-x vectors are not initialised in init/probe, etc > however Device table and Collection table are allocated from node 0 > memory during ITS driver init by boot-cpu/cpu0. > Don't confuse boot-cpu aka logical cpu0 and NUMA node 0; there is no requirement that says the boot cpu has to be tied to NUMA node 0. The memory for the first level device table, prop and collection tables is allocated from the boot cpu's NUMA node, not NUMA node 0. All other memory allocation inside the ITS driver can be done from any NUMA node, depending on which CPU calls the memory allocation functions during runtime.
Hi Marc, On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > Hi Shanker, > > On 03/07/17 15:24, Shanker Donthineni wrote: >> Hi Marc, >> >> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>> <gpkulkarni@gmail.com> wrote: >>>>> Hi Shanker, >>>>> >>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>> <shankerd@codeaurora.org> wrote: >>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>> aware functions. >>>> >>>> IMHO, the description would have been more constructive? >>>> >>>> "All ITS tables are mapped by default to NODE 0 memory. >>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>> >>> But more importantly, I'd like to see figures showing the actual benefit >>> of this per-node allocation. Given that both of you guys have access to >>> such platforms, please show me the numbers! >>> >> >> I'll share the actual results which shows the improvement whenever >> available on our next chips. Current version of Qualcomm qdf2400 doesn't >> support multi socket configuration to capture results and share with you. >> >> Do you see any other issues with this patch apart from the performance >> improvements. I strongly believe this brings the noticeable improvement >> in numbers on systems where it has multi node memory/CPU configuration. > > I agree that it *could* show an improvement, but it very much depends on > how often the ITS misses in its caches. 
For this kind of patches, I want > to see two things: > > 1) It brings a measurable benefit on NUMA platforms Did some measurement of interrupt response time for LPIs and we don't see any major improvement due to caching of Tables. However, we have seen improvements of around 5%. IMO, we should merge this patch to have NUMA aware allocations and to avoid unwanted inter-node transactions. Tested-by: Ganapatrao Kulkarni <ganapatrao.kulkarni@cavium.com> > 2) it doesn't adversely impact non-NUMA systems AFAIK, no impact on non-NUMA and on single node NUMA systems. > > I can deal with (2), but I have no way of evaluating (1), mostly for the > lack of an infrastructure exercising multiple ITSs at the same time. > > Thanks, > > M. > -- > Jazz is not dead. It just smells funny... thanks Ganapat
On 10/07/17 09:48, Ganapatrao Kulkarni wrote: > Hi Marc, > > On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >> Hi Shanker, >> >> On 03/07/17 15:24, Shanker Donthineni wrote: >>> Hi Marc, >>> >>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>> <gpkulkarni@gmail.com> wrote: >>>>>> Hi Shanker, >>>>>> >>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>> <shankerd@codeaurora.org> wrote: >>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>> aware functions. >>>>> >>>>> IMHO, the description would have been more constructive? >>>>> >>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>> >>>> But more importantly, I'd like to see figures showing the actual benefit >>>> of this per-node allocation. Given that both of you guys have access to >>>> such platforms, please show me the numbers! >>>> >>> >>> I'll share the actual results which shows the improvement whenever >>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>> support multi socket configuration to capture results and share with you. >>> >>> Do you see any other issues with this patch apart from the performance >>> improvements. I strongly believe this brings the noticeable improvement >>> in numbers on systems where it has multi node memory/CPU configuration. >> >> I agree that it *could* show an improvement, but it very much depends on >> how often the ITS misses in its caches. 
For this kind of patches, I want >> to see two things: >> >> 1) It brings a measurable benefit on NUMA platforms > > Did some measurement of interrupt response time for LPIs and we don't > see any major > improvement due to caching of Tables. However, we have seen > improvements of around 5%. An improvement of what exactly? M.
On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >> Hi Marc, >> >> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>> Hi Shanker, >>> >>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>> Hi Marc, >>>> >>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>> Hi Shanker, >>>>>>> >>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>> aware functions. >>>>>> >>>>>> IMHO, the description would have been more constructive? >>>>>> >>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>> >>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>> of this per-node allocation. Given that both of you guys have access to >>>>> such platforms, please show me the numbers! >>>>> >>>> >>>> I'll share the actual results which shows the improvement whenever >>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>>> support multi socket configuration to capture results and share with you. >>>> >>>> Do you see any other issues with this patch apart from the performance >>>> improvements. I strongly believe this brings the noticeable improvement >>>> in numbers on systems where it has multi node memory/CPU configuration. 
>>> >>> I agree that it *could* show an improvement, but it very much depends on >>> how often the ITS misses in its caches. For this kind of patches, I want >>> to see two things: >>> >>> 1) It brings a measurable benefit on NUMA platforms >> >> Did some measurement of interrupt response time for LPIs and we don't >> see any major >> improvement due to caching of Tables. However, we have seen >> improvements of around 5%. > > An improvement of what exactly? interrupt response time. > > M. > -- > Jazz is not dead. It just smells funny... thanks Ganapat
On 10/07/17 10:08, Ganapatrao Kulkarni wrote: > On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>> Hi Marc, >>> >>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>> Hi Shanker, >>>> >>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>> Hi Marc, >>>>> >>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>> Hi Shanker, >>>>>>>> >>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>> aware functions. >>>>>>> >>>>>>> IMHO, the description would have been more constructive? >>>>>>> >>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>> >>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>> such platforms, please show me the numbers! >>>>>> >>>>> >>>>> I'll share the actual results which shows the improvement whenever >>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>>>> support multi socket configuration to capture results and share with you. >>>>> >>>>> Do you see any other issues with this patch apart from the performance >>>>> improvements. I strongly believe this brings the noticeable improvement >>>>> in numbers on systems where it has multi node memory/CPU configuration. 
>>>> >>>> I agree that it *could* show an improvement, but it very much depends on >>>> how often the ITS misses in its caches. For this kind of patches, I want >>>> to see two things: >>>> >>>> 1) It brings a measurable benefit on NUMA platforms >>> >>> Did some measurement of interrupt response time for LPIs and we don't >>> see any major >>> improvement due to caching of Tables. However, we have seen >>> improvements of around 5%. >> >> An improvement of what exactly? > > interrupt response time. Measured how? On which HW? Using which benchmark? Give me the actual benchmark results. Don't expect me to accept this kind of hand-wavy statement. M.
Hi Marc, On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > On 10/07/17 10:08, Ganapatrao Kulkarni wrote: >> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>>> Hi Marc, >>>> >>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>> Hi Shanker, >>>>> >>>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>>> Hi Marc, >>>>>> >>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>>> Hi Shanker, >>>>>>>>> >>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>>> aware functions. >>>>>>>> >>>>>>>> IMHO, the description would have been more constructive? >>>>>>>> >>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>>> >>>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>>> such platforms, please show me the numbers! >>>>>>> >>>>>> >>>>>> I'll share the actual results which shows the improvement whenever >>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>>>>> support multi socket configuration to capture results and share with you. >>>>>> >>>>>> Do you see any other issues with this patch apart from the performance >>>>>> improvements. 
I strongly believe this brings the noticeable improvement >>>>>> in numbers on systems where it has multi node memory/CPU configuration. >>>>> >>>>> I agree that it *could* show an improvement, but it very much depends on >>>>> how often the ITS misses in its caches. For this kind of patches, I want >>>>> to see two things: >>>>> >>>>> 1) It brings a measurable benefit on NUMA platforms >>>> >>>> Did some measurement of interrupt response time for LPIs and we don't >>>> see any major >>>> improvement due to caching of Tables. However, we have seen >>>> improvements of around 5%. >>> >>> An improvement of what exactly? >> >> interrupt response time. > > Measured how? On which HW? Using which benchmark? This has been tested on ThunderX2. We have instrumented the gic-v3-its driver code to create a dummy LPI device with a few vectors. The LPI is induced from the dummy device (through sysfs by writing to the TRANSLATOR reg). The ISR routine (gic_handle_irq) is called to handle the induced LPI. A NODE 1 cpu is used to induce the LPI, and a NODE 1 cpu/collection is mapped in the ITT to route this LPI. CPU timer counters are sampled at the time the LPI is induced and in the ISR routine to calculate the interrupt response time. The results showed an improvement of 5% with this patch. Do you have any recommended benchmarks to test the same? > > Give me the actual benchmark results. Don't expect me to accept this > kind of hand-wavy statement. > > M. > -- > Jazz is not dead. It just smells funny... thanks Ganapat
Marc, Do you have any other concerns taking this patch? On 07/10/2017 05:21 AM, Ganapatrao Kulkarni wrote: > Hi Marc, > > On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >> On 10/07/17 10:08, Ganapatrao Kulkarni wrote: >>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>>>> Hi Marc, >>>>> >>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>>> Hi Shanker, >>>>>> >>>>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>>>> Hi Marc, >>>>>>> >>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>>>> Hi Shanker, >>>>>>>>>> >>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>>>> aware functions. >>>>>>>>> >>>>>>>>> IMHO, the description would have been more constructive? >>>>>>>>> >>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>>>> >>>>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>>>> such platforms, please show me the numbers! >>>>>>>> >>>>>>> >>>>>>> I'll share the actual results which shows the improvement whenever >>>>>>> available on our next chips. 
Current version of Qualcomm qdf2400 doesn't >>>>>>> support multi socket configuration to capture results and share with you. >>>>>>> >>>>>>> Do you see any other issues with this patch apart from the performance >>>>>>> improvements. I strongly believe this brings the noticeable improvement >>>>>>> in numbers on systems where it has multi node memory/CPU configuration. >>>>>> >>>>>> I agree that it *could* show an improvement, but it very much depends on >>>>>> how often the ITS misses in its caches. For this kind of patches, I want >>>>>> to see two things: >>>>>> >>>>>> 1) It brings a measurable benefit on NUMA platforms >>>>> >>>>> Did some measurement of interrupt response time for LPIs and we don't >>>>> see any major >>>>> improvement due to caching of Tables. However, we have seen >>>>> improvements of around 5%. >>>> >>>> An improvement of what exactly? >>> >>> interrupt response time. >> >> Measured how? On which HW? Using which benchmark? > > This has been tested on ThunderX2. > We have instrumented gic-v3-its driver code to create dummy LPI device > with few vectors. > The LPI is induced from dummy device(through sysfs by writing to > TRANSLATOR reg). > The ISR routine(gic_handle_irq) being called to handle the induced LPI. > NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped > in ITT to route this LPI. > > CPU timer counter are sampled at the time LPI is Induced and in ISR > routine to calculate interrupt response time. > the result shown improvement of 5% with this patch. > > Do you have any recommended benchmarks to test the same? > Ganapatrao, Thanks for your efforts on instrumenting ITS driver code to show interrupt performance improvement of 5% on the ThunderX2 hardware. Actually the current ITS driver is not consistent on allocating memory for ITS/GICR tables, GICR pending tables are allocated from the corresponding NUMA node based on CPU proximity, but not the other tables. >> >> Give me the actual benchmark results. 
Don't expect me to accept this >> kind of hand-wavy statement. >> >> M. >> -- >> Jazz is not dead. It just smells funny... > > thanks > Ganapat > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On 10/07/17 11:21, Ganapatrao Kulkarni wrote: > Hi Marc, > > On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >> On 10/07/17 10:08, Ganapatrao Kulkarni wrote: >>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>>>> Hi Marc, >>>>> >>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>>> Hi Shanker, >>>>>> >>>>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>>>> Hi Marc, >>>>>>> >>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>>>> Hi Shanker, >>>>>>>>>> >>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>>>> aware functions. >>>>>>>>> >>>>>>>>> IMHO, the description would have been more constructive? >>>>>>>>> >>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>>>> >>>>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>>>> such platforms, please show me the numbers! >>>>>>>> >>>>>>> >>>>>>> I'll share the actual results which shows the improvement whenever >>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>>>>>> support multi socket configuration to capture results and share with you. 
>>>>>>> >>>>>>> Do you see any other issues with this patch apart from the performance >>>>>>> improvements. I strongly believe this brings the noticeable improvement >>>>>>> in numbers on systems where it has multi node memory/CPU configuration. >>>>>> >>>>>> I agree that it *could* show an improvement, but it very much depends on >>>>>> how often the ITS misses in its caches. For this kind of patches, I want >>>>>> to see two things: >>>>>> >>>>>> 1) It brings a measurable benefit on NUMA platforms >>>>> >>>>> Did some measurement of interrupt response time for LPIs and we don't >>>>> see any major >>>>> improvement due to caching of Tables. However, we have seen >>>>> improvements of around 5%. >>>> >>>> An improvement of what exactly? >>> >>> interrupt response time. >> >> Measured how? On which HW? Using which benchmark? > > This has been tested on ThunderX2. > We have instrumented gic-v3-its driver code to create dummy LPI device > with few vectors. > The LPI is induced from dummy device(through sysfs by writing to > TRANSLATOR reg). > The ISR routine(gic_handle_irq) being called to handle the induced LPI. > NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped > in ITT to route this LPI. > > CPU timer counter are sampled at the time LPI is Induced and in ISR > routine to calculate interrupt response time. > the result shown improvement of 5% with this patch. And you call that a realistic measurement of the latency? Really? Sorry, but I cannot take you seriously here. > Do you have any recommended benchmarks to test the same? Run a standard benchmark such as netperf, post the result with and without that patch. The above is just plain ridiculous. M.
On 10/07/17 13:30, Shanker Donthineni wrote: > Marc, > > Do you have any other concerns taking this patch? Plenty. All I have seen so far is a wet finger in the air and the claim that it reduces "something" by 5%, without any actual figure. Sorry, I'm not buying it without people putting in the effort to show me that this is worth it. Thanks, M.
Hi Marc, On 07/10/2017 08:50 AM, Marc Zyngier wrote: > On 10/07/17 11:21, Ganapatrao Kulkarni wrote: >> Hi Marc, >> >> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote: >>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>>>>> Hi Marc, >>>>>> >>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>>>> Hi Shanker, >>>>>>> >>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>>>>> Hi Marc, >>>>>>>> >>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>>>>> Hi Shanker, >>>>>>>>>>> >>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>>>>> aware functions. >>>>>>>>>> >>>>>>>>>> IMHO, the description would have been more constructive? >>>>>>>>>> >>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>>>>> >>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>>>>> such platforms, please show me the numbers! >>>>>>>>> >>>>>>>> >>>>>>>> I'll share the actual results which shows the improvement whenever >>>>>>>> available on our next chips. 
Current version of Qualcomm qdf2400 doesn't >>>>>>>> support multi socket configuration to capture results and share with you. >>>>>>>> >>>>>>>> Do you see any other issues with this patch apart from the performance >>>>>>>> improvements. I strongly believe this brings the noticeable improvement >>>>>>>> in numbers on systems where it has multi node memory/CPU configuration. >>>>>>> >>>>>>> I agree that it *could* show an improvement, but it very much depends on >>>>>>> how often the ITS misses in its caches. For this kind of patches, I want >>>>>>> to see two things: >>>>>>> >>>>>>> 1) It brings a measurable benefit on NUMA platforms >>>>>> >>>>>> Did some measurement of interrupt response time for LPIs and we don't >>>>>> see any major >>>>>> improvement due to caching of Tables. However, we have seen >>>>>> improvements of around 5%. >>>>> >>>>> An improvement of what exactly? >>>> >>>> interrupt response time. >>> >>> Measured how? On which HW? Using which benchmark? >> >> This has been tested on ThunderX2. >> We have instrumented gic-v3-its driver code to create dummy LPI device >> with few vectors. >> The LPI is induced from dummy device(through sysfs by writing to >> TRANSLATOR reg). >> The ISR routine(gic_handle_irq) being called to handle the induced LPI. >> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped >> in ITT to route this LPI. >> >> CPU timer counter are sampled at the time LPI is Induced and in ISR >> routine to calculate interrupt response time. >> the result shown improvement of 5% with this patch. > > And you call that a realistic measurement of the latency? Really? Sorry, > but I cannot take you seriously here. > >> Do you have any recommended benchmarks to test the same? > > Run a standard benchmark such as netperf, post the result with and > without that patch. The above is just plain ridiculous. 
> The whole purpose of ACPI subtable "GIC Interrupt Translation Service (ITS) Affinity structure" is to provide the proximity information to OS so that software will take advantage of NUMA aware allocations to improve the read latency of ITS/GICR tables, not just for implementing software workarounds. I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much performance improvement we observe is based on the individual SOC implementation, inter NODE latency, inter node traffic, cache capacity, and type of the test used to measure results. Please consider this patch irrespective of the test results running on a specific hardware. We need this patch for upcoming Qualcomm server chips. > M. >
On 10/07/17 15:57, Shanker Donthineni wrote: > Hi Marc, > > On 07/10/2017 08:50 AM, Marc Zyngier wrote: >> On 10/07/17 11:21, Ganapatrao Kulkarni wrote: >>> Hi Marc, >>> >>> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote: >>>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: >>>>>>> Hi Marc, >>>>>>> >>>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: >>>>>>>> Hi Shanker, >>>>>>>> >>>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote: >>>>>>>>> Hi Marc, >>>>>>>>> >>>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: >>>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: >>>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni >>>>>>>>>>> <gpkulkarni@gmail.com> wrote: >>>>>>>>>>>> Hi Shanker, >>>>>>>>>>>> >>>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni >>>>>>>>>>>> <shankerd@codeaurora.org> wrote: >>>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used >>>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS >>>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA >>>>>>>>>>>>> aware functions. >>>>>>>>>>> >>>>>>>>>>> IMHO, the description would have been more constructive? >>>>>>>>>>> >>>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. >>>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. >>>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." >>>>>>>>>> >>>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit >>>>>>>>>> of this per-node allocation. Given that both of you guys have access to >>>>>>>>>> such platforms, please show me the numbers! 
>>>>>>>>>> >>>>>>>>> >>>>>>>>> I'll share the actual results which shows the improvement whenever >>>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't >>>>>>>>> support multi socket configuration to capture results and share with you. >>>>>>>>> >>>>>>>>> Do you see any other issues with this patch apart from the performance >>>>>>>>> improvements. I strongly believe this brings the noticeable improvement >>>>>>>>> in numbers on systems where it has multi node memory/CPU configuration. >>>>>>>> >>>>>>>> I agree that it *could* show an improvement, but it very much depends on >>>>>>>> how often the ITS misses in its caches. For this kind of patches, I want >>>>>>>> to see two things: >>>>>>>> >>>>>>>> 1) It brings a measurable benefit on NUMA platforms >>>>>>> >>>>>>> Did some measurement of interrupt response time for LPIs and we don't >>>>>>> see any major >>>>>>> improvement due to caching of Tables. However, we have seen >>>>>>> improvements of around 5%. >>>>>> >>>>>> An improvement of what exactly? >>>>> >>>>> interrupt response time. >>>> >>>> Measured how? On which HW? Using which benchmark? >>> >>> This has been tested on ThunderX2. >>> We have instrumented gic-v3-its driver code to create dummy LPI device >>> with few vectors. >>> The LPI is induced from dummy device(through sysfs by writing to >>> TRANSLATOR reg). >>> The ISR routine(gic_handle_irq) being called to handle the induced LPI. >>> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped >>> in ITT to route this LPI. >>> >>> CPU timer counter are sampled at the time LPI is Induced and in ISR >>> routine to calculate interrupt response time. >>> the result shown improvement of 5% with this patch. >> >> And you call that a realistic measurement of the latency? Really? Sorry, >> but I cannot take you seriously here. >> >>> Do you have any recommended benchmarks to test the same? 
>> >> Run a standard benchmark such as netperf, post the result with and >> without that patch. The above is just plain ridiculous. >> > > The whole purpose of ACPI subtable "GIC Interrupt Translation Service (ITS) Affinity structure" > is to provide the proximity information to OS so that software will take advantage of NUMA > aware allocations to improve the read latency of ITS/GICR tables, not just for implementing > software workarounds. > > > I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much > performance improvement we observer is based on the individual SOC implementation, inter NODE > latency, inter node traffic, cache capacity, and type of the test used to measure results. > > Please consider this patch irrespective of the test results running on a specific hardware. We > need this patch for upcoming Qualcomm server chips. "I believe" and "We need" are not a proof of the usefulness of this. We can argue all day, or you can provide a set of convincing results. Your choice. But I can guarantee you that the latter is a much better method than the former. If you (or Cavium) cannot be bothered to provide tangible results that this is useful, why should I take this at face value? This is just like any other improvement we make to the kernel. We back it *with data*. M.
Hi Marc, On Mon, Jul 10, 2017 at 04:15:28PM +0100, Marc Zyngier wrote: > On 10/07/17 15:57, Shanker Donthineni wrote: > > Hi Marc, > > > > On 07/10/2017 08:50 AM, Marc Zyngier wrote: > >> On 10/07/17 11:21, Ganapatrao Kulkarni wrote: > >>> Hi Marc, > >>> > >>> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > >>>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote: > >>>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > >>>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote: > >>>>>>> Hi Marc, > >>>>>>> > >>>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote: > >>>>>>>> Hi Shanker, > >>>>>>>> > >>>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote: > >>>>>>>>> Hi Marc, > >>>>>>>>> > >>>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote: > >>>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote: > >>>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni > >>>>>>>>>>> <gpkulkarni@gmail.com> wrote: > >>>>>>>>>>>> Hi Shanker, > >>>>>>>>>>>> > >>>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni > >>>>>>>>>>>> <shankerd@codeaurora.org> wrote: > >>>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used > >>>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS > >>>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA > >>>>>>>>>>>>> aware functions. > >>>>>>>>>>> > >>>>>>>>>>> IMHO, the description would have been more constructive? > >>>>>>>>>>> > >>>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory. > >>>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices. > >>>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic." > >>>>>>>>>> > >>>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit > >>>>>>>>>> of this per-node allocation. 
Given that both of you guys have access to > >>>>>>>>>> such platforms, please show me the numbers! > >>>>>>>>>> > >>>>>>>>> > >>>>>>>>> I'll share the actual results which shows the improvement whenever > >>>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't > >>>>>>>>> support multi socket configuration to capture results and share with you. > >>>>>>>>> > >>>>>>>>> Do you see any other issues with this patch apart from the performance > >>>>>>>>> improvements. I strongly believe this brings the noticeable improvement > >>>>>>>>> in numbers on systems where it has multi node memory/CPU configuration. > >>>>>>>> > >>>>>>>> I agree that it *could* show an improvement, but it very much depends on > >>>>>>>> how often the ITS misses in its caches. For this kind of patches, I want > >>>>>>>> to see two things: > >>>>>>>> > >>>>>>>> 1) It brings a measurable benefit on NUMA platforms > >>>>>>> > >>>>>>> Did some measurement of interrupt response time for LPIs and we don't > >>>>>>> see any major > >>>>>>> improvement due to caching of Tables. However, we have seen > >>>>>>> improvements of around 5%. > >>>>>> > >>>>>> An improvement of what exactly? > >>>>> > >>>>> interrupt response time. > >>>> > >>>> Measured how? On which HW? Using which benchmark? > >>> > >>> This has been tested on ThunderX2. > >>> We have instrumented gic-v3-its driver code to create dummy LPI device > >>> with few vectors. > >>> The LPI is induced from dummy device(through sysfs by writing to > >>> TRANSLATOR reg). > >>> The ISR routine(gic_handle_irq) being called to handle the induced LPI. > >>> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped > >>> in ITT to route this LPI. > >>> > >>> CPU timer counter are sampled at the time LPI is Induced and in ISR > >>> routine to calculate interrupt response time. > >>> the result shown improvement of 5% with this patch. > >> > >> And you call that a realistic measurement of the latency? Really? 
Sorry, > >> but I cannot take you seriously here. > >> > >>> Do you have any recommended benchmarks to test the same? > >> > >> Run a standard benchmark such as netperf, post the result with and > >> without that patch. The above is just plain ridiculous. > >> > > > > The whole purpose of ACPI subtable "GIC Interrupt Translation Service (ITS) Affinity structure" > > is to provide the proximity information to OS so that software will take advantage of NUMA > > aware allocations to improve the read latency of ITS/GICR tables, not just for implementing > > software workarounds. > > > > > > I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much > > performance improvement we observer is based on the individual SOC implementation, inter NODE > > latency, inter node traffic, cache capacity, and type of the test used to measure results. > > > > Please consider this patch irrespective of the test results running on a specific hardware. We > > need this patch for upcoming Qualcomm server chips. > > "I believe" and "We need" are not a proof of the usefulness of this. We > can argue all day, or you can provide a set of convincing results. Your > choice. But I can guarantee you the the latter is a much better method > than the former. > > If you (or Cavium) cannot be bothered to provide tangible results that > this is useful, why should I take this at face value? This is just like > any other improvement we make to the kernel. We back it *with data*. At Cavium, most of the ThunderX2 boards we have are multi-node, and we are interested in enabling NUMA optimizations. But, in this case, we do not see (or expect to see - given the nature of access) any significant improvement in any standard benchmark. Ganapat's LPI injection test to find interrupt latency was probably the best option we had so far. 
We could come up with another contrived test case to see if there is any change in behavior when we overload the interconnect, but I don't think we will get any data to really justify the patch. Allocating the tables on the node is a good thing since it avoids unnecessary traffic over the interconnect, so I do not see the problem in merging a simple patch for that. Is there any specific issue here? Anyway, for ThunderX2, the patch is good to have, but not critical. And as Ganapat noted, the patch can be improved a bit. Also going through the patch, I think the chip data is better allocated using node as well. JC.
On 11.07.17 08:48:56, Jayachandran C wrote: > On Mon, Jul 10, 2017 at 04:15:28PM +0100, Marc Zyngier wrote: > > On 10/07/17 15:57, Shanker Donthineni wrote: > > > I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much > > > performance improvement we observer is based on the individual SOC implementation, inter NODE > > > latency, inter node traffic, cache capacity, and type of the test used to measure results. > > > > > > Please consider this patch irrespective of the test results running on a specific hardware. We > > > need this patch for upcoming Qualcomm server chips. > > > > "I believe" and "We need" are not a proof of the usefulness of this. We > > can argue all day, or you can provide a set of convincing results. Your > > choice. But I can guarantee you the the latter is a much better method > > than the former. > > > > If you (or Cavium) cannot be bothered to provide tangible results that > > this is useful, why should I take this at face value? This is just like > > any other improvement we make to the kernel. We back it *with data*. > > At Cavium, most of the ThunderX2 boards we have are multi-node, and we > are interested in enabling NUMA optimizations. > > But, in this case, we do not see (or expect to see - given the nature of > access) any significant improvement in any standard benchmark. Ganapat's > LPI injection test to find interrupt latency was probably the best option > we had so far. We could come up with another contrived test case to see > if there is any change in behavior when we overload the interconnect, > but I don't think we will get any data to really justify the patch. > > Allocating the tables on the node is a good thing since it avoids > unnecessary traffic over the interconnect, so I do not see the > problem in merging a simple patch for that. Is there any specific > issue here? > > Anyway, for ThunderX2, the patch is good to have, but not critical. 
> And as Ganapat noted, the patch can be improved a bit. Also going thru > the patch, I think the chip data is better allocated using node as well. There is another thing to consider here. We will need cma and devm for ITS. There are only a few per-node allocation functions that can be used then, so per-node allocation should only be used in rare cases where really needed. I am going to repost my cma device table allocation series after the merge window closes. -Robert
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fed99c5..e45add2 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, u64 val = its_read_baser(its, baser); u64 esz = GITS_BASER_ENTRY_SIZE(val); u64 type = GITS_BASER_TYPE(val); + struct page *page; u32 alloc_pages; - void *base; u64 tmp; retry_alloc_baser: @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, order = get_order(GITS_BASER_PAGES_MAX * psz); } - base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!base) + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); + if (!page) return -ENOMEM; retry_baser: - val = (virt_to_phys(base) | + val = (page_to_phys(page) | (type << GITS_BASER_TYPE_SHIFT) | ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, shr = tmp & GITS_BASER_SHAREABILITY_MASK; if (!shr) { cache = GITS_BASER_nC; - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); + gic_flush_dcache_to_poc(page_to_virt(page), + PAGE_ORDER_TO_SIZE(order)); } goto retry_baser; } @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, * size and retry. If we reach 4K, then * something is horribly wrong... 
*/ - free_pages((unsigned long)base, order); + __free_pages(page, order); baser->base = NULL; switch (psz) { @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", &its->phys_base, its_base_type_string[type], val, tmp); - free_pages((unsigned long)base, order); + __free_pages(page, order); return -ENXIO; } baser->order = order; - baser->base = base; + baser->base = page_to_virt(page); baser->psz = psz; tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz; pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n", &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp), its_base_type_string[type], - (unsigned long)virt_to_phys(base), + (unsigned long)page_to_phys(page), indirect ? "indirect" : "flat", (int)esz, psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its) for (i = 0; i < GITS_BASER_NR_REGS; i++) { if (its->tables[i].base) { - free_pages((unsigned long)its->tables[i].base, + __free_pages(virt_to_page(its->tables[i].base), its->tables[i].order); its->tables[i].base = NULL; } @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz)); + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, + get_order(baser->psz)); if (!page) return false; @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, nr_ites = max(2UL, roundup_pow_of_two(nvecs)); sz = nr_ites * its->ite_size; sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; - itt = kzalloc(sz, GFP_KERNEL); + itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis); if (lpi_map) col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL); @@ -1677,6 +1679,7 @@ static int __init 
its_probe_one(struct resource *res, { struct its_node *its; void __iomem *its_base; + struct page *page; u32 val; u64 baser, tmp; int err; @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res, its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; its->numa_node = numa_node; - its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(ITS_CMD_QUEUE_SZ)); - if (!its->cmd_base) { + page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, + get_order(ITS_CMD_QUEUE_SZ)); + if (!page) { err = -ENOMEM; goto out_free_its; } + its->cmd_base = page_to_virt(page); its->cmd_write = its->cmd_base; its_enable_quirks(its); @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res, out_free_tables: its_free_tables(its); out_free_cmd: - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); + __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ)); out_free_its: kfree(its); out_unmap:
The NUMA node information is visible to ITS driver but not being used other than handling errata. This patch allocates the memory for ITS tables from the corresponding NUMA node using the appropriate NUMA aware functions. Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org> --- drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-)