
irqchip: gicv3-its: Use NUMA aware memory allocation for ITS tables

Message ID 1498405569-463-1-git-send-email-shankerd@codeaurora.org (mailing list archive)
State New, archived

Commit Message

Shanker Donthineni June 25, 2017, 3:46 p.m. UTC
The NUMA node information is visible to the ITS driver, but it is not used
for anything other than errata handling. This patch allocates the memory for
the ITS tables from the corresponding NUMA node, using the appropriate
NUMA-aware functions.

Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
---
 drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

Comments

Ganapatrao Kulkarni June 30, 2017, 2:34 a.m. UTC | #1
Hi Shanker,

On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
<shankerd@codeaurora.org> wrote:
> The NUMA node information is visible to ITS driver but not being used
> other than handling errata. This patch allocates the memory for ITS
> tables from the corresponding NUMA node using the appropriate NUMA
> aware functions.
>
> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
> ---
>  drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++----------------
>  1 file changed, 20 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index fed99c5..e45add2 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>         u64 val = its_read_baser(its, baser);
>         u64 esz = GITS_BASER_ENTRY_SIZE(val);
>         u64 type = GITS_BASER_TYPE(val);
> +       struct page *page;
>         u32 alloc_pages;
> -       void *base;
>         u64 tmp;
>
>  retry_alloc_baser:
> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>                 order = get_order(GITS_BASER_PAGES_MAX * psz);
>         }
>
> -       base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
> -       if (!base)
> +       page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);

The changes would have been minimal if you had converted the page to a base
address right after the allocation, e.g.:

 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
 base =  page_address(page);
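For illustration, a minimal sketch of that variant (a fragment only, untested,
reusing the names from the quoted its_setup_baser() hunk):

 struct page *page;
 void *base;

 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
 if (!page)
         return -ENOMEM;
 base = page_address(page);      /* linear-map address of the allocation */

 /*
  * The existing users of 'base' -- virt_to_phys(base),
  * gic_flush_dcache_to_poc(base, ...), free_pages((unsigned long)base, order)
  * and baser->base = base -- would then not need to change.
  */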

> +       if (!page)
>                 return -ENOMEM;
>
>  retry_baser:
> -       val = (virt_to_phys(base)                                |
> +       val = (page_to_phys(page)                                |

no change required.

>                 (type << GITS_BASER_TYPE_SHIFT)                  |
>                 ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT)       |
>                 ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT)    |
> @@ -915,7 +915,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>                 shr = tmp & GITS_BASER_SHAREABILITY_MASK;
>                 if (!shr) {
>                         cache = GITS_BASER_nC;
> -                       gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
> +                       gic_flush_dcache_to_poc(page_to_virt(page),
> +                                               PAGE_ORDER_TO_SIZE(order));

No change would be needed here either, if base were still used.
>                 }
>                 goto retry_baser;
>         }
> @@ -926,7 +927,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>                  * size and retry. If we reach 4K, then
>                  * something is horribly wrong...
>                  */
> -               free_pages((unsigned long)base, order);
> +               __free_pages(page, order);

no change required.
>                 baser->base = NULL;
>
>                 switch (psz) {
> @@ -943,19 +944,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>                 pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
>                        &its->phys_base, its_base_type_string[type],
>                        val, tmp);
> -               free_pages((unsigned long)base, order);
> +               __free_pages(page, order);

no change required.

>                 return -ENXIO;
>         }
>
>         baser->order = order;
> -       baser->base = base;
> +       baser->base = page_to_virt(page);

no change required.
>         baser->psz = psz;
>         tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz;
>
>         pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n",
>                 &its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp),
>                 its_base_type_string[type],
> -               (unsigned long)virt_to_phys(base),
> +               (unsigned long)page_to_phys(page),

no change required.
>                 indirect ? "indirect" : "flat", (int)esz,
>                 psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT);
>
> @@ -1019,7 +1020,7 @@ static void its_free_tables(struct its_node *its)
>
>         for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>                 if (its->tables[i].base) {
> -                       free_pages((unsigned long)its->tables[i].base,
> +                       __free_pages(virt_to_page(its->tables[i].base),

ditto
>                                    its->tables[i].order);
>                         its->tables[i].base = NULL;
>                 }
> @@ -1286,7 +1287,8 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id)
>
>         /* Allocate memory for 2nd level table */
>         if (!table[idx]) {
> -               page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz));
> +               page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> +                                       get_order(baser->psz));
>                 if (!page)
>                         return false;
>
> @@ -1332,7 +1334,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
>         nr_ites = max(2UL, roundup_pow_of_two(nvecs));
>         sz = nr_ites * its->ite_size;
>         sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
> -       itt = kzalloc(sz, GFP_KERNEL);
> +       itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
>         lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis);
>         if (lpi_map)
>                 col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL);
> @@ -1677,6 +1679,7 @@ static int __init its_probe_one(struct resource *res,
>  {
>         struct its_node *its;
>         void __iomem *its_base;
> +       struct page *page;
>         u32 val;
>         u64 baser, tmp;
>         int err;
> @@ -1716,12 +1719,13 @@ static int __init its_probe_one(struct resource *res,
>         its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1;
>         its->numa_node = numa_node;
>
> -       its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
> -                                               get_order(ITS_CMD_QUEUE_SZ));
> -       if (!its->cmd_base) {
> +       page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
> +                               get_order(ITS_CMD_QUEUE_SZ));
> +       if (!page) {
>                 err = -ENOMEM;
>                 goto out_free_its;
>         }
> +       its->cmd_base = page_to_virt(page);

Same here; this could simply have been:
its->cmd_base = page_address(page);
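For illustration, the same minimal-diff style applied to the command queue
hunk could look like the untested fragment below; on arm64 there is no
highmem, so page_address() and page_to_virt() should return the same
linear-map address here and the choice is purely stylistic:

 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
                         get_order(ITS_CMD_QUEUE_SZ));
 if (!page) {
         err = -ENOMEM;
         goto out_free_its;
 }
 its->cmd_base = page_address(page);
 its->cmd_write = its->cmd_base;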

>         its->cmd_write = its->cmd_base;
>
>         its_enable_quirks(its);
> @@ -1775,7 +1779,7 @@ static int __init its_probe_one(struct resource *res,
>  out_free_tables:
>         its_free_tables(its);
>  out_free_cmd:
> -       free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
> +       __free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ));

no change required.
>  out_free_its:
>         kfree(its);
>  out_unmap:
> --
> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
>

thanks
Ganapat
Ganapatrao Kulkarni June 30, 2017, 3:01 a.m. UTC | #2
On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
<gpkulkarni@gmail.com> wrote:
> Hi Shanker,
>
> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
> <shankerd@codeaurora.org> wrote:
>> The NUMA node information is visible to ITS driver but not being used
>> other than handling errata. This patch allocates the memory for ITS
>> tables from the corresponding NUMA node using the appropriate NUMA
>> aware functions.

IMHO, the description would have been more constructive along these lines:

"All ITS tables are mapped by default to NODE 0 memory.
Adding changes to allocate memory from respective NUMA NODES of ITS devices.
This will optimize tables access and avoids unnecessary inter-node traffic."


thanks
Ganapat
Marc Zyngier June 30, 2017, 8:51 a.m. UTC | #3
On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
> <gpkulkarni@gmail.com> wrote:
>> Hi Shanker,
>>
>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>> <shankerd@codeaurora.org> wrote:
>>> The NUMA node information is visible to ITS driver but not being used
>>> other than handling errata. This patch allocates the memory for ITS
>>> tables from the corresponding NUMA node using the appropriate NUMA
>>> aware functions.
> 
> IMHO, the description would have been more constructive?
> 
> "All ITS tables are mapped by default to NODE 0 memory.
> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
> This will optimize tables access and avoids unnecessary inter-node traffic."

But more importantly, I'd like to see figures showing the actual benefit
of this per-node allocation. Given that both of you guys have access to
such platforms, please show me the numbers!

Thanks,

	M.
Shanker Donthineni July 3, 2017, 1:30 p.m. UTC | #4
Hi Ganapatrao,

On 06/29/2017 09:34 PM, Ganapatrao Kulkarni wrote:
> Hi Shanker,
> 
> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
> <shankerd@codeaurora.org> wrote:
>> The NUMA node information is visible to ITS driver but not being used
>> other than handling errata. This patch allocates the memory for ITS
>> tables from the corresponding NUMA node using the appropriate NUMA
>> aware functions.
>>
>> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
>> ---
>>  drivers/irqchip/irq-gic-v3-its.c | 36 ++++++++++++++++++++----------------
>>  1 file changed, 20 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>> index fed99c5..e45add2 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -860,8 +860,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>>         u64 val = its_read_baser(its, baser);
>>         u64 esz = GITS_BASER_ENTRY_SIZE(val);
>>         u64 type = GITS_BASER_TYPE(val);
>> +       struct page *page;
>>         u32 alloc_pages;
>> -       void *base;
>>         u64 tmp;
>>
>>  retry_alloc_baser:
>> @@ -874,12 +874,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>>                 order = get_order(GITS_BASER_PAGES_MAX * psz);
>>         }
>>
>> -       base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
>> -       if (!base)
>> +       page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
> 
> the changes would have been minimal, if you converted page to base
> address after allocation here itself?
> 

Yes, but it doesn't make sense to keep two variables serving the same purpose and then convert the page pointer back to (unsigned long) when freeing it.
 
>  page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
>  base =  page_address(page);
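To illustrate the trade-off being discussed (sketch only, using the names from
the quoted hunk):

 /* Patch as posted: carry the struct page around. */
 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
 /* ... program GITS_BASER via page_to_phys(page) ... */
 __free_pages(page, order);              /* no cast on the free/error paths */

 /* Review suggestion: convert to a virtual address up front. */
 base = page_address(page);
 /* ... keep using virt_to_phys(base) as before ... */
 free_pages((unsigned long)base, order); /* cast back to unsigned long */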
Shanker Donthineni July 3, 2017, 1:37 p.m. UTC | #5
Hi Ganapatrao,

On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote:
> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
> <gpkulkarni@gmail.com> wrote:
>> Hi Shanker,
>>
>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>> <shankerd@codeaurora.org> wrote:
>>> The NUMA node information is visible to ITS driver but not being used
>>> other than handling errata. This patch allocates the memory for ITS
>>> tables from the corresponding NUMA node using the appropriate NUMA
>>> aware functions.
> 
> IMHO, the description would have been more constructive?
> 
> "All ITS tables are mapped by default to NODE 0 memory.
> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
> This will optimize tables access and avoids unnecessary inter-node traffic."
> 

I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'. Kernel runtime memory allocation comes from the NUMA node corresponding to the CPU on which the allocation is called.

Please explain if you don't agree on this.
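For clarity, the distinction at stake is roughly the following (my
understanding of the generic allocator behaviour, not something taken from the
patch): alloc_pages()/__get_free_pages() follow the caller's memory policy,
which by default prefers the node of the CPU doing the allocation, whereas
alloc_pages_node() names a node explicitly, so the placement no longer depends
on which CPU happens to run the probe or MSI setup path:

 /* Default policy: typically the local node of the CPU running this code. */
 page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);

 /* Explicit: memory on the ITS's own node, wherever the caller runs. */
 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);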

Shanker Donthineni July 3, 2017, 2:24 p.m. UTC | #6
Hi Marc,

On 06/30/2017 03:51 AM, Marc Zyngier wrote:
> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>> <gpkulkarni@gmail.com> wrote:
>>> Hi Shanker,
>>>
>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>> <shankerd@codeaurora.org> wrote:
>>>> The NUMA node information is visible to ITS driver but not being used
>>>> other than handling errata. This patch allocates the memory for ITS
>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>> aware functions.
>>
>> IMHO, the description would have been more constructive?
>>
>> "All ITS tables are mapped by default to NODE 0 memory.
>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>> This will optimize tables access and avoids unnecessary inter-node traffic."
> 
> But more importantly, I'd like to see figures showing the actual benefit
> of this per-node allocation. Given that both of you guys have access to
> such platforms, please show me the numbers!
> 

I'll share actual results showing the improvement whenever they become
available on our next chips. The current version of the Qualcomm QDF2400
doesn't support a multi-socket configuration, so I can't capture results to
share with you yet.

Do you see any other issues with this patch apart from the performance
question? I strongly believe it brings a noticeable improvement on systems
with a multi-node memory/CPU configuration.


> Thanks,
> 
> 	M.
>
Marc Zyngier July 3, 2017, 2:53 p.m. UTC | #7
Hi Shanker,

On 03/07/17 15:24, Shanker Donthineni wrote:
> Hi Marc,
> 
> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>> <gpkulkarni@gmail.com> wrote:
>>>> Hi Shanker,
>>>>
>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>> <shankerd@codeaurora.org> wrote:
>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>> aware functions.
>>>
>>> IMHO, the description would have been more constructive?
>>>
>>> "All ITS tables are mapped by default to NODE 0 memory.
>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>
>> But more importantly, I'd like to see figures showing the actual benefit
>> of this per-node allocation. Given that both of you guys have access to
>> such platforms, please show me the numbers!
>>
> 
> I'll share the actual results which shows the improvement whenever 
> available on our next chips. Current version of Qualcomm qdf2400 doesn't 
> support multi socket configuration to capture results and share with you. 
> 
> Do you see any other issues with this patch apart from the performance 
> improvements. I strongly believe this brings the noticeable improvement 
> in numbers on systems where it has multi node memory/CPU configuration.

I agree that it *could* show an improvement, but it very much depends on
how often the ITS misses in its caches. For this kind of patch, I want
to see two things:

1) It brings a measurable benefit on NUMA platforms
2) it doesn't adversely impact non-NUMA systems

I can deal with (2), but I have no way of evaluating (1), mostly for the
lack of an infrastructure exercising multiple ITSs at the same time.

Thanks,

	M.
Shanker Donthineni July 3, 2017, 3:15 p.m. UTC | #8
Hi Marc,

On 07/03/2017 09:53 AM, Marc Zyngier wrote:
> Hi Shanker,
> 
> On 03/07/17 15:24, Shanker Donthineni wrote:
>> Hi Marc,
>>
>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>> <gpkulkarni@gmail.com> wrote:
>>>>> Hi Shanker,
>>>>>
>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>> <shankerd@codeaurora.org> wrote:
>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>> aware functions.
>>>>
>>>> IMHO, the description would have been more constructive?
>>>>
>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>
>>> But more importantly, I'd like to see figures showing the actual benefit
>>> of this per-node allocation. Given that both of you guys have access to
>>> such platforms, please show me the numbers!
>>>
>>
>> I'll share the actual results which shows the improvement whenever 
>> available on our next chips. Current version of Qualcomm qdf2400 doesn't 
>> support multi socket configuration to capture results and share with you. 
>>
>> Do you see any other issues with this patch apart from the performance 
>> improvements. I strongly believe this brings the noticeable improvement 
>> in numbers on systems where it has multi node memory/CPU configuration.
> 
> I agree that it *could* show an improvement, but it very much depends on
> how often the ITS misses in its caches. For this kind of patches, I want
> to see two things:
> 

Just imagine systems with hundreds of PCI SR-IOV virtual functions, some of
them assigned to virtual machines, and systems with the GICv4 feature. There
should be a lot of cache misses on ITS VCPU, DEVICE and COLLECTION lookups.
Also, the VLPI patches that you've posted for comments force the use of the
VLPI feature for each VM, irrespective of pass-through device assignment.


> 1) It brings a measurable benefit on NUMA platforms
> 2) it doesn't adversely impact non-NUMA systems
> 
It should not affect the ITS hardware behavior on a non-NUMA system, since
software always allocates memory from a single (default) NUMA node.
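One supporting detail (paraphrased from the generic gfp helpers of that era,
for illustration only): if I read the probe paths right, when firmware
provides no proximity information, its->numa_node ends up as NUMA_NO_NODE, and
alloc_pages_node() then falls back to the local memory node, i.e. the same
behaviour as the old __get_free_pages() call:

 /* Approximate shape of the helper in include/linux/gfp.h. */
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                             unsigned int order)
 {
         if (nid == NUMA_NO_NODE)
                 nid = numa_mem_id();    /* closest memory node to this CPU */

         return __alloc_pages_node(nid, gfp_mask, order);
 }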

> I can deal with (2), but I have no way of evaluating (1), mostly for the
> lack of an infrastructure exercising multiple ITSs at the same time.
> 
I agree with you, but it will take some time for me to provide the test results.

> Thanks,
> 
> 	M.
>
Ganapatrao Kulkarni July 3, 2017, 4:51 p.m. UTC | #9
On Mon, Jul 3, 2017 at 7:07 PM, Shanker Donthineni
<shankerd@codeaurora.org> wrote:
> Hi Ganapatraoo,
>
> On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote:
>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>> <gpkulkarni@gmail.com> wrote:
>>> Hi Shanker,
>>>
>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>> <shankerd@codeaurora.org> wrote:
>>>> The NUMA node information is visible to ITS driver but not being used
>>>> other than handling errata. This patch allocates the memory for ITS
>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>> aware functions.
>>
>> IMHO, the description would have been more constructive?
>>
>> "All ITS tables are mapped by default to NODE 0 memory.
>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>
>
> I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'? Kernel runtime memory allocation comes from the corresponding NUMA node of the CPU that being called.

IIRC, ITT tables can end up on any CPU's node if drivers are not built-in and
MSI-X vectors are not initialised in init/probe, etc. However, the Device
table and the Collection table are allocated from node 0 memory during ITS
driver init by the boot CPU (CPU0).


thanks
Ganapat
Shanker Donthineni July 3, 2017, 6:25 p.m. UTC | #10
Hi Ganapatrao,

On 07/03/2017 11:51 AM, Ganapatrao Kulkarni wrote:
> On Mon, Jul 3, 2017 at 7:07 PM, Shanker Donthineni
> <shankerd@codeaurora.org> wrote:
>> Hi Ganapatraoo,
>>
>> On 06/29/2017 10:01 PM, Ganapatrao Kulkarni wrote:
>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>> <gpkulkarni@gmail.com> wrote:
>>>> Hi Shanker,
>>>>
>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>> <shankerd@codeaurora.org> wrote:
>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>> aware functions.
>>>
>>> IMHO, the description would have been more constructive?
>>>
>>> "All ITS tables are mapped by default to NODE 0 memory.
>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>
>>
>> I don't understand why you are saying 'All ITS tables are mapped by default to NODE 0 memory'? Kernel runtime memory allocation comes from the corresponding NUMA node of the CPU that being called.
> 
> IIRC, ITT tables can be on any cpu, if drivers are not built-in and
> msi-x vectors are not initialised in init/probe, etc
> however Device table and Collection table are allocated from node 0
> memory during ITS driver init by boot-cpu/cpu0.
> 

Don't confuse the boot CPU (aka logical CPU0) with NUMA node 0; there is no
requirement that the boot CPU has to be tied to NUMA node 0. The memory for
the first-level device table, the prop table and the collection table is
allocated from the boot CPU's NUMA node, not from NUMA node 0.

All other memory allocations inside the ITS driver can come from any NUMA
node, depending on which CPU calls the memory allocation functions at runtime.
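If it helps settle where the tables actually land, a hypothetical debug
snippet (not part of the patch) could print the node of each BASER table after
probe:

 int i;

 for (i = 0; i < GITS_BASER_NR_REGS; i++) {
         void *base = its->tables[i].base;

         if (base)
                 pr_info("ITS@%pa: table %d is on node %d\n",
                         &its->phys_base, i,
                         page_to_nid(virt_to_page(base)));
 }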
Ganapatrao Kulkarni July 10, 2017, 8:48 a.m. UTC | #11
Hi Marc,

On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> Hi Shanker,
>
> On 03/07/17 15:24, Shanker Donthineni wrote:
>> Hi Marc,
>>
>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>> <gpkulkarni@gmail.com> wrote:
>>>>> Hi Shanker,
>>>>>
>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>> <shankerd@codeaurora.org> wrote:
>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>> aware functions.
>>>>
>>>> IMHO, the description would have been more constructive?
>>>>
>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>
>>> But more importantly, I'd like to see figures showing the actual benefit
>>> of this per-node allocation. Given that both of you guys have access to
>>> such platforms, please show me the numbers!
>>>
>>
>> I'll share the actual results which shows the improvement whenever
>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>> support multi socket configuration to capture results and share with you.
>>
>> Do you see any other issues with this patch apart from the performance
>> improvements. I strongly believe this brings the noticeable improvement
>> in numbers on systems where it has multi node memory/CPU configuration.
>
> I agree that it *could* show an improvement, but it very much depends on
> how often the ITS misses in its caches. For this kind of patches, I want
> to see two things:
>
> 1) It brings a measurable benefit on NUMA platforms

We did some measurements of interrupt response time for LPIs, and we don't see
any major improvement due to caching of the tables. However, we have seen
improvements of around 5%. IMO, we should merge this patch to have NUMA-aware
allocations and to avoid unwanted inter-node transactions.

Tested-by: Ganapatrao Kulkarni <ganapatrao.kulkarni@cavium.com>

> 2) it doesn't adversely impact non-NUMA systems
AFAIK, there is no impact on non-NUMA systems or on single-node NUMA systems.

>
> I can deal with (2), but I have no way of evaluating (1), mostly for the
> lack of an infrastructure exercising multiple ITSs at the same time.
>
> Thanks,
>
>         M.
> --
> Jazz is not dead. It just smells funny...

thanks
Ganapat
Marc Zyngier July 10, 2017, 9:06 a.m. UTC | #12
On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
> Hi Marc,
> 
> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> Hi Shanker,
>>
>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>> Hi Marc,
>>>
>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>> Hi Shanker,
>>>>>>
>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>> aware functions.
>>>>>
>>>>> IMHO, the description would have been more constructive?
>>>>>
>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>
>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>> of this per-node allocation. Given that both of you guys have access to
>>>> such platforms, please show me the numbers!
>>>>
>>>
>>> I'll share the actual results which shows the improvement whenever
>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>> support multi socket configuration to capture results and share with you.
>>>
>>> Do you see any other issues with this patch apart from the performance
>>> improvements. I strongly believe this brings the noticeable improvement
>>> in numbers on systems where it has multi node memory/CPU configuration.
>>
>> I agree that it *could* show an improvement, but it very much depends on
>> how often the ITS misses in its caches. For this kind of patches, I want
>> to see two things:
>>
>> 1) It brings a measurable benefit on NUMA platforms
> 
> Did some measurement of interrupt response time for LPIs and we don't
> see any major
> improvement due to caching of Tables. However, we have seen
> improvements of around 5%.

An improvement of what exactly?

	M.
Ganapatrao Kulkarni July 10, 2017, 9:08 a.m. UTC | #13
On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>> Hi Marc,
>>
>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>> Hi Shanker,
>>>
>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>> Hi Marc,
>>>>
>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>> Hi Shanker,
>>>>>>>
>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>> aware functions.
>>>>>>
>>>>>> IMHO, the description would have been more constructive?
>>>>>>
>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>
>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>> such platforms, please show me the numbers!
>>>>>
>>>>
>>>> I'll share the actual results which shows the improvement whenever
>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>> support multi socket configuration to capture results and share with you.
>>>>
>>>> Do you see any other issues with this patch apart from the performance
>>>> improvements. I strongly believe this brings the noticeable improvement
>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>
>>> I agree that it *could* show an improvement, but it very much depends on
>>> how often the ITS misses in its caches. For this kind of patches, I want
>>> to see two things:
>>>
>>> 1) It brings a measurable benefit on NUMA platforms
>>
>> Did some measurement of interrupt response time for LPIs and we don't
>> see any major
>> improvement due to caching of Tables. However, we have seen
>> improvements of around 5%.
>
> An improvement of what exactly?

interrupt response time.
>
>         M.
> --
> Jazz is not dead. It just smells funny...

thanks
Ganapat
Marc Zyngier July 10, 2017, 9:23 a.m. UTC | #14
On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>> Hi Marc,
>>>
>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>> Hi Shanker,
>>>>
>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>> Hi Marc,
>>>>>
>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>> Hi Shanker,
>>>>>>>>
>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>> aware functions.
>>>>>>>
>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>
>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>
>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>> such platforms, please show me the numbers!
>>>>>>
>>>>>
>>>>> I'll share the actual results which shows the improvement whenever
>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>> support multi socket configuration to capture results and share with you.
>>>>>
>>>>> Do you see any other issues with this patch apart from the performance
>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>
>>>> I agree that it *could* show an improvement, but it very much depends on
>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>> to see two things:
>>>>
>>>> 1) It brings a measurable benefit on NUMA platforms
>>>
>>> Did some measurement of interrupt response time for LPIs and we don't
>>> see any major
>>> improvement due to caching of Tables. However, we have seen
>>> improvements of around 5%.
>>
>> An improvement of what exactly?
> 
> interrupt response time.

Measured how? On which HW? Using which benchmark?

Give me the actual benchmark results. Don't expect me to accept this
kind of hand-wavy statement.

	M.
Ganapatrao Kulkarni July 10, 2017, 10:21 a.m. UTC | #15
Hi Marc,

On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>>> Hi Marc,
>>>>
>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>> Hi Shanker,
>>>>>
>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>>> Hi Marc,
>>>>>>
>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>>> Hi Shanker,
>>>>>>>>>
>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>>> aware functions.
>>>>>>>>
>>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>>
>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>>
>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>>> such platforms, please show me the numbers!
>>>>>>>
>>>>>>
>>>>>> I'll share the actual results which shows the improvement whenever
>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>>> support multi socket configuration to capture results and share with you.
>>>>>>
>>>>>> Do you see any other issues with this patch apart from the performance
>>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>>
>>>>> I agree that it *could* show an improvement, but it very much depends on
>>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>>> to see two things:
>>>>>
>>>>> 1) It brings a measurable benefit on NUMA platforms
>>>>
>>>> Did some measurement of interrupt response time for LPIs and we don't
>>>> see any major
>>>> improvement due to caching of Tables. However, we have seen
>>>> improvements of around 5%.
>>>
>>> An improvement of what exactly?
>>
>> interrupt response time.
>
> Measured how? On which HW? Using which benchmark?

This has been tested on ThunderX2.
We have instrumented the gic-v3-its driver code to create a dummy LPI
device with a few vectors.
The LPI is induced from the dummy device (through sysfs, by writing to
the GITS_TRANSLATER register).
The ISR routine (gic_handle_irq) is then called to handle the induced LPI.
A NODE 1 CPU is used to induce the LPI, and a NODE 1 CPU/collection is
mapped in the ITT to route this LPI.

The CPU timer counter is sampled at the time the LPI is induced and
again in the ISR routine to calculate the interrupt response time.
The results show an improvement of 5% with this patch.
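
A rough sketch of that kind of measurement, assuming an arm64 kernel
context, a mapped GITS_TRANSLATER page and a hypothetical test interrupt
handler (illustrative only, not the actual instrumentation used above):

#include <linux/interrupt.h>
#include <linux/io.h>
#include <asm/barrier.h>
#include <asm/sysreg.h>

static u64 lpi_t_start; /* counter value sampled when the LPI is induced */

/* Hypothetical trigger: write the event ID to the ITS GITS_TRANSLATER. */
static void induce_test_lpi(void __iomem *gits_translater, u32 event_id)
{
        isb();
        lpi_t_start = read_sysreg(cntvct_el0);  /* generic timer counter */
        writel_relaxed(event_id, gits_translater);
}

/* Hypothetical handler for the dummy device's LPI. */
static irqreturn_t test_lpi_handler(int irq, void *data)
{
        u64 t_end, *delta_ticks = data;

        isb();
        t_end = read_sysreg(cntvct_el0);
        *delta_ticks = t_end - lpi_t_start;     /* response time in timer ticks */

        return IRQ_HANDLED;
}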

Do you have any recommended benchmarks to test the same?

>
> Give me the actual benchmark results. Don't expect me to accept this
> kind of hand-wavy statement.
>
>         M.
> --
> Jazz is not dead. It just smells funny...

thanks
Ganapat
Shanker Donthineni July 10, 2017, 12:30 p.m. UTC | #16
Marc,

Do you have any other concerns about taking this patch?


On 07/10/2017 05:21 AM, Ganapatrao Kulkarni wrote:
> Hi Marc,
> 
> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>>>> Hi Marc,
>>>>>
>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>>> Hi Shanker,
>>>>>>
>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>>>> Hi Marc,
>>>>>>>
>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>>>> Hi Shanker,
>>>>>>>>>>
>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>>>> aware functions.
>>>>>>>>>
>>>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>>>
>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>>>
>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>>>> such platforms, please show me the numbers!
>>>>>>>>
>>>>>>>
>>>>>>> I'll share the actual results which shows the improvement whenever
>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>>>> support multi socket configuration to capture results and share with you.
>>>>>>>
>>>>>>> Do you see any other issues with this patch apart from the performance
>>>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>>>
>>>>>> I agree that it *could* show an improvement, but it very much depends on
>>>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>>>> to see two things:
>>>>>>
>>>>>> 1) It brings a measurable benefit on NUMA platforms
>>>>>
>>>>> Did some measurement of interrupt response time for LPIs and we don't
>>>>> see any major
>>>>> improvement due to caching of Tables. However, we have seen
>>>>> improvements of around 5%.
>>>>
>>>> An improvement of what exactly?
>>>
>>> interrupt response time.
>>
>> Measured how? On which HW? Using which benchmark?
> 
> This has been tested on ThunderX2.
> We have instrumented gic-v3-its driver code to create dummy LPI device
> with few vectors.
> The LPI is induced from dummy device(through sysfs by writing to
> TRANSLATOR reg).
> The ISR routine(gic_handle_irq) being called to handle the induced LPI.
> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped
> in ITT to route this LPI.
> 
> CPU timer counter are sampled at the time LPI is Induced and in ISR
> routine to calculate interrupt response time.
> the result shown improvement of 5% with this patch.
> 
> Do you have any recommended benchmarks to test the same?
> 

Ganapatrao,

Thanks for your efforts instrumenting the ITS driver code to show the 5%
interrupt performance improvement on ThunderX2 hardware. Actually, the
current ITS driver is not consistent in how it allocates memory for the
ITS/GICR tables: the GICR pending tables end up on the corresponding NUMA
node based on CPU proximity, but the other tables do not.
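
To illustrate the principle being referred to (not the exact upstream
code paths): a per-CPU table naturally lands on the right node when it is
allocated against that CPU's node, whereas a node-agnostic allocation
follows whichever node the probing CPU happens to run on.

#include <linux/gfp.h>
#include <linux/topology.h>

/*
 * Illustration only: allocate a per-CPU table on the CPU's home node,
 * the way one would want the redistributor-side tables placed.
 */
static struct page *alloc_table_for_cpu(int cpu, unsigned int order)
{
        return alloc_pages_node(cpu_to_node(cpu),
                                GFP_KERNEL | __GFP_ZERO, order);
}

/*
 * By contrast, a plain __get_free_pages(GFP_KERNEL | __GFP_ZERO, order)
 * done at probe time follows the default policy of the probing CPU,
 * which is typically node 0 during boot.
 */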


>>
>> Give me the actual benchmark results. Don't expect me to accept this
>> kind of hand-wavy statement.
>>
>>         M.
>> --
>> Jazz is not dead. It just smells funny...
> 
> thanks
> Ganapat
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Marc Zyngier July 10, 2017, 1:50 p.m. UTC | #17
On 10/07/17 11:21, Ganapatrao Kulkarni wrote:
> Hi Marc,
> 
> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>>>> Hi Marc,
>>>>>
>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>>> Hi Shanker,
>>>>>>
>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>>>> Hi Marc,
>>>>>>>
>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>>>> Hi Shanker,
>>>>>>>>>>
>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>>>> aware functions.
>>>>>>>>>
>>>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>>>
>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>>>
>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>>>> such platforms, please show me the numbers!
>>>>>>>>
>>>>>>>
>>>>>>> I'll share the actual results which shows the improvement whenever
>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>>>> support multi socket configuration to capture results and share with you.
>>>>>>>
>>>>>>> Do you see any other issues with this patch apart from the performance
>>>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>>>
>>>>>> I agree that it *could* show an improvement, but it very much depends on
>>>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>>>> to see two things:
>>>>>>
>>>>>> 1) It brings a measurable benefit on NUMA platforms
>>>>>
>>>>> Did some measurement of interrupt response time for LPIs and we don't
>>>>> see any major
>>>>> improvement due to caching of Tables. However, we have seen
>>>>> improvements of around 5%.
>>>>
>>>> An improvement of what exactly?
>>>
>>> interrupt response time.
>>
>> Measured how? On which HW? Using which benchmark?
> 
> This has been tested on ThunderX2.
> We have instrumented gic-v3-its driver code to create dummy LPI device
> with few vectors.
> The LPI is induced from dummy device(through sysfs by writing to
> TRANSLATOR reg).
> The ISR routine(gic_handle_irq) being called to handle the induced LPI.
> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped
> in ITT to route this LPI.
> 
> CPU timer counter are sampled at the time LPI is Induced and in ISR
> routine to calculate interrupt response time.
> the result shown improvement of 5% with this patch.

And you call that a realistic measurement of the latency? Really? Sorry,
but I cannot take you seriously here.

> Do you have any recommended benchmarks to test the same?

Run a standard benchmark such as netperf, post the result with and
without that patch. The above is just plain ridiculous.

	M.
Marc Zyngier July 10, 2017, 1:53 p.m. UTC | #18
On 10/07/17 13:30, Shanker Donthineni wrote:
> Marc,
> 
> Do you have any other concerns taking this patch?

Plenty. All I have seen so far is a wet finger in the air and the claim
that it reduces "something" by 5%, without any actual figure.

Sorry, I'm not buying it without people putting in the effort to show me
that this is worth it.

Thanks,

	M.
Shanker Donthineni July 10, 2017, 2:57 p.m. UTC | #19
Hi Marc,

On 07/10/2017 08:50 AM, Marc Zyngier wrote:
> On 10/07/17 11:21, Ganapatrao Kulkarni wrote:
>> Hi Marc,
>>
>> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
>>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>>>>> Hi Marc,
>>>>>>
>>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>>>> Hi Shanker,
>>>>>>>
>>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>>>>> Hi Marc,
>>>>>>>>
>>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>>>>> Hi Shanker,
>>>>>>>>>>>
>>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>>>>> aware functions.
>>>>>>>>>>
>>>>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>>>>
>>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>>>>
>>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>>>>> such platforms, please show me the numbers!
>>>>>>>>>
>>>>>>>>
>>>>>>>> I'll share the actual results which shows the improvement whenever
>>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>>>>> support multi socket configuration to capture results and share with you.
>>>>>>>>
>>>>>>>> Do you see any other issues with this patch apart from the performance
>>>>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>>>>
>>>>>>> I agree that it *could* show an improvement, but it very much depends on
>>>>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>>>>> to see two things:
>>>>>>>
>>>>>>> 1) It brings a measurable benefit on NUMA platforms
>>>>>>
>>>>>> Did some measurement of interrupt response time for LPIs and we don't
>>>>>> see any major
>>>>>> improvement due to caching of Tables. However, we have seen
>>>>>> improvements of around 5%.
>>>>>
>>>>> An improvement of what exactly?
>>>>
>>>> interrupt response time.
>>>
>>> Measured how? On which HW? Using which benchmark?
>>
>> This has been tested on ThunderX2.
>> We have instrumented gic-v3-its driver code to create dummy LPI device
>> with few vectors.
>> The LPI is induced from dummy device(through sysfs by writing to
>> TRANSLATOR reg).
>> The ISR routine(gic_handle_irq) being called to handle the induced LPI.
>> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped
>> in ITT to route this LPI.
>>
>> CPU timer counter are sampled at the time LPI is Induced and in ISR
>> routine to calculate interrupt response time.
>> the result shown improvement of 5% with this patch.
> 
> And you call that a realistic measurement of the latency? Really? Sorry,
> but I cannot take you seriously here.
> 
>> Do you have any recommended benchmarks to test the same?
> 
> Run a standard benchmark such as netperf, post the result with and
> without that patch. The above is just plain ridiculous.
>

The whole purpose of the ACPI subtable "GIC Interrupt Translation Service
(ITS) Affinity Structure" is to provide proximity information to the OS so
that software can take advantage of NUMA-aware allocations to improve the
read latency of the ITS/GICR tables, not just to implement software
workarounds.
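
For context, the driver of this era already derives a node ID from that
SRAT subtable at probe time; a simplified sketch of such a lookup (names
approximate the contemporary driver, so treat the details as an
approximation) could look like this:

#include <linux/numa.h>
#include <linux/types.h>

/*
 * Simplified sketch: map a GIC ITS ID to a NUMA node using the SRAT
 * "ITS Affinity" entries cached at early boot.
 */
struct its_srat_map {
        u32 numa_node;  /* node derived from the proximity domain */
        u32 its_id;     /* GIC ITS ID from the SRAT subtable */
};

static struct its_srat_map its_srat_maps[MAX_NUMNODES];
static int its_in_srat;

static int acpi_get_its_numa_node(u32 its_id)
{
        int i;

        for (i = 0; i < its_in_srat; i++) {
                if (its_id == its_srat_maps[i].its_id)
                        return its_srat_maps[i].numa_node;
        }

        return NUMA_NO_NODE;    /* no affinity entry: fall back to local */
}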

 
I believe the ITS driver should provide NUMA-aware allocations just like
x86 Linux drivers do. How much performance improvement we observe depends
on the individual SoC implementation, inter-node latency, inter-node
traffic, cache capacity, and the type of test used to measure the results.

Please consider this patch irrespective of the test results on any
specific hardware. We need this patch for upcoming Qualcomm server chips.


 
> 	M.
>
Marc Zyngier July 10, 2017, 3:15 p.m. UTC | #20
On 10/07/17 15:57, Shanker Donthineni wrote:
> Hi Marc,
> 
> On 07/10/2017 08:50 AM, Marc Zyngier wrote:
>> On 10/07/17 11:21, Ganapatrao Kulkarni wrote:
>>> Hi Marc,
>>>
>>> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
>>>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
>>>>>>> Hi Marc,
>>>>>>>
>>>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>>>>> Hi Shanker,
>>>>>>>>
>>>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
>>>>>>>>> Hi Marc,
>>>>>>>>>
>>>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
>>>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
>>>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
>>>>>>>>>>> <gpkulkarni@gmail.com> wrote:
>>>>>>>>>>>> Hi Shanker,
>>>>>>>>>>>>
>>>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
>>>>>>>>>>>> <shankerd@codeaurora.org> wrote:
>>>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
>>>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
>>>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
>>>>>>>>>>>>> aware functions.
>>>>>>>>>>>
>>>>>>>>>>> IMHO, the description would have been more constructive?
>>>>>>>>>>>
>>>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
>>>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
>>>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
>>>>>>>>>>
>>>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
>>>>>>>>>> of this per-node allocation. Given that both of you guys have access to
>>>>>>>>>> such platforms, please show me the numbers!
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I'll share the actual results which shows the improvement whenever
>>>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
>>>>>>>>> support multi socket configuration to capture results and share with you.
>>>>>>>>>
>>>>>>>>> Do you see any other issues with this patch apart from the performance
>>>>>>>>> improvements. I strongly believe this brings the noticeable improvement
>>>>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
>>>>>>>>
>>>>>>>> I agree that it *could* show an improvement, but it very much depends on
>>>>>>>> how often the ITS misses in its caches. For this kind of patches, I want
>>>>>>>> to see two things:
>>>>>>>>
>>>>>>>> 1) It brings a measurable benefit on NUMA platforms
>>>>>>>
>>>>>>> Did some measurement of interrupt response time for LPIs and we don't
>>>>>>> see any major
>>>>>>> improvement due to caching of Tables. However, we have seen
>>>>>>> improvements of around 5%.
>>>>>>
>>>>>> An improvement of what exactly?
>>>>>
>>>>> interrupt response time.
>>>>
>>>> Measured how? On which HW? Using which benchmark?
>>>
>>> This has been tested on ThunderX2.
>>> We have instrumented gic-v3-its driver code to create dummy LPI device
>>> with few vectors.
>>> The LPI is induced from dummy device(through sysfs by writing to
>>> TRANSLATOR reg).
>>> The ISR routine(gic_handle_irq) being called to handle the induced LPI.
>>> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped
>>> in ITT to route this LPI.
>>>
>>> CPU timer counter are sampled at the time LPI is Induced and in ISR
>>> routine to calculate interrupt response time.
>>> the result shown improvement of 5% with this patch.
>>
>> And you call that a realistic measurement of the latency? Really? Sorry,
>> but I cannot take you seriously here.
>>
>>> Do you have any recommended benchmarks to test the same?
>>
>> Run a standard benchmark such as netperf, post the result with and
>> without that patch. The above is just plain ridiculous.
>>
> 
> The whole purpose of ACPI subtable "GIC Interrupt Translation Service (ITS) Affinity structure" 
> is to provide the proximity information to OS so that software will take advantage of NUMA
> aware allocations to improve the read latency of ITS/GICR tables, not just for implementing 
> software workarounds.
> 
>  
> I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much
> performance improvement we observer is based on the individual SOC implementation, inter NODE
> latency, inter node traffic, cache capacity, and type of the test used to measure results.
> 
> Please consider this patch irrespective of the test results running on a specific hardware. We
> need this patch for upcoming Qualcomm server chips. 

"I believe" and "We need" are not a proof of the usefulness of this. We
can argue all day, or you can provide a set of convincing results. Your
choice. But I can guarantee you the the latter is a much better method
than the former.

If you (or Cavium) cannot be bothered to provide tangible results showing that
this is useful, why should I take this at face value? This is just like
any other improvement we make to the kernel. We back it *with data*.

	M.
Jayachandran C July 11, 2017, 8:48 a.m. UTC | #21
Hi Marc,

On Mon, Jul 10, 2017 at 04:15:28PM +0100, Marc Zyngier wrote:
> On 10/07/17 15:57, Shanker Donthineni wrote:
> > Hi Marc,
> > 
> > On 07/10/2017 08:50 AM, Marc Zyngier wrote:
> >> On 10/07/17 11:21, Ganapatrao Kulkarni wrote:
> >>> Hi Marc,
> >>>
> >>> On Mon, Jul 10, 2017 at 2:53 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> >>>> On 10/07/17 10:08, Ganapatrao Kulkarni wrote:
> >>>>> On Mon, Jul 10, 2017 at 2:36 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> >>>>>> On 10/07/17 09:48, Ganapatrao Kulkarni wrote:
> >>>>>>> Hi Marc,
> >>>>>>>
> >>>>>>> On Mon, Jul 3, 2017 at 8:23 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> >>>>>>>> Hi Shanker,
> >>>>>>>>
> >>>>>>>> On 03/07/17 15:24, Shanker Donthineni wrote:
> >>>>>>>>> Hi Marc,
> >>>>>>>>>
> >>>>>>>>> On 06/30/2017 03:51 AM, Marc Zyngier wrote:
> >>>>>>>>>> On 30/06/17 04:01, Ganapatrao Kulkarni wrote:
> >>>>>>>>>>> On Fri, Jun 30, 2017 at 8:04 AM, Ganapatrao Kulkarni
> >>>>>>>>>>> <gpkulkarni@gmail.com> wrote:
> >>>>>>>>>>>> Hi Shanker,
> >>>>>>>>>>>>
> >>>>>>>>>>>> On Sun, Jun 25, 2017 at 9:16 PM, Shanker Donthineni
> >>>>>>>>>>>> <shankerd@codeaurora.org> wrote:
> >>>>>>>>>>>>> The NUMA node information is visible to ITS driver but not being used
> >>>>>>>>>>>>> other than handling errata. This patch allocates the memory for ITS
> >>>>>>>>>>>>> tables from the corresponding NUMA node using the appropriate NUMA
> >>>>>>>>>>>>> aware functions.
> >>>>>>>>>>>
> >>>>>>>>>>> IMHO, the description would have been more constructive?
> >>>>>>>>>>>
> >>>>>>>>>>> "All ITS tables are mapped by default to NODE 0 memory.
> >>>>>>>>>>> Adding changes to allocate memory from respective NUMA NODES of ITS devices.
> >>>>>>>>>>> This will optimize tables access and avoids unnecessary inter-node traffic."
> >>>>>>>>>>
> >>>>>>>>>> But more importantly, I'd like to see figures showing the actual benefit
> >>>>>>>>>> of this per-node allocation. Given that both of you guys have access to
> >>>>>>>>>> such platforms, please show me the numbers!
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> I'll share the actual results which shows the improvement whenever
> >>>>>>>>> available on our next chips. Current version of Qualcomm qdf2400 doesn't
> >>>>>>>>> support multi socket configuration to capture results and share with you.
> >>>>>>>>>
> >>>>>>>>> Do you see any other issues with this patch apart from the performance
> >>>>>>>>> improvements. I strongly believe this brings the noticeable improvement
> >>>>>>>>> in numbers on systems where it has multi node memory/CPU configuration.
> >>>>>>>>
> >>>>>>>> I agree that it *could* show an improvement, but it very much depends on
> >>>>>>>> how often the ITS misses in its caches. For this kind of patches, I want
> >>>>>>>> to see two things:
> >>>>>>>>
> >>>>>>>> 1) It brings a measurable benefit on NUMA platforms
> >>>>>>>
> >>>>>>> Did some measurement of interrupt response time for LPIs and we don't
> >>>>>>> see any major
> >>>>>>> improvement due to caching of Tables. However, we have seen
> >>>>>>> improvements of around 5%.
> >>>>>>
> >>>>>> An improvement of what exactly?
> >>>>>
> >>>>> interrupt response time.
> >>>>
> >>>> Measured how? On which HW? Using which benchmark?
> >>>
> >>> This has been tested on ThunderX2.
> >>> We have instrumented gic-v3-its driver code to create dummy LPI device
> >>> with few vectors.
> >>> The LPI is induced from dummy device(through sysfs by writing to
> >>> TRANSLATOR reg).
> >>> The ISR routine(gic_handle_irq) being called to handle the induced LPI.
> >>> NODE 1 cpu is used to induce LPI and NODE 1 cpu/collection is mapped
> >>> in ITT to route this LPI.
> >>>
> >>> CPU timer counter are sampled at the time LPI is Induced and in ISR
> >>> routine to calculate interrupt response time.
> >>> the result shown improvement of 5% with this patch.
> >>
> >> And you call that a realistic measurement of the latency? Really? Sorry,
> >> but I cannot take you seriously here.
> >>
> >>> Do you have any recommended benchmarks to test the same?
> >>
> >> Run a standard benchmark such as netperf, post the result with and
> >> without that patch. The above is just plain ridiculous.
> >>
> > 
> > The whole purpose of ACPI subtable "GIC Interrupt Translation Service (ITS) Affinity structure" 
> > is to provide the proximity information to OS so that software will take advantage of NUMA
> > aware allocations to improve the read latency of ITS/GICR tables, not just for implementing 
> > software workarounds.
> > 
> >  
> > I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much
> > performance improvement we observer is based on the individual SOC implementation, inter NODE
> > latency, inter node traffic, cache capacity, and type of the test used to measure results.
> > 
> > Please consider this patch irrespective of the test results running on a specific hardware. We
> > need this patch for upcoming Qualcomm server chips. 
> 
> "I believe" and "We need" are not a proof of the usefulness of this. We
> can argue all day, or you can provide a set of convincing results. Your
> choice. But I can guarantee you the the latter is a much better method
> than the former.
> 
> If you (or Cavium) cannot be bothered to provide tangible results that
> this is useful, why should I take this at face value? This is just like
> any other improvement we make to the kernel. We back it *with data*.

At Cavium, most of the ThunderX2 boards we have are multi-node, and we
are interested in enabling NUMA optimizations.

But, in this case, we do not see (or expect to see - given the nature of
access) any significant improvement in any standard benchmark. Ganapat's
LPI injection test to find interrupt latency was probably the best option
we had so far. We could come up with another contrived test case to see
if there is any change in behavior when we overload the interconnect,
but I don't think we will get any data to really justify the patch.

Allocating the tables on the node is a good thing since it avoids
unnecessary traffic over the interconnect, so I do not see the
problem in merging a simple patch for that. Is there any specific
issue here?

Anyway, for ThunderX2, the patch is good to have, but not critical.
And as Ganapat noted, the patch can be improved a bit. Also, going through
the patch, I think the chip data is better allocated on the ITS's node as well.
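
A minimal sketch of that suggestion, assuming the per-ITS bookkeeping
structure (struct its_node, used as chip data) is what gets moved to a
node-local allocation in its_probe_one() (illustrative, not a tested
change):

#include <linux/slab.h>

/*
 * Illustrative only: allocate the per-ITS structure on the ITS's own
 * node instead of with a plain kzalloc(). 'numa_node' is the node id
 * already passed to its_probe_one().
 */
static struct its_node *its_alloc_node_data(int numa_node)
{
        struct its_node *its;

        its = kzalloc_node(sizeof(*its), GFP_KERNEL, numa_node);
        if (!its)
                return NULL;

        its->numa_node = numa_node;

        return its;
}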

JC.
Richter, Robert July 13, 2017, 3:40 p.m. UTC | #22
On 11.07.17 08:48:56, Jayachandran C wrote:
> On Mon, Jul 10, 2017 at 04:15:28PM +0100, Marc Zyngier wrote:
> > On 10/07/17 15:57, Shanker Donthineni wrote:

> > > I believe ITS driver should provide NUMA aware allocations just like x86 Linux drivers. How much
> > > performance improvement we observer is based on the individual SOC implementation, inter NODE
> > > latency, inter node traffic, cache capacity, and type of the test used to measure results.
> > > 
> > > Please consider this patch irrespective of the test results running on a specific hardware. We
> > > need this patch for upcoming Qualcomm server chips. 
> > 
> > "I believe" and "We need" are not a proof of the usefulness of this. We
> > can argue all day, or you can provide a set of convincing results. Your
> > choice. But I can guarantee you the the latter is a much better method
> > than the former.
> > 
> > If you (or Cavium) cannot be bothered to provide tangible results that
> > this is useful, why should I take this at face value? This is just like
> > any other improvement we make to the kernel. We back it *with data*.
> 
> At Cavium, most of the ThunderX2 boards we have are multi-node, and we
> are interested in enabling NUMA optimizations.
> 
> But, in this case, we do not see (or expect to see - given the nature of
> access) any significant improvement in any standard benchmark. Ganapat's
> LPI injection test to find interrupt latency was probably the best option
> we had so far. We could come up with another contrived test case to see
> if there is any change in behavior when we overload the interconnect,
> but I don't think we will get any data to really justify the patch.
> 
> Allocating the tables on the node is a good thing since it avoids
> unnecessary traffic over the interconnect, so I do not see the
> problem in merging a simple patch for that. Is there any specific
> issue here?
> 
> Anyway, for ThunderX2, the patch is good to have, but not critical.
> And as Ganapat noted, the patch can be improved a bit. Also going thru
> the patch, I think the chip data is better allocated using node as well.

There is another thing to consider here. We will need CMA and devm for
the ITS. There are only a few per-node allocation functions that can be
used then, so per-node allocation should only be used in the rare cases
where it is really needed. I am going to repost my CMA device table
allocation series after the merge window closes.
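
As a rough sketch of the direction described here (assumption: a
device-based allocation whose node affinity comes from the struct device
rather than from an explicit node argument; the helper name is
illustrative):

#include <linux/device.h>
#include <linux/gfp.h>

/*
 * Illustration only: when the allocation is tied to a struct device,
 * the node can be taken from the device itself, which approximates how
 * device-managed (devm/CMA-style) allocators handle placement.
 */
static struct page *its_alloc_pages_for_dev(struct device *dev, gfp_t gfp,
                                            unsigned int order)
{
        return alloc_pages_node(dev_to_node(dev), gfp, order);
}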

-Robert
diff mbox

Patch

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index fed99c5..e45add2 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -860,8 +860,8 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 	u64 val = its_read_baser(its, baser);
 	u64 esz = GITS_BASER_ENTRY_SIZE(val);
 	u64 type = GITS_BASER_TYPE(val);
+	struct page *page;
 	u32 alloc_pages;
-	void *base;
 	u64 tmp;
 
 retry_alloc_baser:
@@ -874,12 +874,12 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		order = get_order(GITS_BASER_PAGES_MAX * psz);
 	}
 
-	base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!base)
+	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
+	if (!page)
 		return -ENOMEM;
 
 retry_baser:
-	val = (virt_to_phys(base)				 |
+	val = (page_to_phys(page)				 |
 		(type << GITS_BASER_TYPE_SHIFT)			 |
 		((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT)	 |
 		((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT)	 |
@@ -915,7 +915,8 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		shr = tmp & GITS_BASER_SHAREABILITY_MASK;
 		if (!shr) {
 			cache = GITS_BASER_nC;
-			gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
+			gic_flush_dcache_to_poc(page_to_virt(page),
+						PAGE_ORDER_TO_SIZE(order));
 		}
 		goto retry_baser;
 	}
@@ -926,7 +927,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		 * size and retry. If we reach 4K, then
 		 * something is horribly wrong...
 		 */
-		free_pages((unsigned long)base, order);
+		__free_pages(page, order);
 		baser->base = NULL;
 
 		switch (psz) {
@@ -943,19 +944,19 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
 		       &its->phys_base, its_base_type_string[type],
 		       val, tmp);
-		free_pages((unsigned long)base, order);
+		__free_pages(page, order);
 		return -ENXIO;
 	}
 
 	baser->order = order;
-	baser->base = base;
+	baser->base = page_to_virt(page);
 	baser->psz = psz;
 	tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz;
 
 	pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n",
 		&its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp),
 		its_base_type_string[type],
-		(unsigned long)virt_to_phys(base),
+		(unsigned long)page_to_phys(page),
 		indirect ? "indirect" : "flat", (int)esz,
 		psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT);
 
@@ -1019,7 +1020,7 @@  static void its_free_tables(struct its_node *its)
 
 	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
 		if (its->tables[i].base) {
-			free_pages((unsigned long)its->tables[i].base,
+			__free_pages(virt_to_page(its->tables[i].base),
 				   its->tables[i].order);
 			its->tables[i].base = NULL;
 		}
@@ -1286,7 +1287,8 @@  static bool its_alloc_device_table(struct its_node *its, u32 dev_id)
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
-		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(baser->psz));
+		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
+					get_order(baser->psz));
 		if (!page)
 			return false;
 
@@ -1332,7 +1334,7 @@  static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 	nr_ites = max(2UL, roundup_pow_of_two(nvecs));
 	sz = nr_ites * its->ite_size;
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt = kzalloc(sz, GFP_KERNEL);
+	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
 	lpi_map = its_lpi_alloc_chunks(nvecs, &lpi_base, &nr_lpis);
 	if (lpi_map)
 		col_map = kzalloc(sizeof(*col_map) * nr_lpis, GFP_KERNEL);
@@ -1677,6 +1679,7 @@  static int __init its_probe_one(struct resource *res,
 {
 	struct its_node *its;
 	void __iomem *its_base;
+	struct page *page;
 	u32 val;
 	u64 baser, tmp;
 	int err;
@@ -1716,12 +1719,13 @@  static int __init its_probe_one(struct resource *res,
 	its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1;
 	its->numa_node = numa_node;
 
-	its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-						get_order(ITS_CMD_QUEUE_SZ));
-	if (!its->cmd_base) {
+	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
+				get_order(ITS_CMD_QUEUE_SZ));
+	if (!page) {
 		err = -ENOMEM;
 		goto out_free_its;
 	}
+	its->cmd_base = page_to_virt(page);
 	its->cmd_write = its->cmd_base;
 
 	its_enable_quirks(its);
@@ -1775,7 +1779,7 @@  static int __init its_probe_one(struct resource *res,
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
-	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
+	__free_pages(virt_to_page(its->cmd_base), get_order(ITS_CMD_QUEUE_SZ));
 out_free_its:
 	kfree(its);
 out_unmap: