diff mbox

[v13,3/6] of, numa: Add NUMA of binding implementation.

Message ID 1456959362-2036-4-git-send-email-ddaney.cavm@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

David Daney March 2, 2016, 10:55 p.m. UTC
From: David Daney <david.daney@cavium.com>

Add device tree parsing for NUMA topology using device
"numa-node-id" property in distance-map and cpu nodes.

This is a complete rewrite of a previous patch by:
   Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>

Signed-off-by: David Daney <david.daney@cavium.com>
---
 drivers/of/Kconfig   |   3 +
 drivers/of/Makefile  |   1 +
 drivers/of/of_numa.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h   |   9 +++
 4 files changed, 213 insertions(+)
 create mode 100644 drivers/of/of_numa.c

Comments

Rob Herring March 3, 2016, 3:34 a.m. UTC | #1
On Wed, Mar 2, 2016 at 4:55 PM, David Daney <ddaney.cavm@gmail.com> wrote:
> From: David Daney <david.daney@cavium.com>
>
> Add device tree parsing for NUMA topology using device
> "numa-node-id" property in distance-map and cpu nodes.
>
> This is a complete rewrite of a previous patch by:
>    Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>
>
> Signed-off-by: David Daney <david.daney@cavium.com>
> ---
>  drivers/of/Kconfig   |   3 +
>  drivers/of/Makefile  |   1 +
>  drivers/of/of_numa.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/of.h   |   9 +++
>  4 files changed, 213 insertions(+)
>  create mode 100644 drivers/of/of_numa.c
>
> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
> index e2a4841..b3bec3a 100644
> --- a/drivers/of/Kconfig
> +++ b/drivers/of/Kconfig
> @@ -112,4 +112,7 @@ config OF_OVERLAY
>           While this option is selected automatically when needed, you can
>           enable it manually to improve device tree unit test coverage.
>
> +config OF_NUMA
> +       bool
> +
>  endif # OF
> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
> index 156c072..bee3fa9 100644
> --- a/drivers/of/Makefile
> +++ b/drivers/of/Makefile
> @@ -14,5 +14,6 @@ obj-$(CONFIG_OF_MTD)  += of_mtd.o
>  obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>  obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>  obj-$(CONFIG_OF_OVERLAY) += overlay.o
> +obj-$(CONFIG_OF_NUMA) += of_numa.o
>
>  obj-$(CONFIG_OF_UNITTEST) += unittest-data/
> diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c
> new file mode 100644
> index 0000000..9727b60
> --- /dev/null
> +++ b/drivers/of/of_numa.c
> @@ -0,0 +1,200 @@
> +/*
> + * OF NUMA Parsing support.
> + *
> + * Copyright (C) 2015 - 2016 Cavium Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/of.h>
> +#include <linux/of_fdt.h>

This can be dropped now.

> +#include <linux/nodemask.h>
> +
> +#include <asm/numa.h>
> +
> +/* define default numa node to 0 */
> +#define DEFAULT_NODE 0
> +
> +/*
> + * Even though we connect cpus to numa domains later in SMP
> + * init, we need to know the node ids now for all cpus.
> +*/
> +static void __init of_find_cpu_nodes(void)

Perhaps of_parse_cpu_nodes for consistency.

Actually, if all the functions were prefixed with "of_numa_" that
would be better.

> +{
> +       u32 nid;
> +       int r;
> +       struct device_node *np = NULL;
> +
> +       for (;;) {
> +               np = of_find_node_by_type(np, "cpu");
> +               if (!np)
> +                       break;

Can't we use the child node iterator for /cpus here?

> +
> +               r = of_property_read_u32(np, "numa-node-id", &nid);
> +               if (r)
> +                       continue;
> +
> +               pr_debug("NUMA: CPU on %u\n", nid);
> +               if (nid >= MAX_NUMNODES)
> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
> +                               nid);
> +               else
> +                       node_set(nid, numa_nodes_parsed);

I'm not sure how this works, but don't you need to match this up with
MPIDR of the cpu here?

> +       }
> +}
> +
> +static void __init of_parse_memory_nodes(void)
> +{
> +       struct device_node *np = NULL;
> +       int na, ns;
> +       const __be32 *prop;
> +       unsigned int psize;
> +       u32 nid;
> +       u64 base, size;
> +       int r;
> +
> +       for (;;) {
> +               np = of_find_node_by_type(np, "memory");
> +               if (!np)
> +                       break;
> +
> +               r = of_property_read_u32(np, "numa-node-id", &nid);
> +               if (r)
> +                       continue;
> +

> +               prop = of_get_property(np, "reg", &psize);
> +               if (!prop)
> +                       continue;
> +
> +               psize /= sizeof(__be32);
> +               na = of_n_addr_cells(np);
> +               ns = of_n_size_cells(np);
> +
> +               if (psize < na + ns) {
> +                       pr_err("NUMA: memory reg property too small\n");
> +                       continue;
> +               }
> +               base = of_read_number(prop, na);
> +               size = of_read_number(prop + na, ns);

You should be able to use of_address_to_resource for all this.

> +
> +               pr_debug("NUMA:  base = %llx len = %llx, node = %u\n",
> +                        base, size, nid);
> +
> +               if (numa_add_memblk(nid, base, size) < 0)
> +                       break;
> +       }
> +
> +       of_node_put(np);
> +}
> +
> +static int __init parse_distance_map_v1(struct device_node *map)
> +{
> +       const __be32 *matrix;
> +       unsigned int matrix_size;
> +       int entry_count;
> +       int i;
> +       int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;

I believe the defaults are for some old DT files. As this is new, it
should rely on explicit #size-cells in the DT.

OTOH, what is the point of using #size-cells at all versus fixing the
sizes to 1 cell? The documentation doesn't indicate that it uses
#size-cells. That also means that the sizes basically follow the cell
size for the memory given that this is at the top-level.

> +
> +       pr_info("NUMA: parsing numa-distance-map-v1\n");
> +
> +       matrix = of_get_property(map, "distance-matrix", &matrix_size);
> +       if (!matrix) {
> +               pr_err("NUMA: No distance-matrix property in distance-map\n");
> +               return -EINVAL;
> +       }
> +
> +       entry_count = matrix_size / (sizeof(__be32) * 3 * nr_size_cells);
> +
> +       for (i = 0; i < entry_count; i++) {
> +               u32 nodea, nodeb, distance;
> +
> +               nodea = of_read_number(matrix, nr_size_cells);
> +               matrix += nr_size_cells;
> +               nodeb = of_read_number(matrix, nr_size_cells);
> +               matrix += nr_size_cells;
> +               distance = of_read_number(matrix, nr_size_cells);
> +               matrix += nr_size_cells;

Assuming you fix this to 1 cell, you can use
of_property_count_u32_elems and of_property_read_u32_array.

> +
> +               numa_set_distance(nodea, nodeb, distance);
> +               pr_debug("NUMA:  distance[node%d -> node%d] = %d\n",
> +                        nodea, nodeb, distance);
> +
> +               /* Set default distance of node B->A same as A->B */
> +               if (nodeb > nodea)
> +                       numa_set_distance(nodeb, nodea, distance);
> +       }
> +
> +       return 0;
> +}
> +
> +static int __init of_parse_distance_map(void)
> +{
> +       int ret = -EINVAL;
> +       struct device_node *np = of_find_node_by_name(NULL, "distance-map");
> +
> +       if (!np)
> +               return ret;
> +
> +       if (of_device_is_compatible(np, "numa-distance-map-v1")) {

You can use of_find_compatible_node() instead of these 2 calls.

> +               ret = parse_distance_map_v1(np);
> +               goto out;
> +       }
> +
> +       pr_err("NUMA: invalid distance-map device node\n");
> +out:
> +       of_node_put(np);
> +       return ret;
> +}
> +
> +int of_node_to_nid(struct device_node *device)
> +{
> +       struct device_node *np;
> +       u32 nid;
> +       int r = -ENODATA;
> +
> +       np = of_node_get(device);
> +
> +       while (np) {
> +               struct device_node *parent;
> +
> +               r = of_property_read_u32(np, "numa-node-id", &nid);
> +               if (r != -EINVAL)

You want to break for other err values?

> +                       break;
> +
> +               /* property doesn't exist in this node, look in parent */
> +               parent = of_get_parent(np);
> +               of_node_put(np);
> +               np = parent;
> +       }
> +       if (np && r)
> +               pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n",
> +                       np->name);
> +       of_node_put(np);
> +
> +       if (!r) {
> +               if (nid >= MAX_NUMNODES)
> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
> +                               nid);
> +               else
> +                       return nid;
> +       }
> +
> +       return NUMA_NO_NODE;
> +}

Needs to be exported?

> +
> +int __init of_numa_init(void)
> +{
> +       of_find_cpu_nodes();
> +       of_parse_memory_nodes();
> +       return of_parse_distance_map();
> +}
> diff --git a/include/linux/of.h b/include/linux/of.h
> index dc6e396..fe67a4c 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -685,6 +685,15 @@ static inline int of_node_to_nid(struct device_node *device)
>  }
>  #endif
>
> +#ifdef CONFIG_OF_NUMA
> +extern int of_numa_init(void);
> +#else
> +static inline int of_numa_init(void)
> +{
> +       return -ENOSYS;
> +}
> +#endif
> +
>  static inline struct device_node *of_find_matching_node(
>         struct device_node *from,
>         const struct of_device_id *matches)
> --
> 1.8.3.1
>
Ganapatrao Kulkarni March 3, 2016, 4:25 a.m. UTC | #2
On Thu, Mar 3, 2016 at 9:04 AM, Rob Herring <robh+dt@kernel.org> wrote:
> On Wed, Mar 2, 2016 at 4:55 PM, David Daney <ddaney.cavm@gmail.com> wrote:
>> From: David Daney <david.daney@cavium.com>
>>
>> Add device tree parsing for NUMA topology using device
>> "numa-node-id" property in distance-map and cpu nodes.
>>
>> This is a complete rewrite of a previous patch by:
>>    Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>
>>
>> Signed-off-by: David Daney <david.daney@cavium.com>
>> ---
>>  drivers/of/Kconfig   |   3 +
>>  drivers/of/Makefile  |   1 +
>>  drivers/of/of_numa.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/of.h   |   9 +++
>>  4 files changed, 213 insertions(+)
>>  create mode 100644 drivers/of/of_numa.c
>>
>> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
>> index e2a4841..b3bec3a 100644
>> --- a/drivers/of/Kconfig
>> +++ b/drivers/of/Kconfig
>> @@ -112,4 +112,7 @@ config OF_OVERLAY
>>           While this option is selected automatically when needed, you can
>>           enable it manually to improve device tree unit test coverage.
>>
>> +config OF_NUMA
>> +       bool
>> +
>>  endif # OF
>> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
>> index 156c072..bee3fa9 100644
>> --- a/drivers/of/Makefile
>> +++ b/drivers/of/Makefile
>> @@ -14,5 +14,6 @@ obj-$(CONFIG_OF_MTD)  += of_mtd.o
>>  obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>>  obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>>  obj-$(CONFIG_OF_OVERLAY) += overlay.o
>> +obj-$(CONFIG_OF_NUMA) += of_numa.o
>>
>>  obj-$(CONFIG_OF_UNITTEST) += unittest-data/
>> diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c
>> new file mode 100644
>> index 0000000..9727b60
>> --- /dev/null
>> +++ b/drivers/of/of_numa.c
>> @@ -0,0 +1,200 @@
>> +/*
>> + * OF NUMA Parsing support.
>> + *
>> + * Copyright (C) 2015 - 2016 Cavium Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <linux/of.h>
>> +#include <linux/of_fdt.h>
>
> This can be dropped now.
>
>> +#include <linux/nodemask.h>
>> +
>> +#include <asm/numa.h>
>> +
>> +/* define default numa node to 0 */
>> +#define DEFAULT_NODE 0
>> +
>> +/*
>> + * Even though we connect cpus to numa domains later in SMP
>> + * init, we need to know the node ids now for all cpus.
>> +*/
>> +static void __init of_find_cpu_nodes(void)
>
> Perhaps of_parse_cpu_nodes for consistency.
>
> Actually, if all the functions were prefixed with "of_numa_" that
> would be better.
>
>> +{
>> +       u32 nid;
>> +       int r;
>> +       struct device_node *np = NULL;
>> +
>> +       for (;;) {
>> +               np = of_find_node_by_type(np, "cpu");
>> +               if (!np)
>> +                       break;
>
> Can't we use the child node iterator for /cpus here?
>
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r)
>> +                       continue;
>> +
>> +               pr_debug("NUMA: CPU on %u\n", nid);
>> +               if (nid >= MAX_NUMNODES)
>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>> +                               nid);
>> +               else
>> +                       node_set(nid, numa_nodes_parsed);
>
> I'm not sure how this works, but don't you need to match this up with
> MPIDR of the cpu here?
>
>> +       }
>> +}
>> +
>> +static void __init of_parse_memory_nodes(void)
>> +{
>> +       struct device_node *np = NULL;
>> +       int na, ns;
>> +       const __be32 *prop;
>> +       unsigned int psize;
>> +       u32 nid;
>> +       u64 base, size;
>> +       int r;
>> +
>> +       for (;;) {
>> +               np = of_find_node_by_type(np, "memory");
>> +               if (!np)
>> +                       break;
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r)
>> +                       continue;
>> +
>
>> +               prop = of_get_property(np, "reg", &psize);
>> +               if (!prop)
>> +                       continue;
>> +
>> +               psize /= sizeof(__be32);
>> +               na = of_n_addr_cells(np);
>> +               ns = of_n_size_cells(np);
>> +
>> +               if (psize < na + ns) {
>> +                       pr_err("NUMA: memory reg property too small\n");
>> +                       continue;
>> +               }
>> +               base = of_read_number(prop, na);
>> +               size = of_read_number(prop + na, ns);
>
> You should be able to use of_address_to_resource for all this.
>
>> +
>> +               pr_debug("NUMA:  base = %llx len = %llx, node = %u\n",
>> +                        base, size, nid);
>> +
>> +               if (numa_add_memblk(nid, base, size) < 0)
>> +                       break;
>> +       }
>> +
>> +       of_node_put(np);
>> +}
>> +
>> +static int __init parse_distance_map_v1(struct device_node *map)
>> +{
>> +       const __be32 *matrix;
>> +       unsigned int matrix_size;
>> +       int entry_count;
>> +       int i;
>> +       int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
>
> I believe the defaults are for some old DT files. As this is new, it
> should rely on explicit #size-cells in the DT.
>
> OTOH, what is point of using #size-cells at all versus fixing the
> sizes to 1 cell. The documentation doesn't indicate that it uses
> #size-cells. That also means that the sizes basically follow the cell
> size for the memory given that this is at the top-level.

This property only needs a cell size of 1 (32 bits is sufficient to
define a NUMA node id and the relative distance between nodes).
Adding a note about the cell size of this property to the binding will
clear up the doubt.
>
>> +
>> +       pr_info("NUMA: parsing numa-distance-map-v1\n");
>> +
>> +       matrix = of_get_property(map, "distance-matrix", &matrix_size);
>> +       if (!matrix) {
>> +               pr_err("NUMA: No distance-matrix property in distance-map\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       entry_count = matrix_size / (sizeof(__be32) * 3 * nr_size_cells);
>> +
>> +       for (i = 0; i < entry_count; i++) {
>> +               u32 nodea, nodeb, distance;
>> +
>> +               nodea = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>> +               nodeb = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>> +               distance = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>
> Assuming you fix this to 1 cell, you can use
> of_property_count_u32_elems and of_property_read_u32_array.
>
>> +
>> +               numa_set_distance(nodea, nodeb, distance);
>> +               pr_debug("NUMA:  distance[node%d -> node%d] = %d\n",
>> +                        nodea, nodeb, distance);
>> +
>> +               /* Set default distance of node B->A same as A->B */
>> +               if (nodeb > nodea)
>> +                       numa_set_distance(nodeb, nodea, distance);
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int __init of_parse_distance_map(void)
>> +{
>> +       int ret = -EINVAL;
>> +       struct device_node *np = of_find_node_by_name(NULL, "distance-map");
>> +
>> +       if (!np)
>> +               return ret;
>> +
>> +       if (of_device_is_compatible(np, "numa-distance-map-v1")) {
>
> You can use of_find_compatible_node() instead of these 2 calls.
>
>> +               ret = parse_distance_map_v1(np);
>> +               goto out;
>> +       }
>> +
>> +       pr_err("NUMA: invalid distance-map device node\n");
>> +out:
>> +       of_node_put(np);
>> +       return ret;
>> +}
>> +
>> +int of_node_to_nid(struct device_node *device)
>> +{
>> +       struct device_node *np;
>> +       u32 nid;
>> +       int r = -ENODATA;
>> +
>> +       np = of_node_get(device);
>> +
>> +       while (np) {
>> +               struct device_node *parent;
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r != -EINVAL)
>
> You want to break for other err values?
>
>> +                       break;
>> +
>> +               /* property doesn't exist in this node, look in parent */
>> +               parent = of_get_parent(np);
>> +               of_node_put(np);
>> +               np = parent;
>> +       }
>> +       if (np && r)
>> +               pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n",
>> +                       np->name);
>> +       of_node_put(np);
>> +
>> +       if (!r) {
>> +               if (nid >= MAX_NUMNODES)
>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>> +                               nid);
>> +               else
>> +                       return nid;
>> +       }
>> +
>> +       return NUMA_NO_NODE;
>> +}
>
> Needs to be exported?
>
>> +
>> +int __init of_numa_init(void)
>> +{
>> +       of_find_cpu_nodes();
>> +       of_parse_memory_nodes();
>> +       return of_parse_distance_map();
>> +}
>> diff --git a/include/linux/of.h b/include/linux/of.h
>> index dc6e396..fe67a4c 100644
>> --- a/include/linux/of.h
>> +++ b/include/linux/of.h
>> @@ -685,6 +685,15 @@ static inline int of_node_to_nid(struct device_node *device)
>>  }
>>  #endif
>>
>> +#ifdef CONFIG_OF_NUMA
>> +extern int of_numa_init(void);
>> +#else
>> +static inline int of_numa_init(void)
>> +{
>> +       return -ENOSYS;
>> +}
>> +#endif
>> +
>>  static inline struct device_node *of_find_matching_node(
>>         struct device_node *from,
>>         const struct of_device_id *matches)
>> --
>> 1.8.3.1
>>
>

thanks
Ganapat

> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Ganapatrao Kulkarni March 3, 2016, 5:13 a.m. UTC | #3
On Thu, Mar 3, 2016 at 9:55 AM, Ganapatrao Kulkarni
<gpkulkarni@gmail.com> wrote:
> On Thu, Mar 3, 2016 at 9:04 AM, Rob Herring <robh+dt@kernel.org> wrote:
>> On Wed, Mar 2, 2016 at 4:55 PM, David Daney <ddaney.cavm@gmail.com> wrote:
>>> From: David Daney <david.daney@cavium.com>
>>>
>>> Add device tree parsing for NUMA topology using device
>>> "numa-node-id" property in distance-map and cpu nodes.
>>>
>>> This is a complete rewrite of a previous patch by:
>>>    Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>
>>>
>>> Signed-off-by: David Daney <david.daney@cavium.com>
>>> ---
>>>  drivers/of/Kconfig   |   3 +
>>>  drivers/of/Makefile  |   1 +
>>>  drivers/of/of_numa.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  include/linux/of.h   |   9 +++
>>>  4 files changed, 213 insertions(+)
>>>  create mode 100644 drivers/of/of_numa.c
>>>
>>> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
>>> index e2a4841..b3bec3a 100644
>>> --- a/drivers/of/Kconfig
>>> +++ b/drivers/of/Kconfig
>>> @@ -112,4 +112,7 @@ config OF_OVERLAY
>>>           While this option is selected automatically when needed, you can
>>>           enable it manually to improve device tree unit test coverage.
>>>
>>> +config OF_NUMA
>>> +       bool
>>> +
>>>  endif # OF
>>> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
>>> index 156c072..bee3fa9 100644
>>> --- a/drivers/of/Makefile
>>> +++ b/drivers/of/Makefile
>>> @@ -14,5 +14,6 @@ obj-$(CONFIG_OF_MTD)  += of_mtd.o
>>>  obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>>>  obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>>>  obj-$(CONFIG_OF_OVERLAY) += overlay.o
>>> +obj-$(CONFIG_OF_NUMA) += of_numa.o
>>>
>>>  obj-$(CONFIG_OF_UNITTEST) += unittest-data/
>>> diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c
>>> new file mode 100644
>>> index 0000000..9727b60
>>> --- /dev/null
>>> +++ b/drivers/of/of_numa.c
>>> @@ -0,0 +1,200 @@
>>> +/*
>>> + * OF NUMA Parsing support.
>>> + *
>>> + * Copyright (C) 2015 - 2016 Cavium Inc.
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License version 2 as
>>> + * published by the Free Software Foundation.
>>> + *
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU General Public License
>>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#include <linux/of.h>
>>> +#include <linux/of_fdt.h>
>>
>> This can be dropped now.
>>
>>> +#include <linux/nodemask.h>
>>> +
>>> +#include <asm/numa.h>
>>> +
>>> +/* define default numa node to 0 */
>>> +#define DEFAULT_NODE 0
>>> +
>>> +/*
>>> + * Even though we connect cpus to numa domains later in SMP
>>> + * init, we need to know the node ids now for all cpus.
>>> +*/
>>> +static void __init of_find_cpu_nodes(void)
>>
>> Perhaps of_parse_cpu_nodes for consistency.
>>
>> Actually, if all the functions were prefixed with "of_numa_" that
>> would be better.
>>
>>> +{
>>> +       u32 nid;
>>> +       int r;
>>> +       struct device_node *np = NULL;
>>> +
>>> +       for (;;) {
>>> +               np = of_find_node_by_type(np, "cpu");
>>> +               if (!np)
>>> +                       break;
>>
>> Can't we use the child node iterator for /cpus here?
>>
>>> +
>>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>>> +               if (r)
>>> +                       continue;
>>> +
>>> +               pr_debug("NUMA: CPU on %u\n", nid);
>>> +               if (nid >= MAX_NUMNODES)
>>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>>> +                               nid);
>>> +               else
>>> +                       node_set(nid, numa_nodes_parsed);
>>
>> I'm not sure how this works, but don't you need to match this up with
>> MPIDR of the cpu here?

We parse the cpu nodes to learn the number of NUMA nodes present in the
system. Memory nodes alone will not convey the number of NUMA nodes present
in the system (in some cases a node has CPUs but no memory).
We map the cpu logical id to the NUMA node id in smp init, when the cpu
logical ids are numbered (of_parse_and_init_cpus),
and the logical id is already mapped to the hwid/MPIDR.

>>
>>> +       }
>>> +}
>>> +
>>> +static void __init of_parse_memory_nodes(void)
>>> +{
>>> +       struct device_node *np = NULL;
>>> +       int na, ns;
>>> +       const __be32 *prop;
>>> +       unsigned int psize;
>>> +       u32 nid;
>>> +       u64 base, size;
>>> +       int r;
>>> +
>>> +       for (;;) {
>>> +               np = of_find_node_by_type(np, "memory");
>>> +               if (!np)
>>> +                       break;
>>> +
>>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>>> +               if (r)
>>> +                       continue;
>>> +
>>
>>> +               prop = of_get_property(np, "reg", &psize);
>>> +               if (!prop)
>>> +                       continue;
>>> +
>>> +               psize /= sizeof(__be32);
>>> +               na = of_n_addr_cells(np);
>>> +               ns = of_n_size_cells(np);
>>> +
>>> +               if (psize < na + ns) {
>>> +                       pr_err("NUMA: memory reg property too small\n");
>>> +                       continue;
>>> +               }
>>> +               base = of_read_number(prop, na);
>>> +               size = of_read_number(prop + na, ns);
>>
>> You should be able to use of_address_to_resource for all this.
>>
>>> +
>>> +               pr_debug("NUMA:  base = %llx len = %llx, node = %u\n",
>>> +                        base, size, nid);
>>> +
>>> +               if (numa_add_memblk(nid, base, size) < 0)
>>> +                       break;
>>> +       }
>>> +
>>> +       of_node_put(np);
>>> +}
>>> +
>>> +static int __init parse_distance_map_v1(struct device_node *map)
>>> +{
>>> +       const __be32 *matrix;
>>> +       unsigned int matrix_size;
>>> +       int entry_count;
>>> +       int i;
>>> +       int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
>>
>> I believe the defaults are for some old DT files. As this is new, it
>> should rely on explicit #size-cells in the DT.
>>
>> OTOH, what is point of using #size-cells at all versus fixing the
>> sizes to 1 cell. The documentation doesn't indicate that it uses
>> #size-cells. That also means that the sizes basically follow the cell
>> size for the memory given that this is at the top-level.
>
> This property needs only size-cell of 1 (32 bit is sufficient to
> define numa node id and their relative distance)
> adding note about cell size of this property in binding will clear the doubt.
>>
>>> +
>>> +       pr_info("NUMA: parsing numa-distance-map-v1\n");
>>> +
>>> +       matrix = of_get_property(map, "distance-matrix", &matrix_size);
>>> +       if (!matrix) {
>>> +               pr_err("NUMA: No distance-matrix property in distance-map\n");
>>> +               return -EINVAL;
>>> +       }
>>> +
>>> +       entry_count = matrix_size / (sizeof(__be32) * 3 * nr_size_cells);
>>> +
>>> +       for (i = 0; i < entry_count; i++) {
>>> +               u32 nodea, nodeb, distance;
>>> +
>>> +               nodea = of_read_number(matrix, nr_size_cells);
>>> +               matrix += nr_size_cells;
>>> +               nodeb = of_read_number(matrix, nr_size_cells);
>>> +               matrix += nr_size_cells;
>>> +               distance = of_read_number(matrix, nr_size_cells);
>>> +               matrix += nr_size_cells;
>>
>> Assuming you fix this to 1 cell, you can use
>> of_property_count_u32_elems and of_property_read_u32_array.
>>
>>> +
>>> +               numa_set_distance(nodea, nodeb, distance);
>>> +               pr_debug("NUMA:  distance[node%d -> node%d] = %d\n",
>>> +                        nodea, nodeb, distance);
>>> +
>>> +               /* Set default distance of node B->A same as A->B */
>>> +               if (nodeb > nodea)
>>> +                       numa_set_distance(nodeb, nodea, distance);
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>> +static int __init of_parse_distance_map(void)
>>> +{
>>> +       int ret = -EINVAL;
>>> +       struct device_node *np = of_find_node_by_name(NULL, "distance-map");
>>> +
>>> +       if (!np)
>>> +               return ret;
>>> +
>>> +       if (of_device_is_compatible(np, "numa-distance-map-v1")) {
>>
>> You can use of_find_compatible_node() instead of these 2 calls.
>>
>>> +               ret = parse_distance_map_v1(np);
>>> +               goto out;
>>> +       }
>>> +
>>> +       pr_err("NUMA: invalid distance-map device node\n");
>>> +out:
>>> +       of_node_put(np);
>>> +       return ret;
>>> +}
>>> +
>>> +int of_node_to_nid(struct device_node *device)
>>> +{
>>> +       struct device_node *np;
>>> +       u32 nid;
>>> +       int r = -ENODATA;
>>> +
>>> +       np = of_node_get(device);
>>> +
>>> +       while (np) {
>>> +               struct device_node *parent;
>>> +
>>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>>> +               if (r != -EINVAL)
>>
>> You want to break for other err values?
>>
>>> +                       break;
>>> +
>>> +               /* property doesn't exist in this node, look in parent */
>>> +               parent = of_get_parent(np);
>>> +               of_node_put(np);
>>> +               np = parent;
>>> +       }
>>> +       if (np && r)
>>> +               pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n",
>>> +                       np->name);
>>> +       of_node_put(np);
>>> +
>>> +       if (!r) {
>>> +               if (nid >= MAX_NUMNODES)
>>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>>> +                               nid);
>>> +               else
>>> +                       return nid;
>>> +       }
>>> +
>>> +       return NUMA_NO_NODE;
>>> +}
>>
>> Needs to be exported?
>>
>>> +
>>> +int __init of_numa_init(void)
>>> +{
>>> +       of_find_cpu_nodes();
>>> +       of_parse_memory_nodes();
>>> +       return of_parse_distance_map();
>>> +}
>>> diff --git a/include/linux/of.h b/include/linux/of.h
>>> index dc6e396..fe67a4c 100644
>>> --- a/include/linux/of.h
>>> +++ b/include/linux/of.h
>>> @@ -685,6 +685,15 @@ static inline int of_node_to_nid(struct device_node *device)
>>>  }
>>>  #endif
>>>
>>> +#ifdef CONFIG_OF_NUMA
>>> +extern int of_numa_init(void);
>>> +#else
>>> +static inline int of_numa_init(void)
>>> +{
>>> +       return -ENOSYS;
>>> +}
>>> +#endif
>>> +
>>>  static inline struct device_node *of_find_matching_node(
>>>         struct device_node *from,
>>>         const struct of_device_id *matches)
>>> --
>>> 1.8.3.1
>>>
>>

Ganapat
>
> thanks
> Ganapat
>
>> _______________________________________________
>> linux-arm-kernel mailing list
>> linux-arm-kernel@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
David Daney March 3, 2016, 5:33 p.m. UTC | #4
On 03/02/2016 07:34 PM, Rob Herring wrote:
> On Wed, Mar 2, 2016 at 4:55 PM, David Daney <ddaney.cavm@gmail.com> wrote:
>> From: David Daney <david.daney@cavium.com>
>>
>> Add device tree parsing for NUMA topology using device
>> "numa-node-id" property in distance-map and cpu nodes.
>>
>> This is a complete rewrite of a previous patch by:
>>     Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>
>>
>> Signed-off-by: David Daney <david.daney@cavium.com>
>> ---
>>   drivers/of/Kconfig   |   3 +
>>   drivers/of/Makefile  |   1 +
>>   drivers/of/of_numa.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/of.h   |   9 +++
>>   4 files changed, 213 insertions(+)
>>   create mode 100644 drivers/of/of_numa.c
>>
[...]
>> +++ b/drivers/of/of_numa.c
>> @@ -0,0 +1,200 @@
>> +/*
>> + * OF NUMA Parsing support.
>> + *
>> + * Copyright (C) 2015 - 2016 Cavium Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <linux/of.h>
>> +#include <linux/of_fdt.h>
>
> This can be dropped now.

Yes.

>
>> +#include <linux/nodemask.h>
>> +
>> +#include <asm/numa.h>
>> +
>> +/* define default numa node to 0 */
>> +#define DEFAULT_NODE 0
>> +
>> +/*
>> + * Even though we connect cpus to numa domains later in SMP
>> + * init, we need to know the node ids now for all cpus.
>> +*/
>> +static void __init of_find_cpu_nodes(void)
>
> Perhaps of_parse_cpu_nodes for consistency.
>
> Actually, if all the functions were prefixed with "of_numa_" that
> would be better.
>

OK, I will do that.

>> +{
>> +       u32 nid;
>> +       int r;
>> +       struct device_node *np = NULL;
>> +
>> +       for (;;) {
>> +               np = of_find_node_by_type(np, "cpu");
>> +               if (!np)
>> +                       break;
>
> Can't we use the child node iterator for /cpus here?

I will try to do something like that.

>
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r)
>> +                       continue;
>> +
>> +               pr_debug("NUMA: CPU on %u\n", nid);
>> +               if (nid >= MAX_NUMNODES)
>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>> +                               nid);
>> +               else
>> +                       node_set(nid, numa_nodes_parsed);
>
> I'm not sure how this works, but don't you need to match this up with
> MPIDR of the cpu here?

As Ganapatrao said in the other e-mail, all we are doing is discovering 
which nodes have a CPU here.  At this point we don't care about any 
other properties of the individual CPUs.

>
>> +       }
>> +}
>> +
>> +static void __init of_parse_memory_nodes(void)
>> +{
>> +       struct device_node *np = NULL;
>> +       int na, ns;
>> +       const __be32 *prop;
>> +       unsigned int psize;
>> +       u32 nid;
>> +       u64 base, size;
>> +       int r;
>> +
>> +       for (;;) {
>> +               np = of_find_node_by_type(np, "memory");
>> +               if (!np)
>> +                       break;
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r)
>> +                       continue;
>> +
>
>> +               prop = of_get_property(np, "reg", &psize);
>> +               if (!prop)
>> +                       continue;
>> +
>> +               psize /= sizeof(__be32);
>> +               na = of_n_addr_cells(np);
>> +               ns = of_n_size_cells(np);
>> +
>> +               if (psize < na + ns) {
>> +                       pr_err("NUMA: memory reg property too small\n");
>> +                       continue;
>> +               }
>> +               base = of_read_number(prop, na);
>> +               size = of_read_number(prop + na, ns);
>
> You should be able to use of_address_to_resource for all this.
>

I thought about doing that.  It would make the code simpler.  I was 
concerned about the address translation that is done, but since these 
are at the root level there should be no translation.  I will change the 
code to do this.

>> +
>> +               pr_debug("NUMA:  base = %llx len = %llx, node = %u\n",
>> +                        base, size, nid);
>> +
>> +               if (numa_add_memblk(nid, base, size) < 0)
>> +                       break;
>> +       }
>> +
>> +       of_node_put(np);
>> +}
>> +
>> +static int __init parse_distance_map_v1(struct device_node *map)
>> +{
>> +       const __be32 *matrix;
>> +       unsigned int matrix_size;
>> +       int entry_count;
>> +       int i;
>> +       int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
>
> I believe the defaults are for some old DT files. As this is new, it
> should rely on explicit #size-cells in the DT.
>
> OTOH, what is point of using #size-cells at all versus fixing the
> sizes to 1 cell. The documentation doesn't indicate that it uses
> #size-cells. That also means that the sizes basically follow the cell
> size for the memory given that this is at the top-level.

I think we should start with specifying that all elements are a single 
cell.  In the future if this turns out to be insufficient, the binding 
could easily be extended to include a #node-id-size or something similar.

I will simplify the code to assume a single cell.


>
>> +
>> +       pr_info("NUMA: parsing numa-distance-map-v1\n");
>> +
>> +       matrix = of_get_property(map, "distance-matrix", &matrix_size);
>> +       if (!matrix) {
>> +               pr_err("NUMA: No distance-matrix property in distance-map\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       entry_count = matrix_size / (sizeof(__be32) * 3 * nr_size_cells);
>> +
>> +       for (i = 0; i < entry_count; i++) {
>> +               u32 nodea, nodeb, distance;
>> +
>> +               nodea = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>> +               nodeb = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>> +               distance = of_read_number(matrix, nr_size_cells);
>> +               matrix += nr_size_cells;
>
> Assuming you fix this to 1 cell, you can use
> of_property_count_u32_elems and of_property_read_u32_array.

The number of elements in the array could be large.  We would have to do
dynamic memory allocation to be able to use of_property_read_u32_array.
I would prefer to iterate through the array like this to avoid having
to allocate memory.

>
>> +
>> +               numa_set_distance(nodea, nodeb, distance);
>> +               pr_debug("NUMA:  distance[node%d -> node%d] = %d\n",
>> +                        nodea, nodeb, distance);
>> +
>> +               /* Set default distance of node B->A same as A->B */
>> +               if (nodeb > nodea)
>> +                       numa_set_distance(nodeb, nodea, distance);
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int __init of_parse_distance_map(void)
>> +{
>> +       int ret = -EINVAL;
>> +       struct device_node *np = of_find_node_by_name(NULL, "distance-map");
>> +
>> +       if (!np)
>> +               return ret;
>> +
>> +       if (of_device_is_compatible(np, "numa-distance-map-v1")) {
>
> You can use of_find_compatible_node() instead of these 2 calls.

Well, we need to match exactly the name "distance-map", 
of_find_compatible_node() doesn't match on the name, so I think we need 
two checks, one for name and one for compatible.

>
>> +               ret = parse_distance_map_v1(np);
>> +               goto out;
>> +       }
>> +
>> +       pr_err("NUMA: invalid distance-map device node\n");
>> +out:
>> +       of_node_put(np);
>> +       return ret;
>> +}
>> +
>> +int of_node_to_nid(struct device_node *device)
>> +{
>> +       struct device_node *np;
>> +       u32 nid;
>> +       int r = -ENODATA;
>> +
>> +       np = of_node_get(device);
>> +
>> +       while (np) {
>> +               struct device_node *parent;
>> +
>> +               r = of_property_read_u32(np, "numa-node-id", &nid);
>> +               if (r != -EINVAL)
>
> You want to break for other err values?

Yes.  A return value of -EINVAL means the property doesn't exist in this
node, so we need to check the parent.  Any other error indicates a
malformed property in the device tree, and we bail out with the warning
message.

I will add a comment that explains what the loop is doing and the
significance of -EINVAL.

>
>> +                       break;
>> +
>> +               /* property doesn't exist in this node, look in parent */
>> +               parent = of_get_parent(np);
>> +               of_node_put(np);
>> +               np = parent;
>> +       }
>> +       if (np && r)
>> +               pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n",
>> +                       np->name);
>> +       of_node_put(np);
>> +
>> +       if (!r) {
>> +               if (nid >= MAX_NUMNODES)
>> +                       pr_warn("NUMA: Node id %u exceeds maximum value\n",
>> +                               nid);
>> +               else
>> +                       return nid;
>> +       }
>> +
>> +       return NUMA_NO_NODE;
>> +}
>
> Needs to be exported?

Good catch.  I will export it.

>
>> +
>> +int __init of_numa_init(void)
>> +{
>> +       of_find_cpu_nodes();
>> +       of_parse_memory_nodes();
>> +       return of_parse_distance_map();
>> +}
>> diff --git a/include/linux/of.h b/include/linux/of.h
>> index dc6e396..fe67a4c 100644
>> --- a/include/linux/of.h
>> +++ b/include/linux/of.h
>> @@ -685,6 +685,15 @@ static inline int of_node_to_nid(struct device_node *device)
>>   }
>>   #endif
>>
>> +#ifdef CONFIG_OF_NUMA
>> +extern int of_numa_init(void);
>> +#else
>> +static inline int of_numa_init(void)
>> +{
>> +       return -ENOSYS;
>> +}
>> +#endif
>> +
>>   static inline struct device_node *of_find_matching_node(
>>          struct device_node *from,
>>          const struct of_device_id *matches)
>> --
>> 1.8.3.1
>>
Rob Herring March 4, 2016, 12:21 a.m. UTC | #5
On Thu, Mar 3, 2016 at 11:33 AM, David Daney <ddaney@caviumnetworks.com> wrote:
> On 03/02/2016 07:34 PM, Rob Herring wrote:
>>
>> On Wed, Mar 2, 2016 at 4:55 PM, David Daney <ddaney.cavm@gmail.com> wrote:
>>>
>>> From: David Daney <david.daney@cavium.com>
>>>
>>> Add device tree parsing for NUMA topology using device
>>> "numa-node-id" property in distance-map and cpu nodes.
>>>
>>> This is a complete rewrite of a previous patch by:
>>>     Ganapatrao Kulkarni<gkulkarni@caviumnetworks.com>
>>>
>>> Signed-off-by: David Daney <david.daney@cavium.com>
>>> ---
>>>   drivers/of/Kconfig   |   3 +
>>>   drivers/of/Makefile  |   1 +
>>>   drivers/of/of_numa.c | 200
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>   include/linux/of.h   |   9 +++
>>>   4 files changed, 213 insertions(+)
>>>   create mode 100644 drivers/of/of_numa.c

[...]

>>> +static int __init of_parse_distance_map(void)
>>> +{
>>> +       int ret = -EINVAL;
>>> +       struct device_node *np = of_find_node_by_name(NULL,
>>> "distance-map");
>>> +
>>> +       if (!np)
>>> +               return ret;
>>> +
>>> +       if (of_device_is_compatible(np, "numa-distance-map-v1")) {
>>
>>
>> You can use of_find_compatible_node() instead of these 2 calls.
>
>
> Well, we need to match exactly the name "distance-map",
> of_find_compatible_node() doesn't match on the name, so I think we need two
> checks, one for name and one for compatible.

Node names generally aren't supposed to matter, and the kernel's job is
not to validate DTs, so just finding the compatible should be enough.

Not a big deal, so I've acked v14. If there's a v15, then please make
the change.

Rob
diff mbox

Patch

diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index e2a4841..b3bec3a 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -112,4 +112,7 @@  config OF_OVERLAY
 	  While this option is selected automatically when needed, you can
 	  enable it manually to improve device tree unit test coverage.
 
+config OF_NUMA
+	bool
+
 endif # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 156c072..bee3fa9 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -14,5 +14,6 @@  obj-$(CONFIG_OF_MTD)	+= of_mtd.o
 obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
+obj-$(CONFIG_OF_NUMA) += of_numa.o
 
 obj-$(CONFIG_OF_UNITTEST) += unittest-data/
diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c
new file mode 100644
index 0000000..9727b60
--- /dev/null
+++ b/drivers/of/of_numa.c
@@ -0,0 +1,200 @@ 
+/*
+ * OF NUMA Parsing support.
+ *
+ * Copyright (C) 2015 - 2016 Cavium Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/nodemask.h>
+
+#include <asm/numa.h>
+
+/* define default numa node to 0 */
+#define DEFAULT_NODE 0
+
+/*
+ * Even though we connect cpus to numa domains later in SMP
+ * init, we need to know the node ids now for all cpus.
+*/
+static void __init of_find_cpu_nodes(void)
+{
+	u32 nid;
+	int r;
+	struct device_node *np = NULL;
+
+	for (;;) {
+		np = of_find_node_by_type(np, "cpu");
+		if (!np)
+			break;
+
+		r = of_property_read_u32(np, "numa-node-id", &nid);
+		if (r)
+			continue;
+
+		pr_debug("NUMA: CPU on %u\n", nid);
+		if (nid >= MAX_NUMNODES)
+			pr_warn("NUMA: Node id %u exceeds maximum value\n",
+				nid);
+		else
+			node_set(nid, numa_nodes_parsed);
+	}
+}
+
+static void __init of_parse_memory_nodes(void)
+{
+	struct device_node *np = NULL;
+	int na, ns;
+	const __be32 *prop;
+	unsigned int psize;
+	u32 nid;
+	u64 base, size;
+	int r;
+
+	for (;;) {
+		np = of_find_node_by_type(np, "memory");
+		if (!np)
+			break;
+
+		r = of_property_read_u32(np, "numa-node-id", &nid);
+		if (r)
+			continue;
+
+		prop = of_get_property(np, "reg", &psize);
+		if (!prop)
+			continue;
+
+		psize /= sizeof(__be32);
+		na = of_n_addr_cells(np);
+		ns = of_n_size_cells(np);
+
+		if (psize < na + ns) {
+			pr_err("NUMA: memory reg property too small\n");
+			continue;
+		}
+		base = of_read_number(prop, na);
+		size = of_read_number(prop + na, ns);
+
+		pr_debug("NUMA:  base = %llx len = %llx, node = %u\n",
+			 base, size, nid);
+
+		if (numa_add_memblk(nid, base, size) < 0)
+			break;
+	}
+
+	of_node_put(np);
+}
+
+static int __init parse_distance_map_v1(struct device_node *map)
+{
+	const __be32 *matrix;
+	unsigned int matrix_size;
+	int entry_count;
+	int i;
+	int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
+
+	pr_info("NUMA: parsing numa-distance-map-v1\n");
+
+	matrix = of_get_property(map, "distance-matrix", &matrix_size);
+	if (!matrix) {
+		pr_err("NUMA: No distance-matrix property in distance-map\n");
+		return -EINVAL;
+	}
+
+	entry_count = matrix_size / (sizeof(__be32) * 3 * nr_size_cells);
+
+	for (i = 0; i < entry_count; i++) {
+		u32 nodea, nodeb, distance;
+
+		nodea = of_read_number(matrix, nr_size_cells);
+		matrix += nr_size_cells;
+		nodeb = of_read_number(matrix, nr_size_cells);
+		matrix += nr_size_cells;
+		distance = of_read_number(matrix, nr_size_cells);
+		matrix += nr_size_cells;
+
+		numa_set_distance(nodea, nodeb, distance);
+		pr_debug("NUMA:  distance[node%d -> node%d] = %d\n",
+			 nodea, nodeb, distance);
+
+		/* Set default distance of node B->A same as A->B */
+		if (nodeb > nodea)
+			numa_set_distance(nodeb, nodea, distance);
+	}
+
+	return 0;
+}
+
+static int __init of_parse_distance_map(void)
+{
+	int ret = -EINVAL;
+	struct device_node *np = of_find_node_by_name(NULL, "distance-map");
+
+	if (!np)
+		return ret;
+
+	if (of_device_is_compatible(np, "numa-distance-map-v1")) {
+		ret = parse_distance_map_v1(np);
+		goto out;
+	}
+
+	pr_err("NUMA: invalid distance-map device node\n");
+out:
+	of_node_put(np);
+	return ret;
+}
+
+int of_node_to_nid(struct device_node *device)
+{
+	struct device_node *np;
+	u32 nid;
+	int r = -ENODATA;
+
+	np = of_node_get(device);
+
+	while (np) {
+		struct device_node *parent;
+
+		r = of_property_read_u32(np, "numa-node-id", &nid);
+		if (r != -EINVAL)
+			break;
+
+		/* property doesn't exist in this node, look in parent */
+		parent = of_get_parent(np);
+		of_node_put(np);
+		np = parent;
+	}
+	if (np && r)
+		pr_warn("NUMA: Invalid \"numa-node-id\" property in node %s\n",
+			np->name);
+	of_node_put(np);
+
+	if (!r) {
+		if (nid >= MAX_NUMNODES)
+			pr_warn("NUMA: Node id %u exceeds maximum value\n",
+				nid);
+		else
+			return nid;
+	}
+
+	return NUMA_NO_NODE;
+}
+
+int __init of_numa_init(void)
+{
+	of_find_cpu_nodes();
+	of_parse_memory_nodes();
+	return of_parse_distance_map();
+}
diff --git a/include/linux/of.h b/include/linux/of.h
index dc6e396..fe67a4c 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -685,6 +685,15 @@  static inline int of_node_to_nid(struct device_node *device)
 }
 #endif
 
+#ifdef CONFIG_OF_NUMA
+extern int of_numa_init(void);
+#else
+static inline int of_numa_init(void)
+{
+	return -ENOSYS;
+}
+#endif
+
 static inline struct device_node *of_find_matching_node(
 	struct device_node *from,
 	const struct of_device_id *matches)