diff mbox series

[RFC,v5,10/11] tools/damon/record: Support NUMA specific recording

Message ID 20200707144540.21216-11-sjpark@amazon.com (mailing list archive)
State New, archived
Headers show
Series DAMON: Support Physical Memory Address Space Monitoring | expand

Commit Message

SeongJae Park July 7, 2020, 2:45 p.m. UTC
From: SeongJae Park <sjpark@amazon.de>

This commit updates the DAMON user space tool (damo-record) for NUMA
specific physical memory monitoring.  With this change, users can
monitor accesses to physical memory of specific NUMA node.

Signed-off-by: SeongJae Park <sjpark@amazon.de>
---
 tools/damon/_paddr_layout.py | 158 +++++++++++++++++++++++++++++++++++
 tools/damon/record.py        |  21 ++++-
 2 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 tools/damon/_paddr_layout.py

Comments

Du, Fan July 20, 2020, 6:53 a.m. UTC | #1
>-----Original Message-----
>From: owner-linux-mm@kvack.org <owner-linux-mm@kvack.org> On Behalf
>Of SeongJae Park
>Sent: Tuesday, July 7, 2020 10:46 PM
>To: akpm@linux-foundation.org
>Cc: SeongJae Park <sjpark@amazon.de>; Jonathan.Cameron@Huawei.com;
>aarcange@redhat.com; acme@kernel.org; alexander.shishkin@linux.intel.com;
>amit@kernel.org; benh@kernel.crashing.org; brendan.d.gregg@gmail.com;
>brendanhiggins@google.com; cai@lca.pw; colin.king@canonical.com;
>corbet@lwn.net; david@redhat.com; dwmw@amazon.com;
>foersleo@amazon.de; irogers@google.com; jolsa@redhat.com;
>kirill@shutemov.name; mark.rutland@arm.com; mgorman@suse.de;
>minchan@kernel.org; mingo@redhat.com; namhyung@kernel.org;
>peterz@infradead.org; rdunlap@infradead.org; riel@surriel.com;
>rientjes@google.com; rostedt@goodmis.org; rppt@kernel.org;
>sblbir@amazon.com; shakeelb@google.com; shuah@kernel.org;
>sj38.park@gmail.com; snu@amazon.de; vbabka@suse.cz;
>vdavydov.dev@gmail.com; yang.shi@linux.alibaba.com; Huang, Ying
><ying.huang@intel.com>; linux-damon@amazon.com; linux-mm@kvack.org;
>linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org
>Subject: [RFC v5 10/11] tools/damon/record: Support NUMA specific
>recording
>
>From: SeongJae Park <sjpark@amazon.de>
>
>This commit updates the DAMON user space tool (damo-record) for NUMA
>specific physical memory monitoring.  With this change, users can
>monitor accesses to physical memory of specific NUMA node.
>
>Signed-off-by: SeongJae Park <sjpark@amazon.de>
>---
> tools/damon/_paddr_layout.py | 158
>+++++++++++++++++++++++++++++++++++
> tools/damon/record.py        |  21 ++++-
> 2 files changed, 178 insertions(+), 1 deletion(-)
> create mode 100644 tools/damon/_paddr_layout.py
>
>diff --git a/tools/damon/_paddr_layout.py b/tools/damon/_paddr_layout.py
>new file mode 100644
>index 000000000000..10056172db21
>--- /dev/null
>+++ b/tools/damon/_paddr_layout.py
>@@ -0,0 +1,158 @@
>+#!/usr/bin/env python3
>+# SPDX-License-Identifier: GPL-2.0
>+
>+import os
>+
>+class PaddrRange:
>+    start = None
>+    end = None
>+    nid = None
>+    state = None
>+    name = None
>+
>+    def __init__(self, start, end, nid, state, name):
>+        self.start = start
>+        self.end = end
>+        self.nid = nid
>+        self.state = state
>+        self.name = name
>+
>+    def interleaved(self, prange):
>+        if self.end <= prange.start:
>+            return None
>+        if prange.end <= self.start:
>+            return None
>+        return [max(self.start, prange.start), min(self.end, prange.end)]
>+
>+    def __str__(self):
>+        return '%x-%x, nid %s, state %s, name %s' % (self.start, self.end,
>+                self.nid, self.state, self.name)
>+
>+class MemBlock:
>+    nid = None
>+    index = None
>+    state = None
>+
>+    def __init__(self, nid, index, state):
>+        self.nid = nid
>+        self.index = index
>+        self.state = state
>+
>+    def __str__(self):
>+        return '%d (%s)' % (self.index, self.state)
>+
>+    def __repr__(self):
>+        return self.__str__()
>+
>+def readfile(file_path):
>+    with open(file_path, 'r') as f:
>+        return f.read()
>+
>+def collapse_ranges(ranges):
>+    ranges = sorted(ranges, key=lambda x: x.start)
>+    merged = []
>+    for r in ranges:
>+        if not merged:
>+            merged.append(r)
>+            continue
>+        last = merged[-1]
>+        if last.end != r.start or last.nid != r.nid or last.state != r.state:
>+            merged.append(r)
>+        else:
>+            last.end = r.end
>+    return merged
>+
>+def memblocks_to_ranges(blocks, block_size):
>+    ranges = []
>+    for b in blocks:
>+        ranges.append(PaddrRange(b.index * block_size,
>+            (b.index + 1) * block_size, b.nid, b.state, None))
>+
>+    return collapse_ranges(ranges)
>+
>+def memblock_ranges():
>+    SYSFS='/sys/devices/system/node'
>+    sz_block = int(readfile('/sys/devices/system/memory/block_size_bytes'),
>16)
>+    sys_nodes = [x for x in os.listdir(SYSFS) if x.startswith('node')]
>+
>+    blocks = []
>+    for sys_node in sys_nodes:
>+        nid = int(sys_node[4:])
>+
>+        sys_node_files = os.listdir(os.path.join(SYSFS, sys_node))
>+        for f in sys_node_files:
>+            if not f.startswith('memory'):
>+                continue
>+            index = int(f[6:])
>+            sys_state = os.path.join(SYSFS, sys_node, f, 'state')
>+            state = readfile(sys_state).strip()
>+
>+            blocks.append(MemBlock(nid, index, state))
>+
>+    return memblocks_to_ranges(blocks, sz_block)
>+
>+def iomem_ranges():
>+    ranges = []
>+
>+    with open('/proc/iomem', 'r') as f:
>+        # example of the line: '100000000-42b201fff : System RAM'
>+        for line in f:
>+            fields = line.split(':')
>+            if len(fields) < 2:
>+                continue
>+            name = ':'.join(fields[1:]).strip()
>+            addrs = fields[0].split('-')
>+            if len(addrs) != 2:
>+                continue
>+            start = int(addrs[0], 16)
>+            end = int(addrs[1], 16) + 1
>+            ranges.append(PaddrRange(start, end, None, None, name))
>+
>+    return ranges

Hi SeongJae

Here on system with persistent memory, user can plug {all or portion of }persistent memory into buddy system
with drivers from patchset[1] implemented. The persistent memory in this setup will be treated as normal system
RAM, and have valid full-fledged page structure as well.

For example, here is what /proc/iomem looks like in system with such configuration on my testbed.

1840000000-963fffffff : Persistent Memory
  1840000000-18be1fffff : namespace0.0
  18c0000000-37bfffffff : dax0.0
    18c0000000-37bfffffff : System RAM  <- first 128G of persistent memory corresponding to node 2

# numactl  -H
available: 3 nodes (0-2)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 0 size: 95447 MB
node 0 free: 92391 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 1 size: 96733 MB
node 1 free: 95670 MB
node 2 cpus:
node 2 size: 126976 MB
node 2 free: 126935 MB
node distances:
node   0   1   2
  0:  10  21  17
  1:  21  10  28
  2:  17  28  10

So here we have two ranges:
1840000000-963fffffff : Persistent Memory
18c0000000-37bfffffff : System RAM

[1]:
https://patchwork.kernel.org/cover/10829019/


>+def paddr_ranges():
>+    ranges1 = memblock_ranges()
>+    ranges2 = iomem_ranges()
>+    merged = []
>+
>+    for r in ranges1:
>+        subsets = []
>+        for r2 in ranges2:
>+            interleaved = r.interleaved(r2)
>+            if interleaved == None:
>+                continue
>+
>+            start, end = interleaved
>+            left = None
>+            if start > r.start:
>+                left = PaddrRange(r.start, start, r.nid, r.state, r.name)
>+                subsets.append(left)
>+
>+            middle = PaddrRange(start, end, r.nid, r.state, r.name)
>+            if r2.nid:
>+                middle.nid = r2.nid
>+            if r2.state:
>+                middle.state = r2.state
>+            if r2.name:
>+                middle.name = r2.name

Memory block from numa node 2 will match with range "1840000000-963fffffff : Persistent Memory"
But take the name "Persistent memory", expected "System RAM" here.

>+            subsets.append(middle)
>+            r.start = end
>+        if r.start < r.end:
>+            subsets = [r]
>+
>+        merged += subsets
>+    return merged
>+
>+def pr_ranges(ranges):
>+    print('#%12s %13s\tnode\tstate\tresource\tsize' % ('start', 'end'))
>+    for r in ranges:
>+        print('%13d %13d\t%s\t%s\t%s\t%d' % (r.start, r.end, r.nid,
>+            r.state, r.name, r.end - r.start))
>+
>+def main():
>+    ranges = paddr_ranges()
>+
>+    pr_ranges(ranges)
>+
>+if __name__ == '__main__':
>+    main()
>diff --git a/tools/damon/record.py b/tools/damon/record.py
>index 416dca940c1d..8440a9818810 100644
>--- a/tools/damon/record.py
>+++ b/tools/damon/record.py
>@@ -12,6 +12,7 @@ import subprocess
> import time
>
> import _damon
>+import _paddr_layout
>
> def do_record(target, is_target_cmd, init_regions, attrs, old_attrs):
>     if os.path.isfile(attrs.rfile_path):
>@@ -70,6 +71,8 @@ def set_argparser(parser):
>             help='the target command or the pid to record')
>     parser.add_argument('-l', '--rbuf', metavar='<len>', type=int,
>             default=1024*1024, help='length of record result buffer')
>+    parser.add_argument('--numa_node', metavar='<node id>', type=int,
>+            help='if target is \'paddr\', limit it to the numa node')
>     parser.add_argument('-o', '--out', metavar='<file path>', type=str,
>             default='damon.data', help='output file path')
>
>@@ -96,6 +99,18 @@ def default_paddr_region():
>                 ret = [start, end]
>     return ret
>
>+def paddr_region_of(numa_node):
>+    regions = []
>+    default_region = default_paddr_region()
>+    paddr_ranges = _paddr_layout.paddr_ranges()
>+    for r in paddr_ranges:
>+        if r.end <= default_region[0] or default_region[1] <= r.start:
>+            continue
>+        if r.nid == numa_node and r.name == 'System RAM':
>+            regions.append([r.start, r.end])

To profile physical address for numa node 2, above checking will not return valid node 2 range
i.e., 18c0000000-37bfffffff : System RAM, but empty ones.

I tweaked the checking to match "Persistent Memory" also, it looks physical address monitoring with
numa node specified works from first glance.

I'm willing to give it a try once you posted next updated version.

>+    return regions
>+
> def main(args=None):
>     global orig_attrs
>     if not args:
>@@ -113,12 +128,16 @@ def main(args=None):
>     args.schemes = ''
>     new_attrs = _damon.cmd_args_to_attrs(args)
>     init_regions = _damon.cmd_args_to_init_regions(args)
>+    numa_node = args.numa_node
>     target = args.target
>
>     target_fields = target.split()
>     if target == 'paddr':   # physical memory address space
>         if not init_regions:
>-            init_regions = [default_paddr_region()]
>+            if numa_node:
>+                init_regions = paddr_region_of(numa_node)
>+            else:
>+                init_regions = [default_paddr_region()]
>         do_record(target, False, init_regions, new_attrs, orig_attrs)
>     elif not subprocess.call('which %s > /dev/null' % target_fields[0],
>             shell=True, executable='/bin/bash'):
>--
>2.17.1
>
diff mbox series

Patch

diff --git a/tools/damon/_paddr_layout.py b/tools/damon/_paddr_layout.py
new file mode 100644
index 000000000000..10056172db21
--- /dev/null
+++ b/tools/damon/_paddr_layout.py
@@ -0,0 +1,158 @@ 
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import os
+
+class PaddrRange:
+    start = None
+    end = None
+    nid = None
+    state = None
+    name = None
+
+    def __init__(self, start, end, nid, state, name):
+        self.start = start
+        self.end = end
+        self.nid = nid
+        self.state = state
+        self.name = name
+
+    def interleaved(self, prange):
+        if self.end <= prange.start:
+            return None
+        if prange.end <= self.start:
+            return None
+        return [max(self.start, prange.start), min(self.end, prange.end)]
+
+    def __str__(self):
+        return '%x-%x, nid %s, state %s, name %s' % (self.start, self.end,
+                self.nid, self.state, self.name)
+
+class MemBlock:
+    nid = None
+    index = None
+    state = None
+
+    def __init__(self, nid, index, state):
+        self.nid = nid
+        self.index = index
+        self.state = state
+
+    def __str__(self):
+        return '%d (%s)' % (self.index, self.state)
+
+    def __repr__(self):
+        return self.__str__()
+
+def readfile(file_path):
+    with open(file_path, 'r') as f:
+        return f.read()
+
+def collapse_ranges(ranges):
+    ranges = sorted(ranges, key=lambda x: x.start)
+    merged = []
+    for r in ranges:
+        if not merged:
+            merged.append(r)
+            continue
+        last = merged[-1]
+        if last.end != r.start or last.nid != r.nid or last.state != r.state:
+            merged.append(r)
+        else:
+            last.end = r.end
+    return merged
+
+def memblocks_to_ranges(blocks, block_size):
+    ranges = []
+    for b in blocks:
+        ranges.append(PaddrRange(b.index * block_size,
+            (b.index + 1) * block_size, b.nid, b.state, None))
+
+    return collapse_ranges(ranges)
+
+def memblock_ranges():
+    SYSFS='/sys/devices/system/node'
+    sz_block = int(readfile('/sys/devices/system/memory/block_size_bytes'), 16)
+    sys_nodes = [x for x in os.listdir(SYSFS) if x.startswith('node')]
+
+    blocks = []
+    for sys_node in sys_nodes:
+        nid = int(sys_node[4:])
+
+        sys_node_files = os.listdir(os.path.join(SYSFS, sys_node))
+        for f in sys_node_files:
+            if not f.startswith('memory'):
+                continue
+            index = int(f[6:])
+            sys_state = os.path.join(SYSFS, sys_node, f, 'state')
+            state = readfile(sys_state).strip()
+
+            blocks.append(MemBlock(nid, index, state))
+
+    return memblocks_to_ranges(blocks, sz_block)
+
+def iomem_ranges():
+    ranges = []
+
+    with open('/proc/iomem', 'r') as f:
+        # example of the line: '100000000-42b201fff : System RAM'
+        for line in f:
+            fields = line.split(':')
+            if len(fields) < 2:
+                continue
+            name = ':'.join(fields[1:]).strip()
+            addrs = fields[0].split('-')
+            if len(addrs) != 2:
+                continue
+            start = int(addrs[0], 16)
+            end = int(addrs[1], 16) + 1
+            ranges.append(PaddrRange(start, end, None, None, name))
+
+    return ranges
+
+def paddr_ranges():
+    ranges1 = memblock_ranges()
+    ranges2 = iomem_ranges()
+    merged = []
+
+    for r in ranges1:
+        subsets = []
+        for r2 in ranges2:
+            interleaved = r.interleaved(r2)
+            if interleaved == None:
+                continue
+
+            start, end = interleaved
+            left = None
+            if start > r.start:
+                left = PaddrRange(r.start, start, r.nid, r.state, r.name)
+                subsets.append(left)
+
+            middle = PaddrRange(start, end, r.nid, r.state, r.name)
+            if r2.nid:
+                middle.nid = r2.nid
+            if r2.state:
+                middle.state = r2.state
+            if r2.name:
+                middle.name = r2.name
+            subsets.append(middle)
+            r.start = end
+        if r.start < r.end:
+            subsets = [r]
+
+        merged += subsets
+    return merged
+
+def pr_ranges(ranges):
+    print('#%12s %13s\tnode\tstate\tresource\tsize' % ('start', 'end'))
+    for r in ranges:
+        print('%13d %13d\t%s\t%s\t%s\t%d' % (r.start, r.end, r.nid,
+            r.state, r.name, r.end - r.start))
+
+def main():
+    ranges = paddr_ranges()
+
+    pr_ranges(ranges)
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/damon/record.py b/tools/damon/record.py
index 416dca940c1d..8440a9818810 100644
--- a/tools/damon/record.py
+++ b/tools/damon/record.py
@@ -12,6 +12,7 @@  import subprocess
 import time
 
 import _damon
+import _paddr_layout
 
 def do_record(target, is_target_cmd, init_regions, attrs, old_attrs):
     if os.path.isfile(attrs.rfile_path):
@@ -70,6 +71,8 @@  def set_argparser(parser):
             help='the target command or the pid to record')
     parser.add_argument('-l', '--rbuf', metavar='<len>', type=int,
             default=1024*1024, help='length of record result buffer')
+    parser.add_argument('--numa_node', metavar='<node id>', type=int,
+            help='if target is \'paddr\', limit it to the numa node')
     parser.add_argument('-o', '--out', metavar='<file path>', type=str,
             default='damon.data', help='output file path')
 
@@ -96,6 +99,18 @@  def default_paddr_region():
                 ret = [start, end]
     return ret
 
+def paddr_region_of(numa_node):
+    regions = []
+    default_region = default_paddr_region()
+    paddr_ranges = _paddr_layout.paddr_ranges()
+    for r in paddr_ranges:
+        if r.end <= default_region[0] or default_region[1] <= r.start:
+            continue
+        if r.nid == numa_node and r.name == 'System RAM':
+            regions.append([r.start, r.end])
+
+    return regions
+
 def main(args=None):
     global orig_attrs
     if not args:
@@ -113,12 +128,16 @@  def main(args=None):
     args.schemes = ''
     new_attrs = _damon.cmd_args_to_attrs(args)
     init_regions = _damon.cmd_args_to_init_regions(args)
+    numa_node = args.numa_node
     target = args.target
 
     target_fields = target.split()
     if target == 'paddr':   # physical memory address space
         if not init_regions:
-            init_regions = [default_paddr_region()]
+            if numa_node:
+                init_regions = paddr_region_of(numa_node)
+            else:
+                init_regions = [default_paddr_region()]
         do_record(target, False, init_regions, new_attrs, orig_attrs)
     elif not subprocess.call('which %s > /dev/null' % target_fields[0],
             shell=True, executable='/bin/bash'):