[RFC,1/5] numa: introduce per-cgroup numa balancing locality, statistic

Message ID	c0ec8861-2387-e73b-e450-2d636557a3dd@linux.alibaba.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <owner-linux-mm@kvack.org> Received-SPF: pass (google.com: domain of yun.wang@linux.alibaba.com designates 115.124.30.133 as permitted sender) client-ip=115.124.30.133; Subject: [RFC PATCH 1/5] numa: introduce per-cgroup numa balancing locality, statistic From: =?utf-8?b?546L6LSH?= <yun.wang@linux.alibaba.com> To: Peter Zijlstra <peterz@infradead.org>, hannes@cmpxchg.org, mhocko@kernel.org, vdavydov.dev@gmail.com, Ingo Molnar <mingo@redhat.com> Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org References: <209d247e-c1b2-3235-2722-dd7c1f896483@linux.alibaba.com> Message-ID: <c0ec8861-2387-e73b-e450-2d636557a3dd@linux.alibaba.com> Date: Mon, 22 Apr 2019 10:11:24 +0800 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Thunderbird/60.6.1 MIME-Version: 1.0 In-Reply-To: <209d247e-c1b2-3235-2722-dd7c1f896483@linux.alibaba.com> Content-Type: text/plain; charset=utf-8 Content-Language: en-US Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	NUMA Balancer Suite | expand [RFC,0/5] NUMA Balancer Suite [RFC,1/5] numa: introduce per-cgroup numa balancing locality, statistic [RFC,2/5] numa: append per-node execution info in memory.numa_stat [RFC,3/5] numa: introduce per-cgroup preferred numa node [RFC,4/5] numa: introduce numa balancer infrastructure [RFC,5/5] numa: numa balancer

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 534267947664..bb62e6294484 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -179,6 +179,27 @@ enum memcg_kmem_state { KMEM_ONLINE, }; +#ifdef CONFIG_NUMA_BALANCING + +enum memcg_numa_locality_interval { + PERCENT_0_9, + PERCENT_10_19, + PERCENT_20_29, + PERCENT_30_39, + PERCENT_40_49, + PERCENT_50_59, + PERCENT_60_69, + PERCENT_70_79, + PERCENT_80_89, + PERCENT_90_100, + NR_NL_INTERVAL, +}; + +struct memcg_stat_numa { + u64 locality[NR_NL_INTERVAL]; +}; + +#endif #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; @@ -311,6 +332,10 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; +#ifdef CONFIG_NUMA_BALANCING + struct memcg_stat_numa __percpu *stat_numa; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -818,6 +843,14 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void mem_cgroup_split_huge_fixup(struct page *head); #endif +#ifdef CONFIG_NUMA_BALANCING +extern void memcg_stat_numa_update(struct task_struct *p); +#else +static inline void memcg_stat_numa_update(struct task_struct *p) +{ +} +#endif + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1156,6 +1189,11 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void memcg_stat_numa_update(struct task_struct *p) +{ +} + #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1a3c28d997d4..0b01262d110d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1049,8 +1049,14 @@ struct task_struct { * scan window were remote/local or failed to migrate. 
The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults + * + * 0 -- remote faults + * 1 -- local faults + * 2 -- page migration failure + * 3 -- remote page accessing after page migration + * 4 -- local page accessing after page migration */ - unsigned long numa_faults_locality[3]; + unsigned long numa_faults_locality[5]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..2898f5fa4fba 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -873,6 +873,13 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); + SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ", + p->numa_faults_locality[1], + p->numa_faults_locality[0], + p->numa_faults_locality[2]); + SEQ_printf(m, "lhit=%lu rhit=%lu\n", + p->numa_faults_locality[4], + p->numa_faults_locality[3]); mpol_put(pol); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fdab7eb6f351..ba5a67139d57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include "sched.h" #include <trace/events/sched.h> +#include <linux/memcontrol.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -2387,6 +2388,11 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } + p->numa_faults_locality[mem_node == numa_node_id() ? 
4 : 3] += pages; + + if (mem_node == NUMA_NO_NODE) + return; + /* * First accesses are treated as private, otherwise consider accesses * to be private if the accessing pid has not changed @@ -2604,6 +2610,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) return; + memcg_stat_numa_update(curr); + /* * Using runtime rather than walltime has the dual advantage that * we (mostly) drive the selection from busy threads and that the diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 404acdcd0455..2614ce725a63 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1621,9 +1621,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, - flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c532f8685aa3..b810d4e9c906 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,6 +66,7 @@ #include <linux/lockdep.h> #include <linux/file.h> #include <linux/tracehook.h> +#include <linux/cpuset.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -3396,10 +3397,50 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) seq_putc(m, '\n'); } +#ifdef CONFIG_NUMA_BALANCING + seq_puts(m, "locality"); + for (nr = 0; nr < NR_NL_INTERVAL; nr++) { + int cpu; + u64 sum = 0; + + for_each_possible_cpu(cpu) + sum += per_cpu(memcg->stat_numa->locality[nr], cpu); + + seq_printf(m, " %llu", sum); + } + seq_putc(m, '\n'); +#endif + return 0; } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_NUMA_BALANCING + +void memcg_stat_numa_update(struct task_struct *p) +{ + struct mem_cgroup *memcg; + unsigned long remote = p->numa_faults_locality[3]; + unsigned long local = p->numa_faults_locality[4]; + unsigned long idx = -1; + + if (mem_cgroup_disabled()) + return; + + if 
(remote || local) { + idx = (local * 10) / (remote + local); + if (idx >= NR_NL_INTERVAL) + idx = NR_NL_INTERVAL - 1; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(p); + if (idx != -1) + this_cpu_inc(memcg->stat_numa->locality[idx]); + rcu_read_unlock(); +} +#endif + /* Universal VM events cgroup1 shows, original sort order */ static const unsigned int memcg1_events[] = { PGPGIN, @@ -4435,6 +4476,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); +#ifdef CONFIG_NUMA_BALANCING + free_percpu(memcg->stat_numa); +#endif free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -4468,6 +4512,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg->vmstats_percpu) goto fail; +#ifdef CONFIG_NUMA_BALANCING + memcg->stat_numa = alloc_percpu(struct memcg_stat_numa); + if (!memcg->stat_numa) + goto fail; +#endif + for_each_node(node) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; diff --git a/mm/memory.c b/mm/memory.c index c0391a9f18b8..fb0c1d940d36 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3609,7 +3609,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = NUMA_NO_NODE; - int last_cpupid; + int last_cpupid = 0; int target_nid; bool migrated = false; pte_t pte, old_pte; @@ -3689,8 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, 1, flags); + task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; }

[RFC,1/5] numa: introduce per-cgroup numa balancing locality, statistic

Commit Message

Comments

Patch