diff mbox series

[3/4] numa: introduce numa group per task group

Message ID 93cf9333-2f9a-ca1e-a4a6-54fc388d1673@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series per cpu cgroup numa suite | expand

Commit Message

王贇 July 3, 2019, 3:32 a.m. UTC
By tracing numa page faults, we recognize tasks sharing the same page,
and try to pack them together into a single numa group.

However, when two tasks share lots of page cache pages but not many
anonymous pages, they have no chance to join the same group, since
numa balancing does not trace page cache pages.

Since tracing page cache pages costs too much, we could instead use
some hints from userland, and the cpu cgroup could be a good one.

This patch introduces a new entry 'numa_group' for the cpu cgroup. By
echoing a non-zero value into the entry, we can now force all the tasks
of this cgroup to join the same numa group, which serves the task group.

In this way tasks are more likely to settle down on the same node, to
share closer cpu cache and gain benefit from NUMA on both file/anonymous
pages.

Besides, when multiple cgroups enable numa group, they will be able to
exchange task locations by utilizing numa migration; in this way each can
settle down on a single node without breaking load balance.

Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 kernel/sched/core.c  |  37 +++++++++++
 kernel/sched/fair.c  | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |  14 +++++
 3 files changed, 225 insertions(+), 1 deletion(-)

Comments

Peter Zijlstra July 11, 2019, 2:10 p.m. UTC | #1
On Wed, Jul 03, 2019 at 11:32:32AM +0800, 王贇 wrote:
> By tracing numa page faults, we recognize tasks sharing the same page,
> and try pack them together into a single numa group.
> 
> However when two task share lot's of cache pages while not much
> anonymous pages, since numa balancing do not tracing cache page, they
> have no chance to join into the same group.
> 
> While tracing cache page cost too much, we could use some hints from

I forgot; where again do we skip shared pages? task_numa_work() doesn't
seem to skip file vmas.

> userland and cpu cgroup could be a good one.
> 
> This patch introduced new entry 'numa_group' for cpu cgroup, by echo
> non-zero into the entry, we can now force all the tasks of this cgroup
> to join the same numa group serving for task group.
> 
> In this way tasks are more likely to settle down on the same node, to
> share closer cpu cache and gain benefit from NUMA on both file/anonymous
> pages.
> 
> Besides, when multiple cgroup enabled numa group, they will be able to
> exchange task location by utilizing numa migration, in this way they
> could achieve single node settle down without breaking load balance.

I dislike cgroup only interfaces; is there really nothing else we could
use for this?
王贇 July 12, 2019, 4:03 a.m. UTC | #2
On 2019/7/11 下午10:10, Peter Zijlstra wrote:
> On Wed, Jul 03, 2019 at 11:32:32AM +0800, 王贇 wrote:
>> By tracing numa page faults, we recognize tasks sharing the same page,
>> and try pack them together into a single numa group.
>>
>> However when two task share lot's of cache pages while not much
>> anonymous pages, since numa balancing do not tracing cache page, they
>> have no chance to join into the same group.
>>
>> While tracing cache page cost too much, we could use some hints from
> 
> I forgot; where again do we skip shared pages? task_numa_work() doesn't
> seem to skip file vmas.

That's the page cache generated by file read/write, rather than the pages
of file mappings. Pages of memory used to support IO also won't be
considered as shared between tasks, since they don't belong to any
particular task but may serve multiple tasks.

> 
>> userland and cpu cgroup could be a good one.
>>
>> This patch introduced new entry 'numa_group' for cpu cgroup, by echo
>> non-zero into the entry, we can now force all the tasks of this cgroup
>> to join the same numa group serving for task group.
>>
>> In this way tasks are more likely to settle down on the same node, to
>> share closer cpu cache and gain benefit from NUMA on both file/anonymous
>> pages.
>>
>> Besides, when multiple cgroup enabled numa group, they will be able to
>> exchange task location by utilizing numa migration, in this way they
>> could achieve single node settle down without breaking load balance.
> 
> I dislike cgroup only interfaces; it there really nothing else we could
> use for this?

Me too... but at this moment that's the best approach we have got. We also
tried to use a separate module to handle these automatically, but this needs
a very good understanding of the system, configuration and workloads, which
is only known by the owner.

So maybe just providing the functionality and leaving the choice to the user
is not that bad?

Regards,
Michael Wang

>
diff mbox series

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa43ce3962e7..148c231a4309 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6790,6 +6790,8 @@  void sched_offline_group(struct task_group *tg)
 {
 	unsigned long flags;

+	update_tg_numa_group(tg, false);
+
 	/* End participation in shares distribution: */
 	unregister_fair_sched_group(tg);

@@ -7277,6 +7279,34 @@  static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */

+#ifdef CONFIG_NUMA_BALANCING
+static DEFINE_MUTEX(numa_mutex);
+
+static int cpu_numa_group_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+
+	/* cpu_numa_group_write_s64() updates the group under this mutex. */
+	mutex_lock(&numa_mutex);
+	show_tg_numa_group(css_tg(css), sf);
+	mutex_unlock(&numa_mutex);
+	return 0;
+}
+
+static int cpu_numa_group_write_s64(struct cgroup_subsys_state *css,
+				struct cftype *cft, s64 numa_group)
+{
+	int err;
+
+	/* The s64 becomes a bool: non-zero enables, zero disables. */
+	mutex_lock(&numa_mutex);
+	err = update_tg_numa_group(css_tg(css), numa_group);
+	mutex_unlock(&numa_mutex);
+
+	return err;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -7312,6 +7342,13 @@  static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_rt_period_read_uint,
 		.write_u64 = cpu_rt_period_write_uint,
 	},
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.name = "numa_group",
+		.write_s64 = cpu_numa_group_write_s64,
+		.seq_show = cpu_numa_group_show,
+	},
 #endif
 	{ }	/* Terminate */
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b32304817eeb..6cf9c9c61258 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1074,6 +1074,7 @@  struct numa_group {
 	int nr_tasks;
 	pid_t gid;
 	int active_nodes;
+	bool evacuate;

 	struct rcu_head rcu;
 	unsigned long total_faults;
@@ -2247,6 +2248,176 @@  static inline void put_numa_group(struct numa_group *grp)
 		kfree_rcu(grp, rcu);
 }

+/* Dump @tg's numa group to @sf; prints "disabled" when it has none. */
+void show_tg_numa_group(struct task_group *tg, struct seq_file *sf)
+{
+	int nid;
+	struct numa_group *ng = tg->numa_group;
+
+	if (!ng) {
+		seq_puts(sf, "disabled\n");
+		return;
+	}
+
+	seq_printf(sf, "id %d nr_tasks %d active_nodes %d\n",
+		   ng->gid, ng->nr_tasks, ng->active_nodes);
+
+	for_each_online_node(nid) {
+		/* last argument is 'priv': 1 = private slot, 0 = shared */
+		int priv_idx = task_faults_idx(NUMA_MEM, nid, 1);
+		int shared_idx = task_faults_idx(NUMA_MEM, nid, 0);
+
+		seq_printf(sf, "node %d mem_private %lu mem_shared %lu ",
+			   nid, ng->faults[priv_idx], ng->faults[shared_idx]);
+
+		seq_printf(sf, "cpu_private %lu cpu_shared %lu\n",
+			   ng->faults_cpu[priv_idx], ng->faults_cpu[shared_idx]);
+	}
+}
+
+int update_tg_numa_group(struct task_group *tg, bool numa_group)
+{
+	struct numa_group *ng = tg->numa_group;	/* NULL when disabled */
+
+	/* toggle tg's numa group; no-op if already in the requested state */
+	if ((ng != NULL) == numa_group)
+		return 0;
+
+	if (ng) {
+		/* disable: unpublish, mark ->evacuate, drop tg's reference */
+		rcu_assign_pointer(tg->numa_group, NULL);
+		ng->evacuate = true;
+		put_numa_group(ng);
+	} else {
+		unsigned int size = sizeof(struct numa_group) +
+				    4*nr_node_ids*sizeof(unsigned long);
+
+		ng = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (!ng)
+			return -ENOMEM;	/* tg->numa_group stays NULL */
+
+		refcount_set(&ng->refcount, 1);	/* tg's own reference */
+		spin_lock_init(&ng->lock);
+		ng->faults_cpu = ng->faults + NR_NUMA_HINT_FAULT_TYPES *
+						nr_node_ids;
+		/* publish last: tasks find it via tg->numa_group and join */
+		rcu_assign_pointer(tg->numa_group, ng);
+	}
+
+	return 0;
+}
+
+static bool tg_numa_group(struct task_struct *p)
+{
+	int i;
+	struct task_group *tg;
+	struct numa_group *grp, *my_grp; /* tg's group, p's current group */
+	/* Put p into its task group's numa group; true iff p is in it. */
+	rcu_read_lock();
+
+	tg = task_group(p);
+	if (!tg)
+		goto no_join;
+
+	grp = rcu_dereference(tg->numa_group);
+	my_grp = rcu_dereference(p->numa_group);
+
+	if (!grp)
+		goto no_join;	/* no numa group published for tg */
+
+	if (grp == my_grp) {
+		if (!grp->evacuate)
+			goto joined;	/* already a member, nothing to do */
+
+		/* Evacuate: remove p's fault stats from grp and detach p.
+		 * NOTE(review): update_tg_numa_group() NULLs tg->numa_group
+		 * before setting ->evacuate; confirm this path is reachable. */
+		rcu_read_unlock();
+
+		spin_lock_irq(&grp->lock);
+
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			grp->faults[i] -= p->numa_faults[i];
+
+		grp->total_faults -= p->total_numa_faults;
+		grp->nr_tasks--;
+
+		spin_unlock_irq(&grp->lock);
+
+		rcu_assign_pointer(p->numa_group, NULL);
+
+		put_numa_group(grp);	/* presumably p's membership ref */
+
+		return false;	/* p no longer belongs to a numa group */
+	}
+
+	if (!get_numa_group(grp))
+		goto no_join;	/* no reference obtained: do not join */
+
+	rcu_read_unlock();
+
+	/*
+	 * p has no numa group of its own: just join tg's numa group
+	 */
+	if (!my_grp) {
+		spin_lock_irq(&grp->lock);
+		/* presumably 2 == tg's ref + ours: p is the only member */
+		if (refcount_read(&grp->refcount) == 2) {
+			grp->gid = p->pid;
+			grp->active_nodes = 1;
+			grp->max_faults_cpu = 0;
+		}
+
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			grp->faults[i] += p->numa_faults[i];
+
+		grp->total_faults += p->total_numa_faults;
+		grp->nr_tasks++;
+
+		spin_unlock_irq(&grp->lock);
+		rcu_assign_pointer(p->numa_group, grp);
+
+		return true;	/* ref from get_numa_group() stays with p */
+	}
+
+	/*
+	 * Switch from the task's numa group to the tg's, moving p's stats
+	 */
+	double_lock_irq(&my_grp->lock, &grp->lock);
+
+	if (refcount_read(&grp->refcount) == 2) {
+		grp->gid = p->pid;
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
+	}
+
+	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+		my_grp->faults[i] -= p->numa_faults[i];
+		grp->faults[i] += p->numa_faults[i];
+	}
+
+	my_grp->total_faults -= p->total_numa_faults;
+	grp->total_faults += p->total_numa_faults;
+
+	my_grp->nr_tasks--;
+	grp->nr_tasks++;
+
+	spin_unlock(&my_grp->lock);
+	spin_unlock_irq(&grp->lock);
+
+	rcu_assign_pointer(p->numa_group, grp);
+
+	put_numa_group(my_grp);	/* p left its previous group */
+	return true;
+
+joined:
+	rcu_read_unlock();
+	return true;
+no_join:
+	rcu_read_unlock();
+	return false;
+}
+
 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			int *priv)
 {
@@ -2417,7 +2588,9 @@  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 		priv = 1;
 	} else {
 		priv = cpupid_match_pid(p, last_cpupid);
-		if (!priv && !(flags & TNF_NO_GROUP))
+		if (tg_numa_group(p))
+			priv = (flags & TNF_SHARED) ? 0 : priv;
+		else if (!priv && !(flags & TNF_NO_GROUP))
 			task_numa_group(p, last_cpupid, flags, &priv);
 	}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..b5bc4d804e2d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -393,6 +393,10 @@  struct task_group {
 #endif

 	struct cfs_bandwidth	cfs_bandwidth;
+
+#ifdef CONFIG_NUMA_BALANCING
+	void *numa_group;
+#endif
 };

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1286,11 +1290,21 @@  extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *p, struct task_struct *t,
 			int cpu, int scpu);
 extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void show_tg_numa_group(struct task_group *tg, struct seq_file *sf);
+extern int update_tg_numa_group(struct task_group *tg, bool numa_group);
 #else
 static inline void
 init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 {
 }
+static inline void
+show_tg_numa_group(struct task_group *tg, struct seq_file *sf)
+{	/* empty stub: #else side of the CONFIG_NUMA_BALANCING block */
+}
+static inline int update_tg_numa_group(struct task_group *tg, bool numa_group)
+{
+	return 0;	/* stub: no numa group exists without NUMA balancing */
+}
 #endif /* CONFIG_NUMA_BALANCING */

 #ifdef CONFIG_SMP