[RFC,v4] mm: add page preempt

Message ID	20191118113356.5448-1-hdanton@sina.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=2sRG=ZK=kvack.org=owner-linux-mm@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org A680D20878 From: Hillf Danton <hdanton@sina.com> To: linux-mm <linux-mm@kvack.org> Cc: linux-kernel <linux-kernel@vger.kernel.org>, Hillf Danton <hdanton@sina.com> Subject: [RFC v4] mm: add page preempt Date: Mon, 18 Nov 2019 19:33:56 +0800 Message-Id: <20191118113356.5448-1-hdanton@sina.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	[RFC,v4] mm: add page preempt \| expand [RFC,v4] mm: add page preempt

Message ID

20191118113356.5448-1-hdanton@sina.com (mailing list archive)

State

New, archived

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org A680D20878
From: Hillf Danton <hdanton@sina.com>
To: linux-mm <linux-mm@kvack.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	Hillf Danton <hdanton@sina.com>
Subject: [RFC v4] mm: add page preempt
Date: Mon, 18 Nov 2019 19:33:56 +0800
Message-Id: <20191118113356.5448-1-hdanton@sina.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

[RFC,v4] mm: add page preempt | expand

Commit Message

Hillf Danton Nov. 18, 2019, 11:33 a.m. UTC

The cpu preempt (cp) feature makes a task able to preempt other tasks
of lower priorities for cpu.

This work introduces task prio to page reclaiming in order to add the
page preempt (pp) feature that makes a task able to preempt other tasks
of lower priorities for page.

It has nothing to do with memory protection in the first place because,
under preempt, no page under pp is exempt from reclaim by definition.

Instead, because no task is exempt from preempt, pp targets automatically
adding the sense a page itself can make by helping a task undergo no more
memory stalls than its prio can justify in an ideal state with memory
pressure and prio hierarchy across the system taken into account.

For that, the nice system call, for instance, is the turnkey tool users
need to do their work, if they are aware that it is too much to compute the
dips in performance due to jitter in active pages combined with the prio
hierarchy for every individual workload they have a strong interest in.

Currently pages are reclaimed without prio taken into account; pages can
be reclaimed from tasks of lower priorities on behalf of higher-prio tasks
and vice versa.

s/and vice versa/only/ is what we need to make pp by definition, but
it would not make sense without prio introduced; otherwise we can
simply skip deactivating the lru pages based on prio comparison, and
the work is done.

The introduction consists of two parts. On the page side, the page owner
task's prio is stored in the page, which grows struct page by four bytes.

On the reclaimer side, kswapd's prio is set to the prio of its waker,
and updated in the same manner as kswapd_order.

Another change, needed because pp is a double-edged option, is added in
oom to avoid killing a high-prio task in favor of a low-prio task.

V4 is based on next-20191115.

Changes since v3
- fix kswapd_prio update

Changes since v2
- page->prio depends on CONFIG_64BIT
- fix high-prio task killed in oom

Changes since v1
- page->prio depends on !LAST_CPUPID_NOT_IN_PAGE_FLAGS

Changes since v0
- s/page->nice/page->prio/
- drop the role of kswapd's reclaiming priority in prio comparison
- add pgdat->kswapd_prio

Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -24,6 +24,7 @@ 
 #include <linux/page_ext.h>
 #include <linux/err.h>
 #include <linux/page_ref.h>
+#include <linux/page_prio.h>
 #include <linux/memremap.h>
 #include <linux/overflow.h>
 #include <linux/sizes.h>
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -219,6 +219,14 @@  struct page {
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
+
+#ifdef CONFIG_64BIT
+	union {
+		int prio;
+		//struct page *__pad;
+	};
+#define CONFIG_PAGE_PREEMPTION PP
+#endif
 } _struct_page_alignment;
 
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -743,6 +743,7 @@  typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
+	int kswapd_prio;
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
 
 #ifdef CONFIG_COMPACTION
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -671,6 +671,7 @@  static void __collapse_huge_page_copy(pt
 			}
 		} else {
 			src_page = pte_page(pteval);
+			copy_page_prio(page, src_page);
 			copy_user_highpage(page, src_page, address, vma);
 			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
@@ -1746,6 +1747,7 @@  xa_unlocked:
 				clear_highpage(new_page + (index % HPAGE_PMD_NR));
 				index++;
 			}
+			copy_page_prio(new_page, page);
 			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
 					page);
 			list_del(&page->lru);
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -647,6 +647,7 @@  void migrate_page_states(struct page *ne
 		end_page_writeback(newpage);
 
 	copy_page_owner(page, newpage);
+	copy_page_prio(newpage, page);
 
 	mem_cgroup_migrate(page, newpage);
 }
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -164,6 +164,11 @@  static bool oom_unkillable_task(struct t
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
+
+#ifdef CONFIG_PAGE_PREEMPTION
+	if (p->prio < current->prio)
+		return true;
+#endif
 	return false;
 }
 
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1576,6 +1576,7 @@  static int shmem_replace_page(struct pag
 
 	get_page(newpage);
 	copy_highpage(newpage, oldpage);
+	copy_page_prio(newpage, oldpage);
 	flush_dcache_page(newpage);
 
 	__SetPageLocked(newpage);
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -407,6 +407,7 @@  static void __lru_cache_add(struct page
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 
 	get_page(page);
+	set_page_prio(page, current->prio);
 	if (!pagevec_add(pvec, page) || PageCompound(page))
 		__pagevec_lru_add(pvec);
 	put_cpu_var(lru_add_pvec);
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -123,6 +123,9 @@  struct scan_control {
 	/* The highest zone to isolate pages for reclaim from */
 	s8 reclaim_idx;
 
+	s8 __pad;
+	int reclaimer_prio;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1673,11 +1676,17 @@  static unsigned long isolate_lru_pages(u
 		total_scan += nr_pages;
 
 		if (page_zonenum(page) > sc->reclaim_idx) {
+next_page:
 			list_move(&page->lru, &pages_skipped);
 			nr_skipped[page_zonenum(page)] += nr_pages;
 			continue;
 		}
 
+#ifdef CONFIG_PAGE_PREEMPTION
+		if (is_active_lru(lru) && !cgroup_reclaim(sc) &&
+		    page_prio_higher(page, sc->reclaimer_prio))
+			goto next_page;
+#endif
 		/*
 		 * Do not count skipped pages because that makes the function
 		 * return with no isolated pages if the LRU mostly contains
@@ -3256,6 +3265,7 @@  unsigned long try_to_free_pages(struct z
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.reclaimer_prio = current->prio,
 		.gfp_mask = current_gfp_context(gfp_mask),
 		.reclaim_idx = gfp_zone(gfp_mask),
 		.order = order,
@@ -3583,6 +3593,7 @@  static int balance_pgdat(pg_data_t *pgda
 	bool boosted;
 	struct zone *zone;
 	struct scan_control sc = {
+		.reclaimer_prio = pgdat->kswapd_prio,
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
 		.may_unmap = 1,
@@ -3736,6 +3747,8 @@  restart:
 		if (nr_boost_reclaim && !nr_reclaimed)
 			break;
 
+		sc.reclaimer_prio = pgdat->kswapd_prio;
+
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
@@ -3828,6 +3841,7 @@  static void kswapd_try_to_sleep(pg_data_
 		 */
 		wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
 
+		pgdat->kswapd_prio = MAX_PRIO + 1;
 		remaining = schedule_timeout(HZ/10);
 
 		/*
@@ -3862,8 +3876,10 @@  static void kswapd_try_to_sleep(pg_data_
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
-		if (!kthread_should_stop())
+		if (!kthread_should_stop()) {
+			pgdat->kswapd_prio = MAX_PRIO + 1;
 			schedule();
+		}
 
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
@@ -3914,6 +3930,7 @@  static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
+	pgdat->kswapd_prio = MAX_PRIO + 1;
 	pgdat->kswapd_order = 0;
 	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 	for ( ; ; ) {
@@ -3982,6 +3999,17 @@  void wakeup_kswapd(struct zone *zone, gf
 		return;
 	pgdat = zone->zone_pgdat;
 
+#ifdef CONFIG_PAGE_PREEMPTION
+	do {
+		int prio = current->prio;
+
+		if (pgdat->kswapd_prio < prio)
+			return;
+
+		pgdat->kswapd_prio = prio;
+	} while (0);
+#endif
+
 	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
 		pgdat->kswapd_classzone_idx = classzone_idx;
 	else
--- /dev/null
+++ b/include/linux/page_prio.h
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PAGE_PRIO_H
+#define _LINUX_PAGE_PRIO_H
+
+#include <linux/sched/prio.h>
+#include <linux/mm_types.h>
+
+#ifdef CONFIG_PAGE_PREEMPTION
+static inline bool page_prio_valid(struct page *p)
+{
+	return p->prio > MAX_PRIO;
+}
+
+static inline void set_page_prio(struct page *p, int task_prio)
+{
+	if (!page_prio_valid(p))
+		p->prio = task_prio + MAX_PRIO + 1;
+}
+
+static inline void copy_page_prio(struct page *to, struct page *from)
+{
+	to->prio = from->prio;
+}
+
+static inline int page_prio(struct page *p)
+{
+	return p->prio - MAX_PRIO - 1;
+}
+
+static inline bool page_prio_higher(struct page *p, int prio)
+{
+	return page_prio(p) < prio;
+}
+#else
+static inline bool page_prio_valid(struct page *p)
+{
+	return true;
+}
+static inline void set_page_prio(struct page *p, int task_prio)
+{
+}
+static inline void copy_page_prio(struct page *to, struct page *from)
+{
+}
+static inline int page_prio(struct page *p)
+{
+	return MAX_PRIO + 1;
+}
+static inline bool page_prio_higher(struct page *p, int prio)
+{
+	return false;
+}
+#endif /* CONFIG_PAGE_PREEMPTION */
+
+#endif /* _LINUX_PAGE_PRIO_H */

[RFC,v4] mm: add page preempt

Commit Message

Patch