diff mbox series

[v2,4/5] mm: introduce MADV_PAGEOUT

Message ID 20190610111252.239156-5-minchan@kernel.org (mailing list archive)
State New, archived
Headers show
Series Introduce MADV_COLD and MADV_PAGEOUT | expand

Commit Message

Minchan Kim June 10, 2019, 11:12 a.m. UTC
When a process expects no accesses to a certain memory range
for a long time, it can hint to the kernel that the pages can be
reclaimed immediately, while their data must be preserved for
future use. This can reduce workingset eviction and so ends up
improving performance.

This patch introduces the new MADV_PAGEOUT hint to madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory
range as not expected to be used for a long time so that kernel
reclaims *any LRU* pages instantly. The hint can help kernel in
deciding which pages to evict proactively.

All error rules are the same as for MADV_DONTNEED.

* v1
 * change pte to old and rely on the other's reference - hannes
 * remove page_mapcount to check shared page - mhocko

* RFC v2
 * make reclaim_pages simple via factoring out isolate logic - hannes

* RFCv1
 * rename from MADV_COLD to MADV_PAGEOUT - hannes
 * bail out if process is being killed - Hillf
 * fix reclaim_pages bugs - Hillf

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/madvise.c                           | 161 +++++++++++++++++++++++++
 mm/vmscan.c                            |  58 +++++++++
 4 files changed, 221 insertions(+)

Comments

Michal Hocko June 19, 2019, 1:24 p.m. UTC | #1
On Mon 10-06-19 20:12:51, Minchan Kim wrote:
[...]
> +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> +				unsigned long end, struct mm_walk *walk)

Again the same question about a potential code reuse...
[...]
> +regular_page:
> +	tlb_change_page_size(tlb, PAGE_SIZE);
> +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	flush_tlb_batched_pending(mm);
> +	arch_enter_lazy_mmu_mode();
> +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> +		ptent = *pte;
> +		if (!pte_present(ptent))
> +			continue;
> +
> +		page = vm_normal_page(vma, addr, ptent);
> +		if (!page)
> +			continue;
> +
> +		if (isolate_lru_page(page))
> +			continue;
> +
> +		isolated++;
> +		if (pte_young(ptent)) {
> +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> +							tlb->fullmm);
> +			ptent = pte_mkold(ptent);
> +			set_pte_at(mm, addr, pte, ptent);
> +			tlb_remove_tlb_entry(tlb, pte, addr);
> +		}
> +		ClearPageReferenced(page);
> +		test_and_clear_page_young(page);
> +		list_add(&page->lru, &page_list);
> +		if (isolated >= SWAP_CLUSTER_MAX) {

Why do we need SWAP_CLUSTER_MAX batching? Especially when we need ...
[...]

> +unsigned long reclaim_pages(struct list_head *page_list)
> +{
> +	int nid = -1;
> +	unsigned long nr_reclaimed = 0;
> +	LIST_HEAD(node_page_list);
> +	struct reclaim_stat dummy_stat;
> +	struct scan_control sc = {
> +		.gfp_mask = GFP_KERNEL,
> +		.priority = DEF_PRIORITY,
> +		.may_writepage = 1,
> +		.may_unmap = 1,
> +		.may_swap = 1,
> +	};
> +
> +	while (!list_empty(page_list)) {
> +		struct page *page;
> +
> +		page = lru_to_page(page_list);
> +		if (nid == -1) {
> +			nid = page_to_nid(page);
> +			INIT_LIST_HEAD(&node_page_list);
> +		}
> +
> +		if (nid == page_to_nid(page)) {
> +			list_move(&page->lru, &node_page_list);
> +			continue;
> +		}
> +
> +		nr_reclaimed += shrink_page_list(&node_page_list,
> +						NODE_DATA(nid),
> +						&sc, 0,
> +						&dummy_stat, false);

per-node batching in fact. Other than that nothing really jumped at me.
Except for the shared page cache side channel timing aspect not being
considered AFAICS. To be more specific. Pushing out a shared page cache
is possible even now but this interface gives a much easier tool to
evict shared state and perform all sorts of timing attacks. Unless I am
missing something we should be doing something similar to mincore and
ignore shared pages without a writeable access or at least document why
we do not care.
Minchan Kim June 20, 2019, 4:16 a.m. UTC | #2
On Wed, Jun 19, 2019 at 03:24:50PM +0200, Michal Hocko wrote:
> On Mon 10-06-19 20:12:51, Minchan Kim wrote:
> [...]
> > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > +				unsigned long end, struct mm_walk *walk)
> 
> Again the same question about a potential code reuse...
> [...]
> > +regular_page:
> > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > +	flush_tlb_batched_pending(mm);
> > +	arch_enter_lazy_mmu_mode();
> > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > +		ptent = *pte;
> > +		if (!pte_present(ptent))
> > +			continue;
> > +
> > +		page = vm_normal_page(vma, addr, ptent);
> > +		if (!page)
> > +			continue;
> > +
> > +		if (isolate_lru_page(page))
> > +			continue;
> > +
> > +		isolated++;
> > +		if (pte_young(ptent)) {
> > +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> > +							tlb->fullmm);
> > +			ptent = pte_mkold(ptent);
> > +			set_pte_at(mm, addr, pte, ptent);
> > +			tlb_remove_tlb_entry(tlb, pte, addr);
> > +		}
> > +		ClearPageReferenced(page);
> > +		test_and_clear_page_young(page);
> > +		list_add(&page->lru, &page_list);
> > +		if (isolated >= SWAP_CLUSTER_MAX) {
> 
> Why do we need SWAP_CLUSTER_MAX batching? Especially when we need ...
> [...]

It aims for preventing early OOM kill since we isolate too many LRU
pages concurrently.

> 
> > +unsigned long reclaim_pages(struct list_head *page_list)
> > +{
> > +	int nid = -1;
> > +	unsigned long nr_reclaimed = 0;
> > +	LIST_HEAD(node_page_list);
> > +	struct reclaim_stat dummy_stat;
> > +	struct scan_control sc = {
> > +		.gfp_mask = GFP_KERNEL,
> > +		.priority = DEF_PRIORITY,
> > +		.may_writepage = 1,
> > +		.may_unmap = 1,
> > +		.may_swap = 1,
> > +	};
> > +
> > +	while (!list_empty(page_list)) {
> > +		struct page *page;
> > +
> > +		page = lru_to_page(page_list);
> > +		if (nid == -1) {
> > +			nid = page_to_nid(page);
> > +			INIT_LIST_HEAD(&node_page_list);
> > +		}
> > +
> > +		if (nid == page_to_nid(page)) {
> > +			list_move(&page->lru, &node_page_list);
> > +			continue;
> > +		}
> > +
> > +		nr_reclaimed += shrink_page_list(&node_page_list,
> > +						NODE_DATA(nid),
> > +						&sc, 0,
> > +						&dummy_stat, false);
> 
> per-node batching in fact. Other than that nothing really jumped at me.
> Except for the shared page cache side channel timing aspect not being
> considered AFAICS. To be more specific. Pushing out a shared page cache
> is possible even now but this interface gives a much easier tool to
> evict shared state and perform all sorts of timing attacks. Unless I am
> missing something we should be doing something similar to mincore and
> ignore shared pages without a writeable access or at least document why
> we do not care.

I'm not sure I understand the side channel attack. As you mentioned, without this syscall,
1. they already can do that simply by memory hogging
2. If we need fix MADV_PAGEOUT, that means we need to fix MADV_DONTNEED, too?
Michal Hocko June 20, 2019, 7:04 a.m. UTC | #3
On Thu 20-06-19 13:16:20, Minchan Kim wrote:
> On Wed, Jun 19, 2019 at 03:24:50PM +0200, Michal Hocko wrote:
> > On Mon 10-06-19 20:12:51, Minchan Kim wrote:
> > [...]
> > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > +				unsigned long end, struct mm_walk *walk)
> > 
> > Again the same question about a potential code reuse...
> > [...]
> > > +regular_page:
> > > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > > +	flush_tlb_batched_pending(mm);
> > > +	arch_enter_lazy_mmu_mode();
> > > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > > +		ptent = *pte;
> > > +		if (!pte_present(ptent))
> > > +			continue;
> > > +
> > > +		page = vm_normal_page(vma, addr, ptent);
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (isolate_lru_page(page))
> > > +			continue;
> > > +
> > > +		isolated++;
> > > +		if (pte_young(ptent)) {
> > > +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> > > +							tlb->fullmm);
> > > +			ptent = pte_mkold(ptent);
> > > +			set_pte_at(mm, addr, pte, ptent);
> > > +			tlb_remove_tlb_entry(tlb, pte, addr);
> > > +		}
> > > +		ClearPageReferenced(page);
> > > +		test_and_clear_page_young(page);
> > > +		list_add(&page->lru, &page_list);
> > > +		if (isolated >= SWAP_CLUSTER_MAX) {
> > 
> > Why do we need SWAP_CLUSTER_MAX batching? Especially when we need ...
> > [...]
> 
> It aims for preventing early OOM kill since we isolate too many LRU
> pages concurrently.

This is a good point. For some reason I thought that we consider
isolated pages in should_reclaim_retry but we do not anymore (since we
move from zone to node LRUs I guess). Please stick a comment there.

> > > +unsigned long reclaim_pages(struct list_head *page_list)
> > > +{
> > > +	int nid = -1;
> > > +	unsigned long nr_reclaimed = 0;
> > > +	LIST_HEAD(node_page_list);
> > > +	struct reclaim_stat dummy_stat;
> > > +	struct scan_control sc = {
> > > +		.gfp_mask = GFP_KERNEL,
> > > +		.priority = DEF_PRIORITY,
> > > +		.may_writepage = 1,
> > > +		.may_unmap = 1,
> > > +		.may_swap = 1,
> > > +	};
> > > +
> > > +	while (!list_empty(page_list)) {
> > > +		struct page *page;
> > > +
> > > +		page = lru_to_page(page_list);
> > > +		if (nid == -1) {
> > > +			nid = page_to_nid(page);
> > > +			INIT_LIST_HEAD(&node_page_list);
> > > +		}
> > > +
> > > +		if (nid == page_to_nid(page)) {
> > > +			list_move(&page->lru, &node_page_list);
> > > +			continue;
> > > +		}
> > > +
> > > +		nr_reclaimed += shrink_page_list(&node_page_list,
> > > +						NODE_DATA(nid),
> > > +						&sc, 0,
> > > +						&dummy_stat, false);
> > 
> > per-node batching in fact. Other than that nothing really jumped at me.
> > Except for the shared page cache side channel timing aspect not being
> > considered AFAICS. To be more specific. Pushing out a shared page cache
> > is possible even now but this interface gives a much easier tool to
> > evict shared state and perform all sorts of timing attacks. Unless I am
> > missing something we should be doing something similar to mincore and
> > ignore shared pages without a writeable access or at least document why
> > we do not care.
> 
> I'm not sure IIUC side channel attach. As you mentioned, without this syscall,
> 1. they already can do that simply by memory hogging

This is way much more harder for practical attacks because the reclaim
logic is not fully under the attackers control. Having a direct tool to
reclaim memory directly then just opens doors to measure the other
consumers of that memory and all sorts of side channel.

> 2. If we need fix MADV_PAGEOUT, that means we need to fix MADV_DONTNEED, too?

nope because MADV_DONTNEED doesn't unmap from other processes.
Minchan Kim June 20, 2019, 8:40 a.m. UTC | #4
On Thu, Jun 20, 2019 at 09:04:44AM +0200, Michal Hocko wrote:
> On Thu 20-06-19 13:16:20, Minchan Kim wrote:
> > On Wed, Jun 19, 2019 at 03:24:50PM +0200, Michal Hocko wrote:
> > > On Mon 10-06-19 20:12:51, Minchan Kim wrote:
> > > [...]
> > > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > > +				unsigned long end, struct mm_walk *walk)
> > > 
> > > Again the same question about a potential code reuse...
> > > [...]
> > > > +regular_page:
> > > > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > > > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > > > +	flush_tlb_batched_pending(mm);
> > > > +	arch_enter_lazy_mmu_mode();
> > > > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > > > +		ptent = *pte;
> > > > +		if (!pte_present(ptent))
> > > > +			continue;
> > > > +
> > > > +		page = vm_normal_page(vma, addr, ptent);
> > > > +		if (!page)
> > > > +			continue;
> > > > +
> > > > +		if (isolate_lru_page(page))
> > > > +			continue;
> > > > +
> > > > +		isolated++;
> > > > +		if (pte_young(ptent)) {
> > > > +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> > > > +							tlb->fullmm);
> > > > +			ptent = pte_mkold(ptent);
> > > > +			set_pte_at(mm, addr, pte, ptent);
> > > > +			tlb_remove_tlb_entry(tlb, pte, addr);
> > > > +		}
> > > > +		ClearPageReferenced(page);
> > > > +		test_and_clear_page_young(page);
> > > > +		list_add(&page->lru, &page_list);
> > > > +		if (isolated >= SWAP_CLUSTER_MAX) {
> > > 
> > > Why do we need SWAP_CLUSTER_MAX batching? Especially when we need ...
> > > [...]
> > 
> > It aims for preventing early OOM kill since we isolate too many LRU
> > pages concurrently.
> 
> This is a good point. For some reason I thought that we consider
> isolated pages in should_reclaim_retry but we do not anymore (since we
> move from zone to node LRUs I guess). Please stick a comment there.

Sure.

> 
> > > > +unsigned long reclaim_pages(struct list_head *page_list)
> > > > +{
> > > > +	int nid = -1;
> > > > +	unsigned long nr_reclaimed = 0;
> > > > +	LIST_HEAD(node_page_list);
> > > > +	struct reclaim_stat dummy_stat;
> > > > +	struct scan_control sc = {
> > > > +		.gfp_mask = GFP_KERNEL,
> > > > +		.priority = DEF_PRIORITY,
> > > > +		.may_writepage = 1,
> > > > +		.may_unmap = 1,
> > > > +		.may_swap = 1,
> > > > +	};
> > > > +
> > > > +	while (!list_empty(page_list)) {
> > > > +		struct page *page;
> > > > +
> > > > +		page = lru_to_page(page_list);
> > > > +		if (nid == -1) {
> > > > +			nid = page_to_nid(page);
> > > > +			INIT_LIST_HEAD(&node_page_list);
> > > > +		}
> > > > +
> > > > +		if (nid == page_to_nid(page)) {
> > > > +			list_move(&page->lru, &node_page_list);
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		nr_reclaimed += shrink_page_list(&node_page_list,
> > > > +						NODE_DATA(nid),
> > > > +						&sc, 0,
> > > > +						&dummy_stat, false);
> > > 
> > > per-node batching in fact. Other than that nothing really jumped at me.
> > > Except for the shared page cache side channel timing aspect not being
> > > considered AFAICS. To be more specific. Pushing out a shared page cache
> > > is possible even now but this interface gives a much easier tool to
> > > evict shared state and perform all sorts of timing attacks. Unless I am
> > > missing something we should be doing something similar to mincore and
> > > ignore shared pages without a writeable access or at least document why
> > > we do not care.
> > 
> > I'm not sure IIUC side channel attach. As you mentioned, without this syscall,
> > 1. they already can do that simply by memory hogging
> 
> This is way much more harder for practical attacks because the reclaim
> logic is not fully under the attackers control. Having a direct tool to
> reclaim memory directly then just opens doors to measure the other
> consumers of that memory and all sorts of side channel.

I'm not sure it's much harder. In my experience it's really easy:
just create a new memory hogger and consume memory step by step until
your newly allocated pages get reclaimed.
Anyway, we fixed mincore so that an attacker cannot see when a page is
faulted in if he doesn't have enough permission for the file. Right?
So what is your concern, even if we reclaim more aggressively?


> 
> > 2. If we need fix MADV_PAGEOUT, that means we need to fix MADV_DONTNEED, too?
> 
> nope because MADV_DONTNEED doesn't unmap from other processes.

Hmm, I don't understand. MADV_PAGEOUT doesn't unmap from other
processes, either. Could you elaborate a bit more on what your concern is?


> -- 
> Michal Hocko
> SUSE Labs
Michal Hocko June 20, 2019, 9:22 a.m. UTC | #5
On Thu 20-06-19 17:40:40, Minchan Kim wrote:
> > > > Pushing out a shared page cache
> > > > is possible even now but this interface gives a much easier tool to
> > > > evict shared state and perform all sorts of timing attacks. Unless I am
> > > > missing something we should be doing something similar to mincore and
> > > > ignore shared pages without a writeable access or at least document why
> > > > we do not care.
> > > 
> > > I'm not sure IIUC side channel attach. As you mentioned, without this syscall,
> > > 1. they already can do that simply by memory hogging
> > 
> > This is way much more harder for practical attacks because the reclaim
> > logic is not fully under the attackers control. Having a direct tool to
> > reclaim memory directly then just opens doors to measure the other
> > consumers of that memory and all sorts of side channel.
> 
> Not sure it's much more harder. It's really easy on my experience.
> Just creating new memory hogger and consume memory step by step until
> you newly allocated pages will be reclaimed.

You can contain an untrusted application into a memcg and it will only
reclaim its own working set.

> > > 2. If we need fix MADV_PAGEOUT, that means we need to fix MADV_DONTNEED, too?
> > 
> > nope because MADV_DONTNEED doesn't unmap from other processes.
> 
> Hmm, I don't understand. MADV_PAGEOUT doesn't unmap from other
> processes, either.

Either I am confused or missing something. shrink_page_list does
try_to_unmap and that unmaps from all processes, right?

> Could you elborate it a bit more what's your concern?

If you manage to unmap from a remote process then you can measure delays
implied from the refault and that information can be used to infer what
the remote application is doing.
Minchan Kim June 20, 2019, 10:32 a.m. UTC | #6
On Thu, Jun 20, 2019 at 11:22:09AM +0200, Michal Hocko wrote:
> On Thu 20-06-19 17:40:40, Minchan Kim wrote:
> > > > > Pushing out a shared page cache
> > > > > is possible even now but this interface gives a much easier tool to
> > > > > evict shared state and perform all sorts of timing attacks. Unless I am
> > > > > missing something we should be doing something similar to mincore and
> > > > > ignore shared pages without a writeable access or at least document why
> > > > > we do not care.
> > > > 
> > > > I'm not sure IIUC side channel attach. As you mentioned, without this syscall,
> > > > 1. they already can do that simply by memory hogging
> > > 
> > > This is way much more harder for practical attacks because the reclaim
> > > logic is not fully under the attackers control. Having a direct tool to
> > > reclaim memory directly then just opens doors to measure the other
> > > consumers of that memory and all sorts of side channel.
> > 
> > Not sure it's much more harder. It's really easy on my experience.
> > Just creating new memory hogger and consume memory step by step until
> > you newly allocated pages will be reclaimed.
> 
> You can contain an untrusted application into a memcg and it will only
> reclaim its own working set.
> 
> > > > 2. If we need fix MADV_PAGEOUT, that means we need to fix MADV_DONTNEED, too?
> > > 
> > > nope because MADV_DONTNEED doesn't unmap from other processes.
> > 
> > Hmm, I don't understand. MADV_PAGEOUT doesn't unmap from other
> > processes, either.
> 
> Either I am confused or missing something. shrink_page_list does
> try_to_unmap and that unmaps from all processes, right?

You are not missing anything. I think I now understand what you pointed out.
What you meant is that an attacker can see which pages other processes fault in
by measuring access delays from his own address space, and MADV_PAGEOUT makes
that easier. Thus, it's an issue regardless of the recent mincore fix. Right?
Then, okay, I will add a check similar to can_do_mincore for the MADV_PAGEOUT
syscall unless others have different ideas.

Thanks.
Michal Hocko June 20, 2019, 10:55 a.m. UTC | #7
On Thu 20-06-19 19:32:15, Minchan Kim wrote:
[...]
> Then, okay, I will add can_do_mincore similar check for the MADV_PAGEOUT syscall
> if others have different ideas.

Great that we are on the same page. We can simply skip over those pages.
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bb..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@  extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index d7b4231eea63..f545e159b472 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -48,6 +48,7 @@ 
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
 #define MADV_COLD	5		/* deactivatie these pages */
+#define MADV_PAGEOUT	6		/* reclaim these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/madvise.c b/mm/madvise.c
index 67c0379f64a7..3b9d2ba421b1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@ 
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -41,6 +42,7 @@  static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -451,6 +453,162 @@  static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+/*
+ * madvise_pageout_pte_range - pmd_entry callback of the MADV_PAGEOUT page walk
+ * @pmd:  pmd entry covering @addr
+ * @addr: start of the range to process
+ * @end:  end of the range to process
+ * @walk: page walk state; walk->private is the caller's struct mmu_gather
+ *
+ * For every present, normal page mapped in [@addr, @end) that can be isolated
+ * from the LRU: clear the young bit of the mapping, clear PageReferenced and
+ * the page-idle young flag, and hand the page to reclaim_pages().
+ *
+ * A huge pmd is handled as a whole when the range covers it completely.
+ * When the range covers only part of it, the huge page is split (only if it
+ * is mapped exactly once) and then processed as regular pages.
+ *
+ * Return: -EINTR if a fatal signal is pending for current, 0 otherwise.
+ */
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+	int isolated = 0;
+	unsigned long next;
+
+	/* Bail out early if the caller is being killed. */
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			/*
+			 * Only part of the huge page is covered: split it,
+			 * but only if nobody else maps it.
+			 */
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (isolate_lru_page(page))
+			goto huge_unlock;
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			/*
+			 * This is a pmd-level mapping: use the pmd variant so
+			 * the gather tracks the whole huge page range instead
+			 * of a single PAGE_SIZE entry.
+			 */
+			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		/* page_list is empty here when we jumped past the isolation. */
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		if (isolate_lru_page(page))
+			continue;
+
+		isolated++;
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+		/*
+		 * Reclaim in batches of SWAP_CLUSTER_MAX so that we never
+		 * keep too many pages isolated from the LRU at once.
+		 * Holding a large number of isolated pages could cause a
+		 * premature OOM kill.
+		 */
+		if (isolated >= SWAP_CLUSTER_MAX) {
+			arch_leave_lazy_mmu_mode();
+			pte_unmap_unlock(orig_pte, ptl);
+			reclaim_pages(&page_list);
+			isolated = 0;
+			/*
+			 * Re-take the lock at the current address; the loop
+			 * increment then advances pte and addr together.
+			 */
+			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+			arch_enter_lazy_mmu_mode();
+			orig_pte = pte;
+		}
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	/* Reclaim whatever is left of the last, partial batch. */
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+/*
+ * Walk the page tables for [addr, end) in @vma's mm and run
+ * madvise_pageout_pte_range() on each pmd, bracketed by
+ * tlb_start_vma()/tlb_end_vma(). The mmu_gather is passed to the
+ * callback through the walk's private pointer.
+ */
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk walk = {
+		.private = tlb,
+		.mm = vma->vm_mm,
+		.pmd_entry = madvise_pageout_pte_range,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &walk);
+	tlb_end_vma(tlb, vma);
+}
+
+
+/*
+ * MADV_PAGEOUT handler for a single VMA: set up an mmu_gather for
+ * [start_addr, end_addr) and walk the range to page it out.
+ *
+ * Always stores @vma in *@prev. Returns -EINVAL when can_madv_lru_vma()
+ * rejects the VMA, 0 otherwise.
+ *
+ * NOTE(review): no filtering of shared page cache mappings is done here;
+ * whether a mincore-like permission check is needed should be confirmed.
+ */
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (can_madv_lru_vma(vma)) {
+		/* presumably flushes per-CPU LRU caches first - TODO confirm */
+		lru_add_drain();
+		tlb_gather_mmu(&tlb, vma->vm_mm, start_addr, end_addr);
+		madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+		tlb_finish_mmu(&tlb, start_addr, end_addr);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -841,6 +999,8 @@  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -863,6 +1023,7 @@  madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56df55e8afcd..04061185677f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2136,6 +2136,64 @@  static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+/**
+ * reclaim_pages - try to reclaim a list of already isolated pages
+ * @page_list: pages the caller has isolated from the LRU
+ *
+ * Pages are taken from the tail of @page_list and grouped into runs that
+ * belong to the same NUMA node; each run is passed to shrink_page_list()
+ * for that node. Pages that shrink_page_list() leaves on the list are
+ * returned to the LRU with putback_lru_page(). @page_list is empty when
+ * this function returns.
+ *
+ * Return: the sum of the values returned by shrink_page_list().
+ */
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			/* Start a new per-node batch with this page's node. */
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			/*
+			 * The caller may have isolated this page from an
+			 * active LRU list. shrink_page_list() must only be
+			 * given non-active pages, so drop the flag before
+			 * queueing the page for reclaim.
+			 */
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		/*
+		 * The page belongs to a different node: reclaim the batch
+		 * collected so far. The page itself stays on @page_list and
+		 * starts the next batch on the following iteration.
+		 */
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	/* Reclaim the final batch, if any. */
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.