diff mbox series

mm: vmscan: scan anonymous pages on file refaults

Message ID 20190701081038.GA83398@google.com (mailing list archive)
State New, archived
Headers show
Series mm: vmscan: scan anonymous pages on file refaults | expand

Commit Message

Kuo-Hsin Yang July 1, 2019, 8:10 a.m. UTC
When file refaults are detected and there are many inactive file pages,
the system never reclaim anonymous pages, the file pages are dropped
aggressively when there are still a lot of cold anonymous pages and
system thrashes.  This issue impacts the performance of applications
with large executable, e.g. chrome.

With this patch, when file refault is detected, inactive_list_is_low()
always returns true for file pages in get_scan_count() to enable
scanning anonymous pages.

The problem can be reproduced by the following test program.

---8<---
void fallocate_file(const char *filename, off_t size)
{
	struct stat st;
	int fd;

	if (!stat(filename, &st) && st.st_size >= size)
		return;

	fd = open(filename, O_WRONLY | O_CREAT, 0600);
	if (fd < 0) {
		perror("create file");
		exit(1);
	}
	if (posix_fallocate(fd, 0, size)) {
		perror("fallocate");
		exit(1);
	}
	close(fd);
}

long *alloc_anon(long size)
{
	long *start = malloc(size);
	memset(start, 1, size);
	return start;
}

long access_file(const char *filename, long size, long rounds)
{
	int fd, i;
	volatile char *start1, *end1, *start2;
	const int page_size = getpagesize();
	long sum = 0;

	fd = open(filename, O_RDONLY);
	if (fd == -1) {
		perror("open");
		exit(1);
	}

	/*
	 * Some applications, e.g. chrome, use a lot of executable file
	 * pages, map some of the pages with PROT_EXEC flag to simulate
	 * the behavior.
	 */
	start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
		      fd, 0);
	if (start1 == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	end1 = start1 + size / 2;

	start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
	if (start2 == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	for (i = 0; i < rounds; ++i) {
		struct timeval before, after;
		volatile char *ptr1 = start1, *ptr2 = start2;
		gettimeofday(&before, NULL);
		for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
			sum += *ptr1 + *ptr2;
		gettimeofday(&after, NULL);
		printf("File access time, round %d: %f (sec)\n", i,
		       (after.tv_sec - before.tv_sec) +
		       (after.tv_usec - before.tv_usec) / 1000000.0);
	}
	return sum;
}

int main(int argc, char *argv[])
{
	const long MB = 1024 * 1024;
	long anon_mb, file_mb, file_rounds;
	const char filename[] = "large";
	long *ret1;
	long ret2;

	if (argc != 4) {
		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
		exit(0);
	}
	anon_mb = atoi(argv[1]);
	file_mb = atoi(argv[2]);
	file_rounds = atoi(argv[3]);

	fallocate_file(filename, file_mb * MB);
	printf("Allocate %ld MB anonymous pages\n", anon_mb);
	ret1 = alloc_anon(anon_mb * MB);
	printf("Access %ld MB file pages\n", file_mb);
	ret2 = access_file(filename, file_mb * MB, file_rounds);
	printf("Print result to prevent optimization: %ld\n",
	       *ret1 + ret2);
	return 0;
}
---8<---

Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the
program fills ram with 2048 MB memory, access a 200 MB file for 10
times.  Without this patch, the file cache is dropped aggresively and
every access to the file is from disk.

  $ ./thrash 2048 200 10
  Allocate 2048 MB anonymous pages
  Access 200 MB file pages
  File access time, round 0: 2.489316 (sec)
  File access time, round 1: 2.581277 (sec)
  File access time, round 2: 2.487624 (sec)
  File access time, round 3: 2.449100 (sec)
  File access time, round 4: 2.420423 (sec)
  File access time, round 5: 2.343411 (sec)
  File access time, round 6: 2.454833 (sec)
  File access time, round 7: 2.483398 (sec)
  File access time, round 8: 2.572701 (sec)
  File access time, round 9: 2.493014 (sec)

With this patch, these file pages can be cached.

  $ ./thrash 2048 200 10
  Allocate 2048 MB anonymous pages
  Access 200 MB file pages
  File access time, round 0: 2.475189 (sec)
  File access time, round 1: 2.440777 (sec)
  File access time, round 2: 2.411671 (sec)
  File access time, round 3: 1.955267 (sec)
  File access time, round 4: 0.029924 (sec)
  File access time, round 5: 0.000808 (sec)
  File access time, round 6: 0.000771 (sec)
  File access time, round 7: 0.000746 (sec)
  File access time, round 8: 0.000738 (sec)
  File access time, round 9: 0.000747 (sec)

Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org> # 4.12+
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

Comments

Michal Hocko July 3, 2019, 2:30 p.m. UTC | #1
On Mon 01-07-19 16:10:38, Kuo-Hsin Yang wrote:
> When file refaults are detected and there are many inactive file pages,
> the system never reclaim anonymous pages, the file pages are dropped
> aggressively when there are still a lot of cold anonymous pages and
> system thrashes.  This issue impacts the performance of applications
> with large executable, e.g. chrome.
> 
> With this patch, when file refault is detected, inactive_list_is_low()
> always returns true for file pages in get_scan_count() to enable
> scanning anonymous pages.
> 
> The problem can be reproduced by the following test program.
> 
> ---8<---
> void fallocate_file(const char *filename, off_t size)
> {
> 	struct stat st;
> 	int fd;
> 
> 	if (!stat(filename, &st) && st.st_size >= size)
> 		return;
> 
> 	fd = open(filename, O_WRONLY | O_CREAT, 0600);
> 	if (fd < 0) {
> 		perror("create file");
> 		exit(1);
> 	}
> 	if (posix_fallocate(fd, 0, size)) {
> 		perror("fallocate");
> 		exit(1);
> 	}
> 	close(fd);
> }
> 
> long *alloc_anon(long size)
> {
> 	long *start = malloc(size);
> 	memset(start, 1, size);
> 	return start;
> }
> 
> long access_file(const char *filename, long size, long rounds)
> {
> 	int fd, i;
> 	volatile char *start1, *end1, *start2;
> 	const int page_size = getpagesize();
> 	long sum = 0;
> 
> 	fd = open(filename, O_RDONLY);
> 	if (fd == -1) {
> 		perror("open");
> 		exit(1);
> 	}
> 
> 	/*
> 	 * Some applications, e.g. chrome, use a lot of executable file
> 	 * pages, map some of the pages with PROT_EXEC flag to simulate
> 	 * the behavior.
> 	 */
> 	start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
> 		      fd, 0);
> 	if (start1 == MAP_FAILED) {
> 		perror("mmap");
> 		exit(1);
> 	}
> 	end1 = start1 + size / 2;
> 
> 	start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
> 	if (start2 == MAP_FAILED) {
> 		perror("mmap");
> 		exit(1);
> 	}
> 
> 	for (i = 0; i < rounds; ++i) {
> 		struct timeval before, after;
> 		volatile char *ptr1 = start1, *ptr2 = start2;
> 		gettimeofday(&before, NULL);
> 		for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
> 			sum += *ptr1 + *ptr2;
> 		gettimeofday(&after, NULL);
> 		printf("File access time, round %d: %f (sec)\n", i,
> 		       (after.tv_sec - before.tv_sec) +
> 		       (after.tv_usec - before.tv_usec) / 1000000.0);
> 	}
> 	return sum;
> }
> 
> int main(int argc, char *argv[])
> {
> 	const long MB = 1024 * 1024;
> 	long anon_mb, file_mb, file_rounds;
> 	const char filename[] = "large";
> 	long *ret1;
> 	long ret2;
> 
> 	if (argc != 4) {
> 		printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
> 		exit(0);
> 	}
> 	anon_mb = atoi(argv[1]);
> 	file_mb = atoi(argv[2]);
> 	file_rounds = atoi(argv[3]);
> 
> 	fallocate_file(filename, file_mb * MB);
> 	printf("Allocate %ld MB anonymous pages\n", anon_mb);
> 	ret1 = alloc_anon(anon_mb * MB);
> 	printf("Access %ld MB file pages\n", file_mb);
> 	ret2 = access_file(filename, file_mb * MB, file_rounds);
> 	printf("Print result to prevent optimization: %ld\n",
> 	       *ret1 + ret2);
> 	return 0;
> }
> ---8<---
> 
> Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the
> program fills ram with 2048 MB memory, access a 200 MB file for 10
> times.  Without this patch, the file cache is dropped aggresively and
> every access to the file is from disk.
> 
>   $ ./thrash 2048 200 10
>   Allocate 2048 MB anonymous pages
>   Access 200 MB file pages
>   File access time, round 0: 2.489316 (sec)
>   File access time, round 1: 2.581277 (sec)
>   File access time, round 2: 2.487624 (sec)
>   File access time, round 3: 2.449100 (sec)
>   File access time, round 4: 2.420423 (sec)
>   File access time, round 5: 2.343411 (sec)
>   File access time, round 6: 2.454833 (sec)
>   File access time, round 7: 2.483398 (sec)
>   File access time, round 8: 2.572701 (sec)
>   File access time, round 9: 2.493014 (sec)
> 
> With this patch, these file pages can be cached.
> 
>   $ ./thrash 2048 200 10
>   Allocate 2048 MB anonymous pages
>   Access 200 MB file pages
>   File access time, round 0: 2.475189 (sec)
>   File access time, round 1: 2.440777 (sec)
>   File access time, round 2: 2.411671 (sec)
>   File access time, round 3: 1.955267 (sec)
>   File access time, round 4: 0.029924 (sec)
>   File access time, round 5: 0.000808 (sec)
>   File access time, round 6: 0.000771 (sec)
>   File access time, round 7: 0.000746 (sec)
>   File access time, round 8: 0.000738 (sec)
>   File access time, round 9: 0.000747 (sec)

How does the reclaim behave with workloads with file backed data set
not fitting into the memory? Aren't we going to to swap a lot -
something that the heuristic is protecting from?

> Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
> Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
> Signed-off-by: Kuo-Hsin Yang <vovoy@chromium.org>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Cc: <stable@vger.kernel.org> # 4.12+
Kuo-Hsin Yang July 4, 2019, 9:47 a.m. UTC | #2
On Wed, Jul 03, 2019 at 04:30:57PM +0200, Michal Hocko wrote:
> 
> How does the reclaim behave with workloads with file backed data set
> not fitting into the memory? Aren't we going to to swap a lot -
> something that the heuristic is protecting from?
> 

In common case, most of the pages in a large file backed data set are
non-executable. When there are a lot of non-executable file pages,
usually more file pages are scanned because of the recent_scanned /
recent_rotated ratio.

I modified the test program to set the accessed sizes of the executable
and non-executable file pages respectively. The test program runs on 2GB
RAM VM with kernel 5.2.0-rc7 and this patch, allocates 2000 MB anonymous
memory, then accesses 100 MB executable file pages and 2100 MB
non-executable file pages for 10 times. The test also prints the file
and anonymous page sizes in kB from /proc/meminfo. There are not too
many swaps in this test case. I got similar test result without this
patch.

$ ./thrash 2000 100 2100 10
Allocate 2000 MB anonymous pages
Active(anon): 1850964, Inactive(anon): 133140, Active(file): 1528, Inactive(file): 1352
Access 100 MB executable file pages
Access 2100 MB regular file pages
File access time, round 0: 26.833665 (sec)
Active(anon): 1476084, Inactive(anon): 492060, Active(file): 2236, Inactive(file): 2224
File access time, round 1: 26.362102 (sec)
Active(anon): 1471364, Inactive(anon): 490464, Active(file): 8508, Inactive(file): 8172
File access time, round 2: 26.828894 (sec)
Active(anon): 1469184, Inactive(anon): 489688, Active(file): 10012, Inactive(file): 9840
File access time, round 3: 27.105603 (sec)
Active(anon): 1468128, Inactive(anon): 489408, Active(file): 11000, Inactive(file): 10388
File access time, round 4: 26.936500 (sec)
Active(anon): 1466380, Inactive(anon): 488788, Active(file): 12872, Inactive(file): 12504
File access time, round 5: 26.294687 (sec)
Active(anon): 1466384, Inactive(anon): 488780, Active(file): 13332, Inactive(file): 12396
File access time, round 6: 27.382404 (sec)
Active(anon): 1466344, Inactive(anon): 488772, Active(file): 13100, Inactive(file): 12276
File access time, round 7: 26.607976 (sec)
Active(anon): 1466392, Inactive(anon): 488764, Active(file): 12892, Inactive(file): 11928
File access time, round 8: 26.477663 (sec)
Active(anon): 1466344, Inactive(anon): 488760, Active(file): 12920, Inactive(file): 12092
File access time, round 9: 26.552859 (sec)
Active(anon): 1465820, Inactive(anon): 488748, Active(file): 13300, Inactive(file): 12372
Michal Hocko July 4, 2019, 11:04 a.m. UTC | #3
On Thu 04-07-19 17:47:16, Kuo-Hsin Yang wrote:
> On Wed, Jul 03, 2019 at 04:30:57PM +0200, Michal Hocko wrote:
> > 
> > How does the reclaim behave with workloads with file backed data set
> > not fitting into the memory? Aren't we going to to swap a lot -
> > something that the heuristic is protecting from?
> > 
> 
> In common case, most of the pages in a large file backed data set are
> non-executable. When there are a lot of non-executable file pages,
> usually more file pages are scanned because of the recent_scanned /
> recent_rotated ratio.
> 
> I modified the test program to set the accessed sizes of the executable
> and non-executable file pages respectively. The test program runs on 2GB
> RAM VM with kernel 5.2.0-rc7 and this patch, allocates 2000 MB anonymous
> memory, then accesses 100 MB executable file pages and 2100 MB
> non-executable file pages for 10 times. The test also prints the file
> and anonymous page sizes in kB from /proc/meminfo. There are not too
> many swaps in this test case. I got similar test result without this
> patch.

Could you record swap out stats please? Also what happens if you have
multiple readers?

Thanks!
Kuo-Hsin Yang July 5, 2019, 12:45 p.m. UTC | #4
On Thu, Jul 04, 2019 at 01:04:25PM +0200, Michal Hocko wrote:
> On Thu 04-07-19 17:47:16, Kuo-Hsin Yang wrote:
> > On Wed, Jul 03, 2019 at 04:30:57PM +0200, Michal Hocko wrote:
> > > 
> > > How does the reclaim behave with workloads with file backed data set
> > > not fitting into the memory? Aren't we going to to swap a lot -
> > > something that the heuristic is protecting from?
> > > 
> > 
> > In common case, most of the pages in a large file backed data set are
> > non-executable. When there are a lot of non-executable file pages,
> > usually more file pages are scanned because of the recent_scanned /
> > recent_rotated ratio.
> > 
> > I modified the test program to set the accessed sizes of the executable
> > and non-executable file pages respectively. The test program runs on 2GB
> > RAM VM with kernel 5.2.0-rc7 and this patch, allocates 2000 MB anonymous
> > memory, then accesses 100 MB executable file pages and 2100 MB
> > non-executable file pages for 10 times. The test also prints the file
> > and anonymous page sizes in kB from /proc/meminfo. There are not too
> > many swaps in this test case. I got similar test result without this
> > patch.
> 
> Could you record swap out stats please? Also what happens if you have
> multiple readers?

Checked the swap out stats during the test [1], 19006 pages swapped out
with this patch, 3418 pages swapped out without this patch. There are
more swap out, but I think it's within reasonable range when file backed
data set doesn't fit into the memory.

$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES
Allocate 2000 MB anonymous pages
active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
pswpout: 7972443, pgpgin: 478615246
Access 100 MB executable file pages
Access 2100 MB regular file pages
File access time, round 0: 12.165, (sec)
active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB)
File access time, round 1: 11.493, (sec)
active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB)
File access time, round 2: 11.455, (sec)
active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB)
File access time, round 3: 11.454, (sec)
active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB)
File access time, round 4: 11.479, (sec)
active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB)
pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120)

With 4 processes accessing non-overlapping parts of a large file, 30316
pages swapped out with this patch, 5152 pages swapped out without this
patch. The swapout number is small comparing to pgpgin.

[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
Michal Hocko July 12, 2019, 7:13 a.m. UTC | #5
On Fri 05-07-19 20:45:05, Kuo-Hsin Yang wrote:
> With 4 processes accessing non-overlapping parts of a large file, 30316
> pages swapped out with this patch, 5152 pages swapped out without this
> patch. The swapout number is small comparing to pgpgin.

which is 5 times more swapout. This may be seen to be a lot for
workloads that prefer no swapping (e.g. large in memory databases) with
an occasional heavy IO (e.g. backup). And I am worried those would
regress. I do agree that the current behavior is far from optimal
because the trashing is real. I believe that we really need a different
approach. Johannes has brought this up few years back (sorry I do not
have a link handy) but it was essentially about implementing refault
logic to anonymous memory and swap out based on the refault price. If
there is effectively no swapin then it simply makes more sense to swap
out rather than refault a page cache.

That being said, I am not nacking the patch. Let's see whether something
regresses as there is a no clear cut for the proper behavior. But I am
bringing that up because we really need a better and more robust plan
for the future.
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7889f583ced9f..da0b97204372e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2125,7 +2125,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-				 struct scan_control *sc, bool actual_reclaim)
+				 struct scan_control *sc, bool trace)
 {
 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2151,7 @@  static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	 * rid of the stale workingset quickly.
 	 */
 	refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-	if (file && actual_reclaim && lruvec->refaults != refaults) {
+	if (file && lruvec->refaults != refaults) {
 		inactive_ratio = 0;
 	} else {
 		gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2161,7 @@  static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 			inactive_ratio = 1;
 	}
 
-	if (actual_reclaim)
+	if (trace)
 		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
 			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
 			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,