Message ID | 20181010071924.18767-1-ying.huang@intel.com (mailing list archive) |
---|---|
Headers | show |
Series | swap: Swapout/swapin THP in one piece | expand |
On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote: > And for all, Any comment is welcome! > > This patchset is based on the 2018-10-3 head of mmotm/master. There seems to be some infrequent memory corruption with THPs that have been swapped out: page contents differ after swapin. Reproducer at the bottom. Part of some tests I'm writing, had to separate it a little hack-ily. Basically it writes the word offset _at_ each word offset in a memory blob, tries to push it to swap, and verifies the offset is the same after swapin. I ran with THP enabled=always. THP swapin_enabled could be always or never, it happened with both. Every time swapping occurred, a single THP-sized chunk in the middle of the blob had different offsets. Example: ** > word corruption gap ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) ** ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) ** ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) ** ...pattern continues... ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) ** ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) ** ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) ** 100.0% of memory was swapped out at mincore time 0.00305% of pages were corrupted (first corrupt word 14929920, last corrupt word 17027064) The problem goes away with THP enabled=never, and I don't see it on 2018-10-3 mmotm/master with THP enabled=always. The server had an NVMe swap device and ~760G memory over two nodes, and the program was always run like this: swap-verify -s $((64 * 2**30)) The kernels had one extra patch, Alexander Duyck's "dma-direct: Fix return value of dma_direct_supported", which was required to get them to build. 
---------------------------------------8<--------------------------------------- /* * swap-verify.c - helper to verify contents of swapped out pages * * Daniel Jordan <daniel.m.jordan@oracle.com> */ #define _GNU_SOURCE #include <getopt.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> #include <unistd.h> #define TEST_SUCCESS 0 #define TEST_FAIL 1 #define TEST_SKIP 2 static void usagedie(int exitcode) { fprintf(stderr, "usage: swap-verify\n" " -h show this message\n" " -s bytes\n"); exit(exitcode); } int main(int argc, char **argv) { int c, pgsize; char *pages; unsigned char *mincore_vec; size_t i, j, nr_pages_swapped, nr_pages_corrupted; size_t bytes = 1ul << 30; /* default 1G */ ssize_t bytes_read; size_t first_corrupt_word, last_corrupt_word, prev_corrupt_word; while ((c = getopt(argc, argv, "hs:")) != -1) { switch (c) { case 'h': usagedie(0); break; case 's': bytes = strtoul(optarg, NULL, 10); break; default: fprintf(stderr, "unrecognized option %c\n", c); exit(TEST_SKIP); } } pgsize = getpagesize(); if ((mincore_vec = calloc(bytes / pgsize, 1)) == NULL) { perror("calloc"); exit(TEST_SKIP); } if ((pages = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) { perror("mmap"); exit(TEST_SKIP); } /* Fill pages with a "random" pattern. */ for (i = 0; i < bytes; i += sizeof(unsigned long)) *(unsigned long *)(pages + i) = i; /* Now fill memory, trying to push the pages just allocated to swap. */ system("./use-mem-total"); /* Is the memory swapped out? */ if (mincore(pages, bytes, mincore_vec) == -1) { perror("mincore"); exit(TEST_SKIP); } nr_pages_swapped = 0; nr_pages_corrupted = 0; first_corrupt_word = bytes; last_corrupt_word = 0; prev_corrupt_word = bytes; for (i = 0; i < bytes; i += pgsize) { bool page_corrupt = false; if (mincore_vec[i / pgsize] & 1) { /* Resident, don't bother checking. 
*/ continue; } ++nr_pages_swapped; for (j = i; j < i + pgsize; j += sizeof(unsigned long)) { unsigned long val = *(unsigned long *)(pages + j); if (val != j) { if (!page_corrupt) ++nr_pages_corrupted; page_corrupt = true; if (j - prev_corrupt_word != sizeof(unsigned long)) fprintf(stderr, "** > word corruption gap\n"); if (j % (1ul << 21) == 0) fprintf(stderr, "-- THP boundary\n"); if (j < first_corrupt_word) first_corrupt_word = j; if (j > last_corrupt_word) last_corrupt_word = j; fprintf(stderr, "** corruption detected %lu " "bytes in (got %lu, expected %lu) **\n", j, val, j); prev_corrupt_word = j; } } } fprintf(stderr, "%.1f%% of memory was swapped out at mincore time\n", ((double)nr_pages_swapped / (bytes / pgsize)) * 100); if (nr_pages_corrupted) { fprintf(stderr, "%.5f%% of pages were corrupted (first corrupt " "word %lu, last corrupt word %lu)\n", ((double)nr_pages_corrupted / (bytes / pgsize)) * 100, first_corrupt_word, last_corrupt_word); } else { fprintf(stderr, "no memory corruption detected\n"); } return (nr_pages_corrupted) ? TEST_FAIL : TEST_SUCCESS; } ---------------------------------------8<--------------------------------------- #!/usr/bin/env bash # # use-mem-total # # Helper that allocates MemTotal and exits immediately. Useful for causing # swapping of previously allocated memory. # XXX fix paths source /path/to/vm-scalability/hw_vars /path/to/usemem --thread $nr_task --step $pagesize -q --repeat 4 \ $(( mem * 11 / 10 / nr_task )) > /dev/null ---------------------------------------8<---------------------------------------
Hi, Daniel, Daniel Jordan <daniel.m.jordan@oracle.com> writes: > On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote: >> And for all, Any comment is welcome! >> >> This patchset is based on the 2018-10-3 head of mmotm/master. > > There seems to be some infrequent memory corruption with THPs that have been > swapped out: page contents differ after swapin. Thanks a lot for testing this! I know there were big effort behind this and it definitely will improve the quality of the patchset greatly! > Reproducer at the bottom. Part of some tests I'm writing, had to separate it a > little hack-ily. Basically it writes the word offset _at_ each word offset in > a memory blob, tries to push it to swap, and verifies the offset is the same > after swapin. > > I ran with THP enabled=always. THP swapin_enabled could be always or never, it > happened with both. Every time swapping occurred, a single THP-sized chunk in > the middle of the blob had different offsets. Example: > > ** > word corruption gap > ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) ** > ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) ** > ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) ** > ...pattern continues... > ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) ** > ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) ** > ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) ** 15179776 < 15179xxx <= 17027064 15179776 % 4096 = 0 And 15179776 = 15179768 + 8 So I guess we have some alignment bug. Could you try the patches attached? It deal with some alignment issue. > 100.0% of memory was swapped out at mincore time > 0.00305% of pages were corrupted (first corrupt word 14929920, last corrupt word 17027064) > > The problem goes away with THP enabled=never, and I don't see it on 2018-10-3 > mmotm/master with THP enabled=always. 
> > The server had an NVMe swap device and ~760G memory over two nodes, and the > program was always run like this: swap-verify -s $((64 * 2**30)) > > The kernels had one extra patch, Alexander Duyck's > "dma-direct: Fix return value of dma_direct_supported", which was required to > get them to build. > Thanks again! Best Regards, Huang, Ying ---------------------------------->8----------------------------- From e1c3e4f565deeb8245bdc4ee53a1f1e4188b6d4a Mon Sep 17 00:00:00 2001 From: Huang Ying <ying.huang@intel.com> Date: Wed, 24 Oct 2018 11:24:15 +0800 Subject: [PATCH] Fix alignment bug --- include/linux/huge_mm.h | 6 ++---- mm/huge_memory.c | 9 ++++----- mm/swap_state.c | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 96baae08f47c..e7b3527bc493 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -379,8 +379,7 @@ struct page_vma_mapped_walk; #ifdef CONFIG_THP_SWAP extern void __split_huge_swap_pmd(struct vm_area_struct *vma, - unsigned long haddr, - pmd_t *pmd); + unsigned long addr, pmd_t *pmd); extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, pmd_t orig_pmd); extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -411,8 +410,7 @@ static inline bool transparent_hugepage_swapin_enabled( } #else /* CONFIG_THP_SWAP */ static inline void __split_huge_swap_pmd(struct vm_area_struct *vma, - unsigned long haddr, - pmd_t *pmd) + unsigned long addr, pmd_t *pmd) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ed64266b63dc..b2af3bff7624 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1731,10 +1731,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) #ifdef CONFIG_THP_SWAP /* Convert a PMD swap mapping to a set of PTE swap mappings */ void __split_huge_swap_pmd(struct vm_area_struct *vma, - unsigned long haddr, + unsigned long addr, pmd_t *pmd) { struct mm_struct *mm = 
vma->vm_mm; + unsigned long haddr = addr & HPAGE_PMD_MASK; pgtable_t pgtable; pmd_t _pmd; swp_entry_t entry; @@ -1772,7 +1773,7 @@ int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd, ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, orig_pmd)) - __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd); + __split_huge_swap_pmd(vma, address, pmd); else ret = -ENOENT; spin_unlock(ptl); @@ -2013,9 +2014,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * swap mapping and operate on the PTEs */ if (next - addr != HPAGE_PMD_SIZE) { - unsigned long haddr = addr & HPAGE_PMD_MASK; - - __split_huge_swap_pmd(vma, haddr, pmd); + __split_huge_swap_pmd(vma, addr, pmd); goto out; } free_swap_and_cache(entry, HPAGE_PMD_NR); diff --git a/mm/swap_state.c b/mm/swap_state.c index 784ad6388da0..fd143ef82351 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -451,7 +451,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + err = add_to_swap_cache(new_page, hentry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /* Initiate read into locked page */ SetPageWorkingset(new_page);
On Wed, Oct 24, 2018 at 11:31:42AM +0800, Huang, Ying wrote: > Hi, Daniel, > > Daniel Jordan <daniel.m.jordan@oracle.com> writes: > > > On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote: > >> And for all, Any comment is welcome! > >> > >> This patchset is based on the 2018-10-3 head of mmotm/master. > > > > There seems to be some infrequent memory corruption with THPs that have been > > swapped out: page contents differ after swapin. > > Thanks a lot for testing this! I know there were big effort behind this > and it definitely will improve the quality of the patchset greatly! You're welcome! Hopefully I'll have more results and tests to share in the next two weeks. > > > Reproducer at the bottom. Part of some tests I'm writing, had to separate it a > > little hack-ily. Basically it writes the word offset _at_ each word offset in > > a memory blob, tries to push it to swap, and verifies the offset is the same > > after swapin. > > > > I ran with THP enabled=always. THP swapin_enabled could be always or never, it > > happened with both. Every time swapping occurred, a single THP-sized chunk in > > the middle of the blob had different offsets. Example: > > > > ** > word corruption gap > > ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) ** > > ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) ** > > ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) ** > > ...pattern continues... > > ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) ** > > ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) ** > > ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) ** > > 15179776 < 15179xxx <= 17027064 > > 15179776 % 4096 = 0 > > And 15179776 = 15179768 + 8 > > So I guess we have some alignment bug. Could you try the patches > attached? It deal with some alignment issue. That fixed it. And removed three lines of code. Nice :)
Daniel Jordan <daniel.m.jordan@oracle.com> writes: > On Wed, Oct 24, 2018 at 11:31:42AM +0800, Huang, Ying wrote: >> Hi, Daniel, >> >> Daniel Jordan <daniel.m.jordan@oracle.com> writes: >> >> > On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote: >> >> And for all, Any comment is welcome! >> >> >> >> This patchset is based on the 2018-10-3 head of mmotm/master. >> > >> > There seems to be some infrequent memory corruption with THPs that have been >> > swapped out: page contents differ after swapin. >> >> Thanks a lot for testing this! I know there were big effort behind this >> and it definitely will improve the quality of the patchset greatly! > > You're welcome! Hopefully I'll have more results and tests to share in the > next two weeks. > >> >> > Reproducer at the bottom. Part of some tests I'm writing, had to separate it a >> > little hack-ily. Basically it writes the word offset _at_ each word offset in >> > a memory blob, tries to push it to swap, and verifies the offset is the same >> > after swapin. >> > >> > I ran with THP enabled=always. THP swapin_enabled could be always or never, it >> > happened with both. Every time swapping occurred, a single THP-sized chunk in >> > the middle of the blob had different offsets. Example: >> > >> > ** > word corruption gap >> > ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) ** >> > ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) ** >> > ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) ** >> > ...pattern continues... >> > ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) ** >> > ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) ** >> > ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) ** >> >> 15179776 < 15179xxx <= 17027064 >> >> 15179776 % 4096 = 0 >> >> And 15179776 = 15179768 + 8 >> >> So I guess we have some alignment bug. 
Could you try the patches >> attached? It deal with some alignment issue. > > That fixed it. And removed three lines of code. Nice :) Thanks! I will merge the fixes into the patchset. Best Regards, Huang, Ying
On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote:
> And for all, Any comment is welcome!
Hi Ying,
This looks like an edge case. I ran the program at the bottom like this:
./stress-usage-counts -l 4 -s 4 -U 780g
where the 780g was big enough to cause swapping on the machine. This allocates
a bunch of THPs in the parent and then forks children that either unmap pieces
of the THPs and then do random reads in the pieces still mapped, or just
randomly read in the whole range without unmapping anything.
FYI, I had your patch from the other thread applied.
Thanks,
Daniel
[15384.814483] ------------[ cut here ]------------
[15384.820622] kernel BUG at /home/dbbench/src/linux/mm/swapfile.c:4134!
[15384.828793] invalid opcode: 0000 [#1] SMP PTI
[15384.834604] CPU: 15 PID: 27456 Comm: stress-usage-co Kdump: loaded Not tainted 4.19.0-rc6-mm1-thp-swap-v6-gcov+ #3
[15384.847096] Hardware name: Oracle Corporation ORACLE SERVER X7-2/ASM, MB, X7-2, BIOS 41017600 10/06/2017
[15384.858637] RIP: 0010:split_swap_cluster_map+0x172/0x1d0
[15384.865493] Code: 89 4c 01 01 e9 2a ff ff ff 5b 5d 31 c0 48 83 05 1b 89 4c 01 01 41 5c c3 b8 f0 ff ff ff e9 37 ff ff ff 48 83 05 0e 88 4c 01 01 <0f> 0b 48 83 05 14 88 4c 01 01 48 83 05 14 88 4c 01 01 48 83 05 14
[15384.888329] RSP: 0018:ffffaca85fb9bc88 EFLAGS: 00010202
[15384.895075] RAX: 0000000000000000 RBX: 00007f0463800000 RCX: 0000000000000000
[15384.903964] RDX: ffff9154229e28e0 RSI: 00007f0463800000 RDI: 0000000000194ff8
[15384.912834] RBP: ffff9154229e28e0 R08: 0000000000000000 R09: 000fffffffe00000
[15384.921694] R10: 0000000000000000 R11: ffff90f8000008e0 R12: 0000000000194ff8
[15384.930533] R13: ffff9156a8bb1100 R14: ffff915168646c00 R15: ffff9156a8bb1100
[15384.939363] FS: 00007fc763ff5740(0000) GS:ffff9156c07c0000(0000) knlGS:0000000000000000
[15384.949272] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[15384.956567] CR2: 0000000000603090 CR3: 0000005a9a2f8003 CR4: 00000000007606e0
[15384.965373] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[15384.974164] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[15384.982943] PKRU: 55555554
[15384.986756] Call Trace:
[15384.990268] __split_huge_swap_pmd+0x48/0x170
[15384.995928] __split_huge_pmd_locked+0x8be/0x1590
[15385.001933] ? flush_tlb_mm_range+0xa1/0x120
[15385.007439] ? memcg_check_events+0x2f/0x2e0
[15385.012949] __split_huge_pmd+0x2d6/0x3e0
[15385.018135] split_huge_pmd_address+0xbd/0x100
[15385.023794] vma_adjust_trans_huge+0xe0/0x150
[15385.029344] __vma_adjust+0xb8/0x770
[15385.034004] __split_vma+0x182/0x1a0
[15385.038647] __do_munmap+0xfd/0x340
[15385.043182] __vm_munmap+0x6d/0xc0
[15385.047600] __x64_sys_munmap+0x27/0x30
[15385.052509] do_syscall_64+0x49/0x100
[15385.057206] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[15385.063430] RIP: 0033:0x7fc7638fa087
[15385.068018] Code: 64 89 02 48 83 c8 ff eb 9c 48 8b 15 03 ce 2c 00 f7 d8 64 89 02 e9 6a ff ff ff 66 0f 1f 84 00 00 00 00 00 b8 0b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d d9 cd 2c 00 f7 d8 64 89 01 48
[15385.090123] RSP: 002b:00007ffec5d40198 EFLAGS: 00000202 ORIG_RAX: 000000000000000b
[15385.099142] RAX: ffffffffffffffda RBX: 000000005be4d221 RCX: 00007fc7638fa087
[15385.107700] RDX: 00007f0463960000 RSI: 00000000000216d6 RDI: 00007f0463960000
[15385.116223] RBP: 00007ffec5d401f0 R08: 0000000000006ab7 R09: 00007ffec5d401f0
[15385.124738] R10: 00007ffec5d3f5a0 R11: 0000000000000202 R12: 0000000000400ca0
[15385.133217] R13: 00007ffec5d40340 R14: 0000000000000000 R15: 0000000000000000
[15385.141684] Modules linked in: sunrpc vfat fat coretemp x86_pkg_temp_thermal crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 ext4 crypto_simd cryptd glue_helper jbd2 ext2 mbcache ipmi_ssif ipmi_si ioatdma ipmi_devintf sg iTCO_wdt lpc_ich pcspkr wmi mfd_core i2c_i801 ipmi_msghandler ip_tables xfs libcrc32c sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops igb ttm nvme hwmon xhci_pci drm dca megaraid_sas xhci_hcd crc32c_intel nvme_core i2c_algo_bit ahci i2c_core libahci dm_mirror dm_region_hash dm_log dm_mod dax efivarfs ipv6 crc_ccitt autofs4
--------------------------------------8<---------------------------------------
/*
* stress-usage-counts.c
*
* gcc -o stress-usage-counts stress-usage-counts.c -pthread
*
* Daniel Jordan <daniel.m.jordan@oracle.com>
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <assert.h>
#include <fcntl.h>
#include <pthread.h>
#include <time.h>
/*
 * Clear the low bits of x selected by (a - 1).  This rounds x DOWN to a
 * multiple of a when a is a power of two -- presumably always the case for
 * the page sizes passed in this file; TODO confirm.
 */
#define ALIGN(x, a) ((x) & ~((a) - 1))
/* Set to nonzero to make dprintf() calls actually print. */
#define DEBUG 0
/*
 * Debug printf: expands to "if (DEBUG) printf", so the call is still compiled
 * when DEBUG is 0.  NOTE(review): being a bare "if", it will capture a
 * following "else" if used as the body of an unbraced if/else.
 */
#define dprintf if (DEBUG) printf
/* File from which main() reads the THP size via read_sysfs_ul(). */
#define THP_PGSZ_SYSFS "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
/* Taken from include/linux/kernel.h */
/* Mask of the bits below y, in the type of x. */
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
/* Round x up to a multiple of y; the mask trick assumes y is a power of two. */
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
/* pthread start-routine type; fault_all() casts fault_thread to it. */
typedef void * (*start_routine)(void *);
/* argv[0], printed by usage(). */
char *ourname;
/* Size of the test range, parsed from the positional argument. */
size_t bytes;
/* Start of the test range: memory_unaligned rounded up to thp_size. */
char *memory;
/* Raw address returned by mmap() in main(). */
char *memory_unaligned;
/* Scratch token pointer used while parsing the -u argument. */
char *munmap_arg;
/* Length passed to munmap() in small-page children; 0 disables unmapping. */
unsigned long munmap_size;
/* Offset within each thp_size stride at which small-page children munmap. */
unsigned long munmap_offset;
/* Set by -U: main() picks munmap_offset and munmap_size at random. */
int random_munmap;
/* sysconf(_SC_PAGESIZE). */
size_t pagesize;
/* Value read from THP_PGSZ_SYSFS. */
unsigned long thp_size;
/* sysconf(_SC_NPROCESSORS_ONLN); number of threads fault_all() starts. */
size_t nr_thread;
/*
 * Print the command-line synopsis to stderr and terminate the process.
 * @ok is passed straight to exit() as the exit status.
 */
static void usage(int ok)
{
	static const char option_help[] =
		" -h show this message\n"
		" -l N start N large-page processes, default 1\n"
		" -s N start N small-page processes, default 1\n"
		" -u offset,size munmap in small-page procs at each offset of size\n"
		" -U munmap in small-page procs at random offset and size\n";

	fprintf(stderr, "Usage: %s [options] size[k|m|g|t]\n", ourname);
	fputs(option_help, stderr);
	exit(ok);
}
/**
 * [copied from usemem.c - dmj]
 * memparse - parse a string with mem suffixes into a number
 * @ptr: Where parse begins
 * @retptr: (output) Optional pointer to next char after parse completes
 *
 * Parses a string into a number.  The digits are converted by strtoull()
 * with base 0, so decimal, octal ("0...") and hex ("0x...") are accepted.
 * The number may be followed by one suffix character, in either case:
 * %K (x 2^10), %M (x 2^20), %G (x 2^30) or %T (x 2^40).  A recognized
 * suffix is consumed; any other character is left for the caller.
 *
 * Return: the scaled value.  The multiplication is an unchecked left shift.
 */
static unsigned long long memparse(const char *ptr, char **retptr)
{
	char *rest;	/* first character strtoull() did not consume */
	unsigned long long value = strtoull(ptr, &rest, 0);
	unsigned int shift;

	if (*rest == 'T' || *rest == 't')
		shift = 40;
	else if (*rest == 'G' || *rest == 'g')
		shift = 30;
	else if (*rest == 'M' || *rest == 'm')
		shift = 20;
	else if (*rest == 'K' || *rest == 'k')
		shift = 10;
	else
		shift = 0;

	if (shift != 0) {
		value <<= shift;
		rest++;		/* step over the suffix character */
	}

	if (retptr)
		*retptr = rest;
	return value;
}
/*
 * Read the start of file @fname and return the decimal number found there.
 *
 * At most sizeof(buf) - 1 bytes are read in a single read() call.  The
 * process exits with status 1 if the file cannot be opened or read.
 */
static unsigned long read_sysfs_ul(const char *fname)
{
	int fd;
	ssize_t len;
	char buf[64];

	fd = open(fname, O_RDONLY);
	if (fd == -1) {
		perror("sysfs open");
		exit(1);
	}

	len = read(fd, buf, sizeof(buf) - 1);
	if (len == -1) {
		perror("sysfs read");
		exit(1);
	}
	/* read() does not terminate the data; strtoul() needs a C string. */
	buf[len] = '\0';

	/* Done with the descriptor; don't leak it into forked children. */
	close(fd);

	return strtoul(buf, NULL, 10);
}
/* Seed the caller-owned generator state @rs from @seed via srand48_r(). */
static inline void os_random_seed(unsigned long seed, struct drand48_data *rs)
{
	(void)srand48_r(seed, rs);
}
/*
 * Draw the next value from generator state @rs with lrand48_r() and scale
 * it by @max / (RAND_MAX + 1.0); the fractional part is truncated.
 */
static inline long os_random_long(unsigned long max, struct drand48_data *rs)
{
	long draw;
	double scaled;

	lrand48_r(rs, &draw);
	scaled = (double)max * draw / (RAND_MAX + 1.0);

	return (unsigned long)scaled;
}
/* Half-open range [start, end) of byte offsets handed to one fault thread. */
struct fault_range {
	size_t start;	/* first offset touched */
	size_t end;	/* touching stops before this offset */
};
/*
 * Write one byte at every pagesize step of the global `memory` buffer within
 * [range->start, range->end).
 *
 * @arg: pointer to a struct fault_range.
 *
 * Returns 0.  The function is also used as a pthread start routine, where the
 * return value is handed to pthread_join(), so it must not be left undefined.
 */
static long fault_thread(void *arg)
{
	size_t i;
	struct fault_range *range = (struct fault_range *)arg;

	for (i = range->start; i < range->end; i += pagesize)
		memory[i] = 'b';

	return 0;
}
/*
 * Touch every page of the test range, splitting the work evenly across
 * nr_thread threads, and wait for all of them to finish.
 *
 * @memory: not referenced here -- fault_thread() writes through the global
 *          `memory` pointer instead.  NOTE(review): kept for interface
 *          compatibility.
 * @bytes:  length of the range to touch.
 *
 * When there are more threads than bytes the range is touched inline by the
 * calling thread.  Exits with status 1 if a thread cannot be created or
 * joined.
 */
static void fault_all(char *memory, size_t bytes)
{
	int ret;
	size_t i;
	void *thread_ret;
	pthread_t threads[nr_thread];
	struct fault_range ranges[nr_thread];

	if (nr_thread > bytes) {
		ranges[0].start = 0;
		ranges[0].end = bytes;
		fault_thread(&ranges[0]);
		return;
	}

	for (i = 0; i < nr_thread; i++) {
		ranges[i].start = bytes * i / nr_thread;
		ranges[i].end = bytes * (i + 1) / nr_thread;
		ret = pthread_create(&threads[i], NULL,
				     (start_routine)fault_thread, &ranges[i]);
		if (ret) {
			/* pthread calls return the error; errno is not set. */
			fprintf(stderr, "pthread_create: %s\n", strerror(ret));
			exit(1);
		}
	}

	for (i = 0; i < nr_thread; i++) {
		ret = pthread_join(threads[i], &thread_ret);
		if (ret) {
			fprintf(stderr, "pthread_join: %s\n", strerror(ret));
			exit(1);
		}
	}

	dprintf("done faulting\n");
}
/*
 * Load the byte at offset @idx of the global `memory` buffer.  The value is
 * stored into a volatile local so the compiler has to perform the load.
 */
static void read_memory(size_t idx)
{
	volatile char sink;

	sink = memory[idx];
}
/*
 * Body of a "small-page" child process.
 *
 * If munmap_size is nonzero, unmap munmap_size bytes at offset munmap_offset
 * of every thp_size stride of the test range.  Then loop forever: pick a
 * random pagesize-aligned offset, skip it if it lies in an unmapped hole,
 * otherwise read one byte there and sleep for 1000ns.
 *
 * Never returns normally; exits with status 1 if munmap() or nanosleep()
 * fails.
 */
int do_small_page_task(void)
{
	size_t i;
	struct drand48_data rand_data;

	os_random_seed(time(0) ^ syscall(SYS_gettid), &rand_data);

	/* Unmap parts of the range? */
	if (munmap_size) {
		assert(munmap_offset % pagesize == 0);
		for (i = munmap_offset; i < bytes; i += thp_size) {
			dprintf("munmap(%p, %lx)\n", (void *)(memory + i),
				munmap_size);
			if (munmap(memory + i, munmap_size) == -1) {
				fprintf(stderr, "munmap failed: %s\n",
					strerror(errno));
				exit(1);
			}
		}
	}

	while (1) {
		struct timespec ts;
		size_t thp_offset;

		i = ALIGN(os_random_long(bytes, &rand_data), pagesize);
		thp_offset = i % thp_size;

		/*
		 * Skip pages inside the hole.  The end is exclusive: both
		 * offsets are page aligned, so the page at
		 * munmap_offset + munmap_size is still mapped.  With no hole
		 * (munmap_size == 0) nothing is skipped.
		 */
		if (thp_offset >= munmap_offset &&
		    thp_offset < munmap_offset + munmap_size)
			continue;

		read_memory(i);

		ts.tv_sec = 0;
		ts.tv_nsec = 1000;
		if (nanosleep(&ts, NULL) == -1) {
			fprintf(stderr, "nanosleep failed: %s\n", strerror(errno));
			exit(1);
		}
	}
}
/*
 * Body of a "large-page" child process: loop forever reading one byte at a
 * random thp_size-aligned offset of the test range, sleeping 1000ns between
 * reads.
 *
 * Never returns normally; exits with status 1 if nanosleep() fails.
 */
int do_large_page_task(void)
{
	size_t i;
	struct drand48_data rand_data;
	struct timespec ts;

	os_random_seed(time(0) ^ syscall(SYS_gettid), &rand_data);

	while (1) {
		i = ALIGN(os_random_long(bytes, &rand_data), thp_size);
		read_memory(i);

		ts.tv_sec = 0;
		ts.tv_nsec = 1000;
		if (nanosleep(&ts, NULL) == -1) {
			fprintf(stderr, "nanosleep failed: %s\n", strerror(errno));
			exit(1);
		}
	}
}
/*
 * Parse options, map and fault in the test range, then fork the small-page
 * and large-page children and wait for them.
 *
 * The mapping is made thp_size bytes larger than requested so that `memory`
 * can be rounded up to a thp_size boundary and still have `bytes` usable
 * bytes behind it.
 *
 * Returns 0 once every child has been reaped and 1 on a wait3() error; setup
 * failures exit with status 1.
 */
int main(int argc, char **argv)
{
	int i, c, child_pid, status;
	struct drand48_data rand_data;
	size_t nr_smallpg_procs = 1;
	size_t nr_largepg_procs = 1;

	ourname = argv[0];
	pagesize = sysconf(_SC_PAGESIZE);
	dprintf("pagesize = %lu\n", pagesize);
	thp_size = read_sysfs_ul(THP_PGSZ_SYSFS);
	dprintf("thp_size = %lu\n", thp_size);
	nr_thread = sysconf(_SC_NPROCESSORS_ONLN);
	dprintf("nr_thread = %lu\n", nr_thread);

	while ((c = getopt(argc, argv, "hl:s:u:U")) != -1) {
		switch (c) {
		case 'h':
			usage(0);
			break;	/* not reached: usage() exits */
		case 'l':
			nr_largepg_procs = strtol(optarg, NULL, 10);
			break;
		case 's':
			nr_smallpg_procs = strtol(optarg, NULL, 10);
			break;
		case 'u':
			/* -u takes "offset,size" */
			if ((munmap_arg = strtok(optarg, ",")) == NULL)
				usage(1);
			munmap_offset = memparse(munmap_arg, NULL);
			if ((munmap_arg = strtok(NULL, ",")) == NULL)
				usage(1);
			munmap_size = memparse(munmap_arg, NULL);
			break;
		case 'U':
			random_munmap = 1;
			break;
		default:
			usage(1);
		}
	}

	/* Exactly one positional argument (the size) is required. */
	if (optind != argc - 1)
		usage(1);

	bytes = memparse(argv[optind], NULL);

	if (random_munmap) {
		os_random_seed(time(0) ^ syscall(SYS_gettid), &rand_data);
		munmap_offset = ALIGN(os_random_long(thp_size - 1, &rand_data),
				      pagesize);
		printf("random munmap offset = %lu\n", munmap_offset);
		munmap_size = os_random_long(thp_size - munmap_offset,
					     &rand_data);
		printf("random munmap size = %lu\n", munmap_size);
	}

	memory_unaligned = mmap(NULL, bytes + thp_size, PROT_READ | PROT_WRITE,
				MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (memory_unaligned == MAP_FAILED) {
		fprintf(stderr, "mmap failed: %s\n", strerror(errno));
		exit(1);
	}

	/*
	 * Advise the whole mapping: the range actually used starts at the
	 * rounded-up `memory`, so it can extend past memory_unaligned + bytes.
	 */
	if (madvise(memory_unaligned, bytes + thp_size, MADV_HUGEPAGE) == -1) {
		fprintf(stderr, "madvise failed: %s\n", strerror(errno));
		exit(1);
	}

	memory = (char *)round_up((unsigned long)memory_unaligned, thp_size);
	printf("mmap(%p, %zu)\n", (void *)memory, bytes);

	/* fault it all in */
	fault_all(memory, bytes);

	for (i = 0; i < nr_smallpg_procs; i++) {
		if ((child_pid = fork()) == 0)
			return do_small_page_task();
		else if (child_pid < 0)
			fprintf(stderr, "failed to fork: %s\n",
				strerror(errno));
	}

	for (i = 0; i < nr_largepg_procs; i++) {
		if ((child_pid = fork()) == 0)
			return do_large_page_task();
		else if (child_pid < 0)
			fprintf(stderr, "failed to fork: %s\n",
				strerror(errno));
	}

	for (i = 0; i < nr_smallpg_procs + nr_largepg_procs; i++) {
		if (wait3(&status, 0, 0) < 0) {
			if (errno != EINTR) {
				printf("wait3 error on %dth child\n", i);
				perror("wait3");
				return 1;
			}
		}
	}

	dprintf("finished\n");
	return 0;
}
--------------------------------------8<---------------------------------------