Message ID | 20190520035254.57579-7-minchan@kernel.org
---|---
State | New, archived
Series | introduce memory hinting API for external process
[Cc linux-api] On Mon 20-05-19 12:52:53, Minchan Kim wrote: > Currently, process_madvise syscall works for only one address range > so user should call the syscall several times to give hints to > multiple address range. Is that a problem? How big of a problem? Any numbers? > This patch extends process_madvise syscall to support multiple > hints, address ranges and return vaules so user could give hints > all at once. > > struct pr_madvise_param { > int size; /* the size of this structure */ > const struct iovec __user *vec; /* address range array */ > } > > int process_madvise(int pidfd, ssize_t nr_elem, > int *behavior, > struct pr_madvise_param *results, > struct pr_madvise_param *ranges, > unsigned long flags); > > - pidfd > > target process fd > > - nr_elem > > the number of elemenent of array behavior, results, ranges > > - behavior > > hints for each address range in remote process so that user could > give different hints for each range. What is the guarantee of a single call? Do all hints get applied or the first failure backs of? What are the atomicity guarantees? > > - results > > array of buffers to get results for associated remote address range > action. > > - ranges > > array to buffers to have remote process's address ranges to be > processed > > - flags > > extra argument for the future. It should be zero this moment. > > Example) > > struct pr_madvise_param { > int size; > const struct iovec *vec; > }; > > int main(int argc, char *argv[]) > { > struct pr_madvise_param retp, rangep; > struct iovec result_vec[2], range_vec[2]; > int hints[2]; > long ret[2]; > void *addr[2]; > > pid_t pid; > char cmd[64] = {0,}; > addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); > > if (MAP_FAILED == addr[0]) > return 1; > > addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); > > if (MAP_FAILED == addr[1]) > return 1; > > hints[0] = MADV_COLD; > range_vec[0].iov_base = addr[0]; > range_vec[0].iov_len = ALLOC_SIZE; > result_vec[0].iov_base = &ret[0]; > result_vec[0].iov_len = sizeof(long); > retp.vec = result_vec; > retp.size = sizeof(struct pr_madvise_param); > > hints[1] = MADV_COOL; > range_vec[1].iov_base = addr[1]; > range_vec[1].iov_len = ALLOC_SIZE; > result_vec[1].iov_base = &ret[1]; > result_vec[1].iov_len = sizeof(long); > rangep.vec = range_vec; > rangep.size = sizeof(struct pr_madvise_param); > > pid = fork(); > if (!pid) { > sleep(10); > } else { > int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC); > if (pidfd < 0) > return 1; > > /* munmap to make pages private for the child */ > munmap(addr[0], ALLOC_SIZE); > munmap(addr[1], ALLOC_SIZE); > system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); > if (syscall(__NR_process_madvise, pidfd, 2, behaviors, > &retp, &rangep, 0)) > perror("process_madvise fail\n"); > system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); > } > > return 0; > } > > Signed-off-by: Minchan Kim <minchan@kernel.org> > --- > include/uapi/asm-generic/mman-common.h | 5 + > mm/madvise.c | 184 +++++++++++++++++++++---- > 2 files changed, 166 insertions(+), 23 deletions(-) > > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h > index b9b51eeb8e1a..b8e230de84a6 100644 > --- a/include/uapi/asm-generic/mman-common.h > +++ b/include/uapi/asm-generic/mman-common.h > @@ -74,4 +74,9 @@ > #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ > PKEY_DISABLE_WRITE) > > +struct pr_madvise_param { > + int size; /* the size of this 
structure */ > + const struct iovec __user *vec; /* address range array */ > +}; > + > #endif /* __ASM_GENERIC_MMAN_COMMON_H */ > diff --git a/mm/madvise.c b/mm/madvise.c > index af02aa17e5c1..f4f569dac2bd 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > struct page *page; > struct vm_area_struct *vma = walk->vma; > unsigned long next; > + long nr_pages = 0; > > next = pmd_addr_end(addr, end); > if (pmd_trans_huge(*pmd)) { > @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > > ptep_test_and_clear_young(vma, addr, pte); > deactivate_page(page); > + nr_pages++; > + > } > > pte_unmap_unlock(orig_pte, ptl); > + *(long *)walk->private += nr_pages; > cond_resched(); > > return 0; > @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > > static void madvise_cool_page_range(struct mmu_gather *tlb, > struct vm_area_struct *vma, > - unsigned long addr, unsigned long end) > + unsigned long addr, unsigned long end, > + long *nr_pages) > { > struct mm_walk cool_walk = { > .pmd_entry = madvise_cool_pte_range, > .mm = vma->vm_mm, > + .private = nr_pages > }; > > tlb_start_vma(tlb, vma); > @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb, > } > > static long madvise_cool(struct vm_area_struct *vma, > - unsigned long start_addr, unsigned long end_addr) > + unsigned long start_addr, unsigned long end_addr, > + long *nr_pages) > { > struct mm_struct *mm = vma->vm_mm; > struct mmu_gather tlb; > @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma, > > lru_add_drain(); > tlb_gather_mmu(&tlb, mm, start_addr, end_addr); > - madvise_cool_page_range(&tlb, vma, start_addr, end_addr); > + madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages); > tlb_finish_mmu(&tlb, start_addr, end_addr); > > return 0; > @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > int isolated = 0; > struct vm_area_struct *vma = walk->vma; > unsigned long next; > + long nr_pages = 0; > > next = pmd_addr_end(addr, end); > if (pmd_trans_huge(*pmd)) { > @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > list_add(&page->lru, &page_list); > if (isolated >= SWAP_CLUSTER_MAX) { > pte_unmap_unlock(orig_pte, ptl); > - reclaim_pages(&page_list); > + nr_pages += reclaim_pages(&page_list); > isolated = 0; > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > orig_pte = pte; > @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > } > > pte_unmap_unlock(orig_pte, ptl); > - reclaim_pages(&page_list); > + nr_pages += reclaim_pages(&page_list); > cond_resched(); > > + *(long *)walk->private += nr_pages; > return 0; > } > > static void madvise_cold_page_range(struct mmu_gather *tlb, > struct vm_area_struct *vma, > - unsigned long addr, unsigned long end) > + unsigned long addr, unsigned long end, > + long *nr_pages) > { > struct mm_walk warm_walk = { > .pmd_entry = madvise_cold_pte_range, > .mm = vma->vm_mm, > + .private = nr_pages, > }; > > tlb_start_vma(tlb, vma); > @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, > > > static long madvise_cold(struct vm_area_struct *vma, > - unsigned long start_addr, unsigned long end_addr) > + unsigned long start_addr, unsigned long end_addr, > + long *nr_pages) > { > struct mm_struct *mm = vma->vm_mm; > struct mmu_gather tlb; > @@ -532,7 +544,7 @@ static 
long madvise_cold(struct vm_area_struct *vma, > > lru_add_drain(); > tlb_gather_mmu(&tlb, mm, start_addr, end_addr); > - madvise_cold_page_range(&tlb, vma, start_addr, end_addr); > + madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages); > tlb_finish_mmu(&tlb, start_addr, end_addr); > > return 0; > @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior, > static long > madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, > struct vm_area_struct **prev, unsigned long start, > - unsigned long end, int behavior) > + unsigned long end, int behavior, long *nr_pages) > { > switch (behavior) { > case MADV_REMOVE: > @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, > case MADV_WILLNEED: > return madvise_willneed(vma, prev, start, end); > case MADV_COOL: > - return madvise_cool(vma, start, end); > + return madvise_cool(vma, start, end, nr_pages); > case MADV_COLD: > - return madvise_cold(vma, start, end); > + return madvise_cold(vma, start, end, nr_pages); > case MADV_FREE: > case MADV_DONTNEED: > return madvise_dontneed_free(tsk, vma, prev, start, > @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior) > } > > static int madvise_core(struct task_struct *tsk, unsigned long start, > - size_t len_in, int behavior) > + size_t len_in, int behavior, long *nr_pages) > { > unsigned long end, tmp; > struct vm_area_struct *vma, *prev; > @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > > if (start & ~PAGE_MASK) > return error; > + > len = (len_in + ~PAGE_MASK) & PAGE_MASK; > > /* Check to see whether len was rounded up from small -ve to zero */ > @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > blk_start_plug(&plug); > for (;;) { > /* Still start < end. */ > + long pages = 0; > + > error = -ENOMEM; > if (!vma) > goto out; > @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > tmp = end; > > /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ > - error = madvise_vma(tsk, vma, &prev, start, tmp, behavior); > + error = madvise_vma(tsk, vma, &prev, start, tmp, > + behavior, &pages); > if (error) > goto out; > + *nr_pages += pages; > start = tmp; > if (prev && start < prev->vm_end) > start = prev->vm_end; > @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > */ > SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > { > - return madvise_core(current, start, len_in, behavior); > + unsigned long dummy; > + > + return madvise_core(current, start, len_in, behavior, &dummy); > } > > -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > - size_t, len_in, int, behavior) > +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param, > + struct pr_madvise_param *param) > +{ > + u32 size; > + int ret; > + > + memset(param, 0, sizeof(*param)); > + > + ret = get_user(size, &u_param->size); > + if (ret) > + return ret; > + > + if (size > PAGE_SIZE) > + return -E2BIG; > + > + if (!size || size > sizeof(struct pr_madvise_param)) > + return -EINVAL; > + > + ret = copy_from_user(param, u_param, size); > + if (ret) > + return -EFAULT; > + > + return ret; > +} > + > +static int process_madvise_core(struct task_struct *tsk, int *behaviors, > + struct iov_iter *iter, > + const struct iovec *range_vec, > + unsigned long riovcnt, > + unsigned long flags) > +{ > + int i; > + long err; > + > + for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) { > + long ret = 0; > + > + err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base, > + range_vec[i].iov_len, behaviors[i], > + &ret); > + if (err) > + ret = err; > + > + if (copy_to_iter(&ret, sizeof(long), iter) != > + sizeof(long)) { > + err = -EFAULT; > + break; > + } > + > + err = 0; > + } > + > + return err; > +} > + > +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem, > + const int __user *, hints, > + struct pr_madvise_param __user *, results, > + struct pr_madvise_param __user *, ranges, > + unsigned long, flags) > { > int ret; > struct fd f; > struct pid *pid; > struct task_struct *tsk; > struct mm_struct *mm; > + struct pr_madvise_param result_p, range_p; > + const struct iovec __user *result_vec, __user *range_vec; > + int *behaviors; > + struct iovec iovstack_result[UIO_FASTIOV]; > + struct iovec iovstack_r[UIO_FASTIOV]; > + struct iovec *iov_l = iovstack_result; > + struct iovec *iov_r = iovstack_r; > + struct iov_iter iter; > + > + if (flags != 0) > + return -EINVAL; > + > + ret = pr_madvise_copy_param(results, &result_p); > + if (ret) > + return ret; > + > + ret = pr_madvise_copy_param(ranges, &range_p); > + if (ret) > + return ret; > + > + result_vec = result_p.vec; > + range_vec = range_p.vec; > + > + if (result_p.size != sizeof(struct pr_madvise_param) || > + range_p.size != sizeof(struct pr_madvise_param)) > + return -EINVAL; > + > + behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL); > + if (!behaviors) > + return -ENOMEM; > + > + ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem); > + if (ret < 0) > + goto free_behavior_vec; > + > + ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV, > + &iov_l, &iter); > + if (ret < 0) > + goto free_behavior_vec; > + > + if (!iov_iter_count(&iter)) { > + ret = -EINVAL; > + goto free_iovecs; > + } > + > + ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem, > + UIO_FASTIOV, iovstack_r, &iov_r); > + if (ret <= 0) > + goto free_iovecs; > > f = fdget(pidfd); > - if (!f.file) > - return -EBADF; 
> + if (!f.file) { > + ret = -EBADF; > + goto free_iovecs; > + } > > pid = pidfd_to_pid(f.file); > if (IS_ERR(pid)) { > ret = PTR_ERR(pid); > - goto err; > + goto put_fd; > } > > ret = -EINVAL; > @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > tsk = pid_task(pid, PIDTYPE_PID); > if (!tsk) { > rcu_read_unlock(); > - goto err; > + goto put_fd; > } > get_task_struct(tsk); > rcu_read_unlock(); > @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > if (ret == -EACCES) > ret = -EPERM; > - goto err; > + goto put_task; > } > - ret = madvise_core(tsk, start, len_in, behavior); > + > + ret = process_madvise_core(tsk, behaviors, &iter, iov_r, > + nr_elem, flags); > mmput(mm); > +put_task: > put_task_struct(tsk); > -err: > +put_fd: > fdput(f); > +free_iovecs: > + if (iov_r != iovstack_r) > + kfree(iov_r); > + kfree(iov_l); > +free_behavior_vec: > + kfree(behaviors); > + > return ret; > } > -- > 2.21.0.1020.gf2820cf01a-goog >
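Note that the example program in the changelog above has two inconsistencies: it declares hints[] but passes an undeclared behaviors array to syscall(), and cmd is zeroed but never filled with a /proc/<pid> path before the pidfd open(). Below is a corrected, compilable sketch, assuming the RFC's six-argument calling convention and a kernel carrying this patch set; __NR_process_madvise, MADV_COLD, and MADV_COOL exist only in these patches, not in released kernels.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>

#define ALLOC_SIZE (4096 * 1024)

struct pr_madvise_param {
	int size;			/* the size of this structure */
	const struct iovec *vec;	/* address range array */
};

int main(void)
{
	struct pr_madvise_param retp, rangep;
	struct iovec result_vec[2], range_vec[2];
	/* MADV_COLD/MADV_COOL come from this RFC's patched headers */
	int hints[2] = { MADV_COLD, MADV_COOL };
	long ret[2];
	void *addr[2];
	char path[64];
	pid_t pid;
	int i;

	for (i = 0; i < 2; i++) {
		addr[i] = mmap(NULL, ALLOC_SIZE, PROT_READ | PROT_WRITE,
			       MAP_POPULATE | MAP_PRIVATE | MAP_ANONYMOUS,
			       -1, 0);
		if (addr[i] == MAP_FAILED)
			return 1;
		range_vec[i].iov_base = addr[i];
		range_vec[i].iov_len = ALLOC_SIZE;
		result_vec[i].iov_base = &ret[i];
		result_vec[i].iov_len = sizeof(long);
	}
	retp.vec = result_vec;
	retp.size = sizeof(struct pr_madvise_param);
	rangep.vec = range_vec;
	rangep.size = sizeof(struct pr_madvise_param);

	pid = fork();
	if (!pid) {
		sleep(10);	/* child keeps the mappings alive */
		_exit(0);
	}

	/* in this RFC's timeframe a pidfd is an open /proc/<pid> directory */
	snprintf(path, sizeof(path), "/proc/%d", pid);
	int pidfd = open(path, O_DIRECTORY | O_CLOEXEC);
	if (pidfd < 0)
		return 1;

	/* munmap to make the pages private to the child */
	munmap(addr[0], ALLOC_SIZE);
	munmap(addr[1], ALLOC_SIZE);

	if (syscall(__NR_process_madvise, pidfd, 2, hints, &retp, &rangep, 0))
		perror("process_madvise");

	waitpid(pid, NULL, 0);
	return 0;
}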
On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > [Cc linux-api] > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > Currently, process_madvise syscall works for only one address range > > so user should call the syscall several times to give hints to > > multiple address range. > > Is that a problem? How big of a problem? Any numbers? We easily have 2000+ vma so it's not trivial overhead. I will come up with number in the description at respin. > > > This patch extends process_madvise syscall to support multiple > > hints, address ranges and return vaules so user could give hints > > all at once. > > > > struct pr_madvise_param { > > int size; /* the size of this structure */ > > const struct iovec __user *vec; /* address range array */ > > } > > > > int process_madvise(int pidfd, ssize_t nr_elem, > > int *behavior, > > struct pr_madvise_param *results, > > struct pr_madvise_param *ranges, > > unsigned long flags); > > > > - pidfd > > > > target process fd > > > > - nr_elem > > > > the number of elemenent of array behavior, results, ranges > > > > - behavior > > > > hints for each address range in remote process so that user could > > give different hints for each range. > > What is the guarantee of a single call? Do all hints get applied or the > first failure backs of? What are the atomicity guarantees? All hints will be tried even though one of them is failed. User will see the success or failure from the restuls parameter. For the single call, there is no guarantee of atomicity. > > > > > - results > > > > array of buffers to get results for associated remote address range > > action. > > > > - ranges > > > > array to buffers to have remote process's address ranges to be > > processed > > > > - flags > > > > extra argument for the future. It should be zero this moment. 
> > > > Example) > > > > struct pr_madvise_param { > > int size; > > const struct iovec *vec; > > }; > > > > int main(int argc, char *argv[]) > > { > > struct pr_madvise_param retp, rangep; > > struct iovec result_vec[2], range_vec[2]; > > int hints[2]; > > long ret[2]; > > void *addr[2]; > > > > pid_t pid; > > char cmd[64] = {0,}; > > addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, > > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); > > > > if (MAP_FAILED == addr[0]) > > return 1; > > > > addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE, > > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); > > > > if (MAP_FAILED == addr[1]) > > return 1; > > > > hints[0] = MADV_COLD; > > range_vec[0].iov_base = addr[0]; > > range_vec[0].iov_len = ALLOC_SIZE; > > result_vec[0].iov_base = &ret[0]; > > result_vec[0].iov_len = sizeof(long); > > retp.vec = result_vec; > > retp.size = sizeof(struct pr_madvise_param); > > > > hints[1] = MADV_COOL; > > range_vec[1].iov_base = addr[1]; > > range_vec[1].iov_len = ALLOC_SIZE; > > result_vec[1].iov_base = &ret[1]; > > result_vec[1].iov_len = sizeof(long); > > rangep.vec = range_vec; > > rangep.size = sizeof(struct pr_madvise_param); > > > > pid = fork(); > > if (!pid) { > > sleep(10); > > } else { > > int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC); > > if (pidfd < 0) > > return 1; > > > > /* munmap to make pages private for the child */ > > munmap(addr[0], ALLOC_SIZE); > > munmap(addr[1], ALLOC_SIZE); > > system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); > > if (syscall(__NR_process_madvise, pidfd, 2, behaviors, > > &retp, &rangep, 0)) > > perror("process_madvise fail\n"); > > system("cat /proc/vmstat | egrep 'pswpout|deactivate'"); > > } > > > > return 0; > > } > > > > Signed-off-by: Minchan Kim <minchan@kernel.org> > > --- > > include/uapi/asm-generic/mman-common.h | 5 + > > mm/madvise.c | 184 +++++++++++++++++++++---- > > 2 files changed, 166 insertions(+), 23 deletions(-) > > > > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h > > index b9b51eeb8e1a..b8e230de84a6 100644 > > --- a/include/uapi/asm-generic/mman-common.h > > +++ b/include/uapi/asm-generic/mman-common.h > > @@ -74,4 +74,9 @@ > > #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ > > PKEY_DISABLE_WRITE) > > > > +struct pr_madvise_param { > > + int size; /* the size of this structure */ > > + const struct iovec __user *vec; /* address range array */ > > +}; > > + > > #endif /* __ASM_GENERIC_MMAN_COMMON_H */ > > diff --git a/mm/madvise.c b/mm/madvise.c > > index af02aa17e5c1..f4f569dac2bd 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > > struct page *page; > > struct vm_area_struct *vma = walk->vma; > > unsigned long next; > > + long nr_pages = 0; > > > > next = pmd_addr_end(addr, end); > > if (pmd_trans_huge(*pmd)) { > > @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > > > > ptep_test_and_clear_young(vma, addr, pte); > > deactivate_page(page); > > + nr_pages++; > > + > > } > > > > pte_unmap_unlock(orig_pte, ptl); > > + *(long *)walk->private += nr_pages; > > cond_resched(); > > > > return 0; > > @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr, > > > > static void madvise_cool_page_range(struct mmu_gather *tlb, > > struct vm_area_struct *vma, > > - unsigned long addr, unsigned long end) > > + unsigned long addr, unsigned long end, > > + long *nr_pages) > > { > > struct 
mm_walk cool_walk = { > > .pmd_entry = madvise_cool_pte_range, > > .mm = vma->vm_mm, > > + .private = nr_pages > > }; > > > > tlb_start_vma(tlb, vma); > > @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb, > > } > > > > static long madvise_cool(struct vm_area_struct *vma, > > - unsigned long start_addr, unsigned long end_addr) > > + unsigned long start_addr, unsigned long end_addr, > > + long *nr_pages) > > { > > struct mm_struct *mm = vma->vm_mm; > > struct mmu_gather tlb; > > @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma, > > > > lru_add_drain(); > > tlb_gather_mmu(&tlb, mm, start_addr, end_addr); > > - madvise_cool_page_range(&tlb, vma, start_addr, end_addr); > > + madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages); > > tlb_finish_mmu(&tlb, start_addr, end_addr); > > > > return 0; > > @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > int isolated = 0; > > struct vm_area_struct *vma = walk->vma; > > unsigned long next; > > + long nr_pages = 0; > > > > next = pmd_addr_end(addr, end); > > if (pmd_trans_huge(*pmd)) { > > @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > list_add(&page->lru, &page_list); > > if (isolated >= SWAP_CLUSTER_MAX) { > > pte_unmap_unlock(orig_pte, ptl); > > - reclaim_pages(&page_list); > > + nr_pages += reclaim_pages(&page_list); > > isolated = 0; > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > orig_pte = pte; > > @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, > > } > > > > pte_unmap_unlock(orig_pte, ptl); > > - reclaim_pages(&page_list); > > + nr_pages += reclaim_pages(&page_list); > > cond_resched(); > > > > + *(long *)walk->private += nr_pages; > > return 0; > > } > > > > static void madvise_cold_page_range(struct mmu_gather *tlb, > > struct vm_area_struct *vma, > > - unsigned long addr, unsigned long end) > > + unsigned long addr, unsigned long end, > > + long *nr_pages) > > { > > struct mm_walk warm_walk = { > > .pmd_entry = madvise_cold_pte_range, > > .mm = vma->vm_mm, > > + .private = nr_pages, > > }; > > > > tlb_start_vma(tlb, vma); > > @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, > > > > > > static long madvise_cold(struct vm_area_struct *vma, > > - unsigned long start_addr, unsigned long end_addr) > > + unsigned long start_addr, unsigned long end_addr, > > + long *nr_pages) > > { > > struct mm_struct *mm = vma->vm_mm; > > struct mmu_gather tlb; > > @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma, > > > > lru_add_drain(); > > tlb_gather_mmu(&tlb, mm, start_addr, end_addr); > > - madvise_cold_page_range(&tlb, vma, start_addr, end_addr); > > + madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages); > > tlb_finish_mmu(&tlb, start_addr, end_addr); > > > > return 0; > > @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior, > > static long > > madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, > > struct vm_area_struct **prev, unsigned long start, > > - unsigned long end, int behavior) > > + unsigned long end, int behavior, long *nr_pages) > > { > > switch (behavior) { > > case MADV_REMOVE: > > @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma, > > case MADV_WILLNEED: > > return madvise_willneed(vma, prev, start, end); > > case MADV_COOL: > > - return madvise_cool(vma, start, end); > > + return madvise_cool(vma, start, end, 
nr_pages); > > case MADV_COLD: > > - return madvise_cold(vma, start, end); > > + return madvise_cold(vma, start, end, nr_pages); > > case MADV_FREE: > > case MADV_DONTNEED: > > return madvise_dontneed_free(tsk, vma, prev, start, > > @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior) > > } > > > > static int madvise_core(struct task_struct *tsk, unsigned long start, > > - size_t len_in, int behavior) > > + size_t len_in, int behavior, long *nr_pages) > > { > > unsigned long end, tmp; > > struct vm_area_struct *vma, *prev; > > @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > > > > if (start & ~PAGE_MASK) > > return error; > > + > > len = (len_in + ~PAGE_MASK) & PAGE_MASK; > > > > /* Check to see whether len was rounded up from small -ve to zero */ > > @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > > blk_start_plug(&plug); > > for (;;) { > > /* Still start < end. */ > > + long pages = 0; > > + > > error = -ENOMEM; > > if (!vma) > > goto out; > > @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > > tmp = end; > > > > /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ > > - error = madvise_vma(tsk, vma, &prev, start, tmp, behavior); > > + error = madvise_vma(tsk, vma, &prev, start, tmp, > > + behavior, &pages); > > if (error) > > goto out; > > + *nr_pages += pages; > > start = tmp; > > if (prev && start < prev->vm_end) > > start = prev->vm_end; > > @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start, > > */ > > SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > > { > > - return madvise_core(current, start, len_in, behavior); > > + unsigned long dummy; > > + > > + return madvise_core(current, start, len_in, behavior, &dummy); > > } > > > > -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > > - size_t, len_in, int, behavior) > > +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param, > > + struct pr_madvise_param *param) > > +{ > > + u32 size; > > + int ret; > > + > > + memset(param, 0, sizeof(*param)); > > + > > + ret = get_user(size, &u_param->size); > > + if (ret) > > + return ret; > > + > > + if (size > PAGE_SIZE) > > + return -E2BIG; > > + > > + if (!size || size > sizeof(struct pr_madvise_param)) > > + return -EINVAL; > > + > > + ret = copy_from_user(param, u_param, size); > > + if (ret) > > + return -EFAULT; > > + > > + return ret; > > +} > > + > > +static int process_madvise_core(struct task_struct *tsk, int *behaviors, > > + struct iov_iter *iter, > > + const struct iovec *range_vec, > > + unsigned long riovcnt, > > + unsigned long flags) > > +{ > > + int i; > > + long err; > > + > > + for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) { > > + long ret = 0; > > + > > + err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base, > > + range_vec[i].iov_len, behaviors[i], > > + &ret); > > + if (err) > > + ret = err; > > + > > + if (copy_to_iter(&ret, sizeof(long), iter) != > > + sizeof(long)) { > > + err = -EFAULT; > > + break; > > + } > > + > > + err = 0; > > + } > > + > > + return err; > > +} > > + > > +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem, > > + const int __user *, hints, > > + struct pr_madvise_param __user *, results, > > + struct pr_madvise_param __user *, ranges, > > + unsigned long, flags) > > { > > int ret; > > struct fd f; > > struct pid *pid; > > struct task_struct *tsk; > > struct 
mm_struct *mm; > > + struct pr_madvise_param result_p, range_p; > > + const struct iovec __user *result_vec, __user *range_vec; > > + int *behaviors; > > + struct iovec iovstack_result[UIO_FASTIOV]; > > + struct iovec iovstack_r[UIO_FASTIOV]; > > + struct iovec *iov_l = iovstack_result; > > + struct iovec *iov_r = iovstack_r; > > + struct iov_iter iter; > > + > > + if (flags != 0) > > + return -EINVAL; > > + > > + ret = pr_madvise_copy_param(results, &result_p); > > + if (ret) > > + return ret; > > + > > + ret = pr_madvise_copy_param(ranges, &range_p); > > + if (ret) > > + return ret; > > + > > + result_vec = result_p.vec; > > + range_vec = range_p.vec; > > + > > + if (result_p.size != sizeof(struct pr_madvise_param) || > > + range_p.size != sizeof(struct pr_madvise_param)) > > + return -EINVAL; > > + > > + behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL); > > + if (!behaviors) > > + return -ENOMEM; > > + > > + ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem); > > + if (ret < 0) > > + goto free_behavior_vec; > > + > > + ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV, > > + &iov_l, &iter); > > + if (ret < 0) > > + goto free_behavior_vec; > > + > > + if (!iov_iter_count(&iter)) { > > + ret = -EINVAL; > > + goto free_iovecs; > > + } > > + > > + ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem, > > + UIO_FASTIOV, iovstack_r, &iov_r); > > + if (ret <= 0) > > + goto free_iovecs; > > > > f = fdget(pidfd); > > - if (!f.file) > > - return -EBADF; > > + if (!f.file) { > > + ret = -EBADF; > > + goto free_iovecs; > > + } > > > > pid = pidfd_to_pid(f.file); > > if (IS_ERR(pid)) { > > ret = PTR_ERR(pid); > > - goto err; > > + goto put_fd; > > } > > > > ret = -EINVAL; > > @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > > tsk = pid_task(pid, PIDTYPE_PID); > > if (!tsk) { > > rcu_read_unlock(); > > - goto err; > > + goto put_fd; > > } > > get_task_struct(tsk); > > rcu_read_unlock(); > > @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start, > > ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > > if (ret == -EACCES) > > ret = -EPERM; > > - goto err; > > + goto put_task; > > } > > - ret = madvise_core(tsk, start, len_in, behavior); > > + > > + ret = process_madvise_core(tsk, behaviors, &iter, iov_r, > > + nr_elem, flags); > > mmput(mm); > > +put_task: > > put_task_struct(tsk); > > -err: > > +put_fd: > > fdput(f); > > +free_iovecs: > > + if (iov_r != iovstack_r) > > + kfree(iov_r); > > + kfree(iov_l); > > +free_behavior_vec: > > + kfree(behaviors); > > + > > return ret; > > } > > -- > > 2.21.0.1020.gf2820cf01a-goog > > > > -- > Michal Hocko > SUSE Labs
On Tue 21-05-19 11:48:20, Minchan Kim wrote: > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > [Cc linux-api] > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > Currently, process_madvise syscall works for only one address range > > > so user should call the syscall several times to give hints to > > > multiple address range. > > > > Is that a problem? How big of a problem? Any numbers? > > We easily have 2000+ vma so it's not trivial overhead. I will come up > with number in the description at respin. Does this really have to be a fast operation? I would expect the monitor is by no means a fast path. The system call overhead is not what it used to be, sigh, but still for something that is not a hot path it should be tolerable, especially when the whole operation is quite expensive on its own (wrt. the syscall entry/exit). I am not saying we do not need a multiplexing API, I am just not sure we need it right away. Btw. there was some demand for other MM syscalls to provide a multiplexing API (e.g. mprotect), maybe it would be better to handle those in one go?
On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > [Cc linux-api]
> > >
> > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > Currently, process_madvise syscall works for only one address range
> > > > so user should call the syscall several times to give hints to
> > > > multiple address range.
> > >
> > > Is that a problem? How big of a problem? Any numbers?
> >
> > We easily have 2000+ vma so it's not trivial overhead. I will come up
> > with number in the description at respin.
>
> Does this really have to be a fast operation? I would expect the monitor
> is by no means a fast path. The system call overhead is not what it used
> to be, sigh, but still for something that is not a hot path it should be
> tolerable, especially when the whole operation is quite expensive on its
> own (wrt. the syscall entry/exit).

What's different with process_vm_[readv|writev] and vmsplice? If the range that needs to be covered is large, a vector operation makes sense to me.

> I am not saying we do not need a multiplexing API, I am just not sure
> we need it right away. Btw. there was some demand for other MM syscalls
> to provide a multiplexing API (e.g. mprotect), maybe it would be better
> to handle those in one go?

That's exactly what Daniel Colascione suggested in internal review. That would be an interesting approach if we could aggregate all of those system calls in one go.

> --
> Michal Hocko
> SUSE Labs
On Tue 21-05-19 19:26:13, Minchan Kim wrote: > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > [Cc linux-api] > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > Currently, process_madvise syscall works for only one address range > > > > > so user should call the syscall several times to give hints to > > > > > multiple address range. > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > with number in the description at respin. > > > > Does this really have to be a fast operation? I would expect the monitor > > is by no means a fast path. The system call overhead is not what it used > > to be, sigh, but still for something that is not a hot path it should be > > tolerable, especially when the whole operation is quite expensive on its > > own (wrt. the syscall entry/exit). > > What's different with process_vm_[readv|writev] and vmsplice? > If the range needed to be covered is a lot, vector operation makes senese > to me. I am not saying that the vector API is wrong. All I am trying to say is that the benefit is not really clear so far. If you want to push it through then you should better get some supporting data.
On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote:
> On Tue 21-05-19 19:26:13, Minchan Kim wrote:
> > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> > > On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > > > [Cc linux-api]
> > > > >
> > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > > > Currently, process_madvise syscall works for only one address range
> > > > > > so user should call the syscall several times to give hints to
> > > > > > multiple address range.
> > > > >
> > > > > Is that a problem? How big of a problem? Any numbers?
> > > >
> > > > We easily have 2000+ vma so it's not trivial overhead. I will come up
> > > > with number in the description at respin.
> > >
> > > Does this really have to be a fast operation? I would expect the monitor
> > > is by no means a fast path. The system call overhead is not what it used
> > > to be, sigh, but still for something that is not a hot path it should be
> > > tolerable, especially when the whole operation is quite expensive on its
> > > own (wrt. the syscall entry/exit).
> >
> > What's different with process_vm_[readv|writev] and vmsplice?
> > If the range that needs to be covered is large, a vector operation makes
> > sense to me.
>
> I am not saying that the vector API is wrong. All I am trying to say is
> that the benefit is not really clear so far. If you want to push it
> through then you should better get some supporting data.

I measured 1000 madvise syscalls vs. a single vectored syscall with 1000 ranges on a modern ARM64 device. Even though I saw a 15% improvement, the absolute gain is just 1ms, so I don't think it's worth supporting. I will drop vector support in the next revision.

Thanks for the review, Michal!
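For reference, a minimal sketch of this kind of measurement: it times the N-syscall loop described above. Plain madvise(2) on the caller's own mappings with MADV_DONTNEED is used as a stand-in, since the RFC's new hints do not exist in released kernels, and the timing only exercises syscall entry/exit plus the per-call locking.

#include <stdio.h>
#include <sys/mman.h>
#include <time.h>

#define NR_RANGES 1000	/* mirrors the 1000-range experiment */

static double elapsed_ms(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) * 1e3 + (b.tv_nsec - a.tv_nsec) / 1e6;
}

int main(void)
{
	static void *base[NR_RANGES];
	struct timespec t0, t1;
	int i;

	for (i = 0; i < NR_RANGES; i++)
		base[i] = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NR_RANGES; i++)
		/* MADV_DONTNEED as a stand-in for the RFC-only hints */
		madvise(base[i], 4096, MADV_DONTNEED);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%d single-range calls: %.3f ms\n",
	       NR_RANGES, elapsed_ms(t0, t1));

	/* The vectored process_madvise() discussed above would replace
	 * this loop with one syscall carrying NR_RANGES iovecs. */
	return 0;
}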
On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > [Cc linux-api] > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > so user should call the syscall several times to give hints to > > > > > > > multiple address range. > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > with number in the description at respin. > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > is by no means a fast path. The system call overhead is not what it used > > > > to be, sigh, but still for something that is not a hot path it should be > > > > tolerable, especially when the whole operation is quite expensive on its > > > > own (wrt. the syscall entry/exit). > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > If the range needed to be covered is a lot, vector operation makes senese > > > to me. > > > > I am not saying that the vector API is wrong. All I am trying to say is > > that the benefit is not really clear so far. If you want to push it > > through then you should better get some supporting data. > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > ranges on ARM64 mordern device. Even though I saw 15% improvement but > absoluate gain is just 1ms so I don't think it's worth to support. > I will drop vector support at next revision. Please do keep the vector support. Absolute timing is misleading, since in a tight loop, you're not going to contend on mmap_sem. We've seen tons of improvements in things like camera start come from coalescing mprotect calls, with the gains coming from taking and releasing various locks a lot less often and bouncing around less on the contended lock paths. Raw throughput doesn't tell the whole story, especially on mobile.
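A sketch of the coalescing pattern being described here, under the assumption that the caller holds a list of ranges sorted by address: merge runs that are contiguous and share the same protection before entering the kernel, so mmap_sem is taken and dropped once per merged run instead of once per range. The struct range and helper are illustrative, not Android's actual code.

#include <stddef.h>
#include <sys/mman.h>

struct range {
	void *base;
	size_t len;
	int prot;
};

/*
 * Apply protections over an array of ranges sorted by address, merging
 * runs that are adjacent and share the same prot so the kernel takes
 * the lock once per merged run.
 */
int mprotect_coalesced(const struct range *r, size_t n)
{
	size_t i = 0;

	while (i < n) {
		char *start = r[i].base;
		char *end = start + r[i].len;
		int prot = r[i].prot;
		size_t j = i + 1;

		while (j < n && r[j].prot == prot &&
		       (char *)r[j].base == end) {
			end += r[j].len;
			j++;
		}
		if (mprotect(start, end - start, prot))
			return -1;
		i = j;
	}
	return 0;
}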
On Wed 29-05-19 03:08:32, Daniel Colascione wrote: > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > > [Cc linux-api] > > > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > > so user should call the syscall several times to give hints to > > > > > > > > multiple address range. > > > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > > with number in the description at respin. > > > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > > is by no means a fast path. The system call overhead is not what it used > > > > > to be, sigh, but still for something that is not a hot path it should be > > > > > tolerable, especially when the whole operation is quite expensive on its > > > > > own (wrt. the syscall entry/exit). > > > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > > If the range needed to be covered is a lot, vector operation makes senese > > > > to me. > > > > > > I am not saying that the vector API is wrong. All I am trying to say is > > > that the benefit is not really clear so far. If you want to push it > > > through then you should better get some supporting data. > > > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > > ranges on ARM64 mordern device. Even though I saw 15% improvement but > > absoluate gain is just 1ms so I don't think it's worth to support. > > I will drop vector support at next revision. > > Please do keep the vector support. Absolute timing is misleading, > since in a tight loop, you're not going to contend on mmap_sem. We've > seen tons of improvements in things like camera start come from > coalescing mprotect calls, with the gains coming from taking and > releasing various locks a lot less often and bouncing around less on > the contended lock paths. Raw throughput doesn't tell the whole story, > especially on mobile. This will always be a double edge sword. Taking a lock for longer can improve a throughput of a single call but it would make a latency for anybody contending on the lock much worse. Besides that, please do not overcomplicate the thing from the early beginning please. Let's start with a simple and well defined remote madvise alternative first and build a vector API on top with some numbers based on _real_ workloads.
On Wed, May 29, 2019 at 12:14:47PM +0800, Hillf Danton wrote:
> On Mon, 20 May 2019 12:52:53 +0900 Minchan Kim wrote:
> > Example)
>
> Better if the following stuff is stored somewhere under the
> tools/testing directory.

Sure, I will do that once we get past the RFC stage.
On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote: > On Wed 29-05-19 03:08:32, Daniel Colascione wrote: > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > > > [Cc linux-api] > > > > > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > > > so user should call the syscall several times to give hints to > > > > > > > > > multiple address range. > > > > > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > > > with number in the description at respin. > > > > > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > > > is by no means a fast path. The system call overhead is not what it used > > > > > > to be, sigh, but still for something that is not a hot path it should be > > > > > > tolerable, especially when the whole operation is quite expensive on its > > > > > > own (wrt. the syscall entry/exit). > > > > > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > > > If the range needed to be covered is a lot, vector operation makes senese > > > > > to me. > > > > > > > > I am not saying that the vector API is wrong. All I am trying to say is > > > > that the benefit is not really clear so far. If you want to push it > > > > through then you should better get some supporting data. > > > > > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > > > ranges on ARM64 mordern device. Even though I saw 15% improvement but > > > absoluate gain is just 1ms so I don't think it's worth to support. > > > I will drop vector support at next revision. > > > > Please do keep the vector support. Absolute timing is misleading, > > since in a tight loop, you're not going to contend on mmap_sem. We've > > seen tons of improvements in things like camera start come from > > coalescing mprotect calls, with the gains coming from taking and > > releasing various locks a lot less often and bouncing around less on > > the contended lock paths. Raw throughput doesn't tell the whole story, > > especially on mobile. > > This will always be a double edge sword. Taking a lock for longer can > improve a throughput of a single call but it would make a latency for > anybody contending on the lock much worse. > > Besides that, please do not overcomplicate the thing from the early > beginning please. Let's start with a simple and well defined remote > madvise alternative first and build a vector API on top with some > numbers based on _real_ workloads. First time, I didn't think about atomicity about address range race because MADV_COLD/PAGEOUT is not critical for the race. However you raised the atomicity issue because people would extend hints to destructive ones easily. I agree with that and that's why we discussed how to guarantee the race and Daniel comes up with good idea. - vma configuration seq number via process_getinfo(2). 
We discussed the race issue without _real_ workloads/requests because it's common sense that people might extend the syscall later.

The same applies here. For current workloads, we don't need vector support from a performance point of view based on my experiment. However, it's a rather limited experiment. Some configurations might have 10000+ vmas or a really slow CPU.

Furthermore, I want to have vector support because of the atomicity issue, if it's really one we should consider. With vector support in the API and the vma configuration sequence number from Daniel, we could make operations on multiple address ranges atomic. However, since we are not introducing a vector at this moment, we would need to introduce *another syscall* later to handle multiple ranges all at once atomically, if that's okay.

Other thought: maybe we could extend the address-range batch syscall to cover other MM syscalls like mmap/munmap/madvise/mprotect, since there are multiple users that would benefit from a general batching mechanism.
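To make the batching thought above concrete, here is a purely hypothetical sketch of what a general batch descriptor could look like. No such UAPI exists; every name below is invented for illustration.

#include <stdint.h>

enum mm_batch_op {
	MM_BATCH_MADVISE,	/* arg = madvise behavior */
	MM_BATCH_MPROTECT,	/* arg = prot bits */
	MM_BATCH_MUNMAP,	/* arg unused */
};

struct mm_batch_entry {
	uint32_t op;		/* enum mm_batch_op */
	uint32_t arg;
	uint64_t start;		/* page-aligned range start */
	uint64_t len;		/* range length in bytes */
	int64_t  result;	/* per-entry result, filled in by the kernel */
};

/*
 * Hypothetical entry point: apply nr entries to the target's address
 * space in one trip into the kernel, writing a per-entry result so one
 * failing entry does not hide the others.
 *
 * int process_mm_batch(int pidfd, struct mm_batch_entry *entries,
 *                      size_t nr, unsigned long flags);
 */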
On Thu 30-05-19 11:17:48, Minchan Kim wrote: > On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote: > > On Wed 29-05-19 03:08:32, Daniel Colascione wrote: > > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > > > > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > > > > [Cc linux-api] > > > > > > > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > > > > so user should call the syscall several times to give hints to > > > > > > > > > > multiple address range. > > > > > > > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > > > > with number in the description at respin. > > > > > > > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > > > > is by no means a fast path. The system call overhead is not what it used > > > > > > > to be, sigh, but still for something that is not a hot path it should be > > > > > > > tolerable, especially when the whole operation is quite expensive on its > > > > > > > own (wrt. the syscall entry/exit). > > > > > > > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > > > > If the range needed to be covered is a lot, vector operation makes senese > > > > > > to me. > > > > > > > > > > I am not saying that the vector API is wrong. All I am trying to say is > > > > > that the benefit is not really clear so far. If you want to push it > > > > > through then you should better get some supporting data. > > > > > > > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > > > > ranges on ARM64 mordern device. Even though I saw 15% improvement but > > > > absoluate gain is just 1ms so I don't think it's worth to support. > > > > I will drop vector support at next revision. > > > > > > Please do keep the vector support. Absolute timing is misleading, > > > since in a tight loop, you're not going to contend on mmap_sem. We've > > > seen tons of improvements in things like camera start come from > > > coalescing mprotect calls, with the gains coming from taking and > > > releasing various locks a lot less often and bouncing around less on > > > the contended lock paths. Raw throughput doesn't tell the whole story, > > > especially on mobile. > > > > This will always be a double edge sword. Taking a lock for longer can > > improve a throughput of a single call but it would make a latency for > > anybody contending on the lock much worse. > > > > Besides that, please do not overcomplicate the thing from the early > > beginning please. Let's start with a simple and well defined remote > > madvise alternative first and build a vector API on top with some > > numbers based on _real_ workloads. > > First time, I didn't think about atomicity about address range race > because MADV_COLD/PAGEOUT is not critical for the race. > However you raised the atomicity issue because people would extend > hints to destructive ones easily. 
> I agree with that and that's why
> we discussed how to guarantee the race and Daniel came up with a good idea.

Just for clarification, I didn't really mean atomicity but rather _consistency_ (essentially time-of-check-to-time-of-use consistency).

> - vma configuration seq number via process_getinfo(2).
>
> We discussed the race issue without _real_ workloads/requests because
> it's common sense that people might extend the syscall later.
>
> The same applies here. For current workloads, we don't need vector support
> from a performance point of view based on my experiment. However, it's
> a rather limited experiment. Some configurations might have 10000+ vmas
> or a really slow CPU.
>
> Furthermore, I want to have vector support because of the atomicity issue,
> if it's really one we should consider.
> With vector support in the API and the vma configuration sequence number
> from Daniel, we could make operations on multiple address ranges atomic.

I am not sure what you mean here. Perform all ranges atomically wrt. other address space modifications? If yes, I am not sure we want that semantic because it can cause really long stalls for other operations, but that is a discussion on its own and I would rather focus on a simple interface first.

> However, since we are not introducing a vector at this moment, we would
> need to introduce *another syscall* later to handle multiple ranges
> all at once atomically, if that's okay.

Agreed.

> Other thought:
> maybe we could extend the address-range batch syscall to cover other MM
> syscalls like mmap/munmap/madvise/mprotect, since there are multiple
> users that would benefit from a general batching mechanism.

Again a discussion on its own ;)
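A sketch of where that check-to-use gap sits in a monitor, with the per-mm sequence counter discussed in this thread stubbed out as a hypothetical helper; read_vm_seq() and apply_hints() are invented names, not real interfaces.

/* read_vm_seq() stands in for a per-mm generation counter the kernel
 * would bump on every vma modification; apply_hints() stands in for the
 * proposed process_madvise() call. */
extern long read_vm_seq(int pidfd);
extern int apply_hints(int pidfd);

/*
 * Time-of-check: snapshot the target's layout (e.g. /proc/<pid>/maps).
 * Time-of-use: apply the hints. The gap between the two is the
 * consistency problem: the target may remap in between, so a hint meant
 * for a heap range could land on memory that now backs something else.
 */
int hint_consistently(int pidfd)
{
	long seq = read_vm_seq(pidfd);		/* check */

	/* ... parse /proc/<pid>/maps, build the range vector ... */

	if (read_vm_seq(pidfd) != seq)		/* layout changed under us */
		return -1;			/* re-snapshot and retry */

	/*
	 * A userspace recheck still leaves a small window before the
	 * syscall lands, which is why the thread discusses validating
	 * the sequence number inside the kernel call itself.
	 */
	return apply_hints(pidfd);		/* use */
}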
On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote: > On Thu 30-05-19 11:17:48, Minchan Kim wrote: > > On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote: > > > On Wed 29-05-19 03:08:32, Daniel Colascione wrote: > > > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > > > > > > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > > > > > [Cc linux-api] > > > > > > > > > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > > > > > so user should call the syscall several times to give hints to > > > > > > > > > > > multiple address range. > > > > > > > > > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > > > > > with number in the description at respin. > > > > > > > > > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > > > > > is by no means a fast path. The system call overhead is not what it used > > > > > > > > to be, sigh, but still for something that is not a hot path it should be > > > > > > > > tolerable, especially when the whole operation is quite expensive on its > > > > > > > > own (wrt. the syscall entry/exit). > > > > > > > > > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > > > > > If the range needed to be covered is a lot, vector operation makes senese > > > > > > > to me. > > > > > > > > > > > > I am not saying that the vector API is wrong. All I am trying to say is > > > > > > that the benefit is not really clear so far. If you want to push it > > > > > > through then you should better get some supporting data. > > > > > > > > > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > > > > > ranges on ARM64 mordern device. Even though I saw 15% improvement but > > > > > absoluate gain is just 1ms so I don't think it's worth to support. > > > > > I will drop vector support at next revision. > > > > > > > > Please do keep the vector support. Absolute timing is misleading, > > > > since in a tight loop, you're not going to contend on mmap_sem. We've > > > > seen tons of improvements in things like camera start come from > > > > coalescing mprotect calls, with the gains coming from taking and > > > > releasing various locks a lot less often and bouncing around less on > > > > the contended lock paths. Raw throughput doesn't tell the whole story, > > > > especially on mobile. > > > > > > This will always be a double edge sword. Taking a lock for longer can > > > improve a throughput of a single call but it would make a latency for > > > anybody contending on the lock much worse. > > > > > > Besides that, please do not overcomplicate the thing from the early > > > beginning please. Let's start with a simple and well defined remote > > > madvise alternative first and build a vector API on top with some > > > numbers based on _real_ workloads. > > > > First time, I didn't think about atomicity about address range race > > because MADV_COLD/PAGEOUT is not critical for the race. 
> > However you raised the atomicity issue because people would extend
> > hints to destructive ones easily. I agree with that and that's why
> > we discussed how to guarantee the race and Daniel came up with a good idea.
>
> Just for clarification, I didn't really mean atomicity but rather
> _consistency_ (essentially time-of-check-to-time-of-use consistency).

What do you mean by *consistency*? Could you elaborate on it a bit more?

> > - vma configuration seq number via process_getinfo(2).
> >
> > We discussed the race issue without _real_ workloads/requests because
> > it's common sense that people might extend the syscall later.
> >
> > The same applies here. For current workloads, we don't need vector support
> > from a performance point of view based on my experiment. However, it's
> > a rather limited experiment. Some configurations might have 10000+ vmas
> > or a really slow CPU.
> >
> > Furthermore, I want to have vector support because of the atomicity issue,
> > if it's really one we should consider.
> > With vector support in the API and the vma configuration sequence number
> > from Daniel, we could make operations on multiple address ranges atomic.
>
> I am not sure what you mean here. Perform all ranges atomically wrt.
> other address space modifications? If yes, I am not sure we want that

Yup, I think it's *necessary* if we want to support destructive hints via process_madvise.

> semantic because it can cause really long stalls for other operations

It could go either way. For example, if we could multiplex several operations so that we enumerate the page tables only once, it could be more efficient than doing a separate page-table walk for each syscall.

> but that is a discussion on its own and I would rather focus on a simple
> interface first.

It seems it's time to send RFCv2 since we have discussed a lot, although we don't have a clear conclusion yet. But I still want to understand what you meant by _consistency_.

Thanks for the review, Michal! It's very helpful.

> > However, since we are not introducing a vector at this moment, we would
> > need to introduce *another syscall* later to handle multiple ranges
> > all at once atomically, if that's okay.
>
> Agreed.
>
> > Other thought:
> > maybe we could extend the address-range batch syscall to cover other MM
> > syscalls like mmap/munmap/madvise/mprotect, since there are multiple
> > users that would benefit from a general batching mechanism.
>
> Again a discussion on its own ;)
>
> --
> Michal Hocko
> SUSE Labs
On Thu, May 30, 2019 at 1:02 AM Minchan Kim <minchan@kernel.org> wrote: > > On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote: > > On Thu 30-05-19 11:17:48, Minchan Kim wrote: > > > On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote: > > > > On Wed 29-05-19 03:08:32, Daniel Colascione wrote: > > > > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote: > > > > > > > > > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote: > > > > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote: > > > > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote: > > > > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote: > > > > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > > > > > > > > > > [Cc linux-api] > > > > > > > > > > > > > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > > > > > > > > > > Currently, process_madvise syscall works for only one address range > > > > > > > > > > > > so user should call the syscall several times to give hints to > > > > > > > > > > > > multiple address range. > > > > > > > > > > > > > > > > > > > > > > Is that a problem? How big of a problem? Any numbers? > > > > > > > > > > > > > > > > > > > > We easily have 2000+ vma so it's not trivial overhead. I will come up > > > > > > > > > > with number in the description at respin. > > > > > > > > > > > > > > > > > > Does this really have to be a fast operation? I would expect the monitor > > > > > > > > > is by no means a fast path. The system call overhead is not what it used > > > > > > > > > to be, sigh, but still for something that is not a hot path it should be > > > > > > > > > tolerable, especially when the whole operation is quite expensive on its > > > > > > > > > own (wrt. the syscall entry/exit). > > > > > > > > > > > > > > > > What's different with process_vm_[readv|writev] and vmsplice? > > > > > > > > If the range needed to be covered is a lot, vector operation makes senese > > > > > > > > to me. > > > > > > > > > > > > > > I am not saying that the vector API is wrong. All I am trying to say is > > > > > > > that the benefit is not really clear so far. If you want to push it > > > > > > > through then you should better get some supporting data. > > > > > > > > > > > > I measured 1000 madvise syscall vs. a vector range syscall with 1000 > > > > > > ranges on ARM64 mordern device. Even though I saw 15% improvement but > > > > > > absoluate gain is just 1ms so I don't think it's worth to support. > > > > > > I will drop vector support at next revision. > > > > > > > > > > Please do keep the vector support. Absolute timing is misleading, > > > > > since in a tight loop, you're not going to contend on mmap_sem. We've > > > > > seen tons of improvements in things like camera start come from > > > > > coalescing mprotect calls, with the gains coming from taking and > > > > > releasing various locks a lot less often and bouncing around less on > > > > > the contended lock paths. Raw throughput doesn't tell the whole story, > > > > > especially on mobile. > > > > > > > > This will always be a double edge sword. Taking a lock for longer can > > > > improve a throughput of a single call but it would make a latency for > > > > anybody contending on the lock much worse. > > > > > > > > Besides that, please do not overcomplicate the thing from the early > > > > beginning please. 
> > > > beginning please. Let's start with a simple and well defined remote
> > > > madvise alternative first and build a vector API on top with some
> > > > numbers based on _real_ workloads.
> > >
> > > The first time, I didn't think about atomicity for the address range
> > > race because MADV_COLD/PAGEOUT is not critical with respect to it.
> > > However, you raised the atomicity issue because people would extend
> > > hints to destructive ones easily. I agree with that, and that's why
> > > we discussed how to guard against the race, and Daniel came up with
> > > a good idea.
> >
> > Just for the clarification, I didn't really mean atomicity but rather
> > _consistency_ (essentially time-of-check to time-of-use consistency).
>
> What do you mean by *consistency*? Could you elaborate on it more?
>
> > > - vma configuration seq number via process_getinfo(2).
> > >
> > > We discussed the race issue without _read_ workloads/requests because
> > > it's common sense that people might extend the syscall later.
> > >
> > > Here is the same. For the current workload, we don't need to support
> > > vectors from a performance point of view, based on my experiment.
> > > However, it's a rather limited experiment. Some configurations might
> > > have 10000+ vmas or really slow CPUs.
> > >
> > > Furthermore, I want to have vector support due to the atomicity issue
> > > if it's really the one we should consider.
> > > With vector support in the API and the vma configuration sequence
> > > number from Daniel, we could make address range operations atomic.
> >
> > I am not sure what you mean here. Perform all ranges atomically wrt.
> > other address space modifications? If yes, I am not sure we want that
>
> Yup, I think it's *necessary* if we want to support destructive hints
> via process_madvise.

[Puts on flame-proof suit]

Here's a quick sketch of what I have in mind for process_getinfo(2).
Keep in mind that it's still just a rough idea.

We've had trouble efficiently learning about the process and memory
state of the system via procfs. Android background memory-use scans (the
android.bg daemon) consume quite a bit of CPU time computing PSS;
Minchan's done a lot of thinking about how we can specify desired page
sets for compaction as part of this patch set; and the full procfs walks
that some trace collection tools need to undertake take more than 200ms
(sometimes much more), due mostly to procfs iteration. ISTM we can do
better.

While procfs *works* on a functional level, it's inefficient due to the
splatting of the information we want across several different files
(which need to be independently opened --- e.g., /proc/pid/oom_score_adj
*and* /proc/pid/status), inefficient due to the ad-hoc text formatting,
inefficient due to information over-fetch, and cumbersome due to the
fundamental impedance mismatch between filesystem APIs and process
lifetimes. procfs itself is also optional, which has caused various bits
of awkwardness that you'll recall from the pidfd discussions.

How about we solve the problem once and for all? I'm imagining a new
process_getinfo(2) that solves all of these problems at the same time. I
want something with a few properties:

1) if we want to learn M facts about N things, we enter the kernel once
   and learn all M*N things,

2) the information we collect is self-consistent (which implies
   atomicity most of the time),

3) we retrieve the information we want in an efficient binary format,
   and

4) we don't pay to learn anything not in M.

I've jotted down a quick sketch of the API below; I'm curious what
everyone else thinks.
It'd basically look like this:

int process_getinfo(int nr_proc, int *proc, int flags, unsigned long mask,
                    void *out_buf, size_t *inout_sz)

We wouldn't use the return value for much: 0 on success and -1 on error
with errno set.

NR_PROC and PROC together would specify the objects we want to learn
about, which would be either PIDs or PIDFDs, or maybe nothing at all if
FLAGS tells us to inspect every process on the system. MASK is a bitset
of facts we want to learn, described below. OUT_BUF and INOUT_SZ are for
actually communicating the result. On input, the caller would fill
*INOUT_SZ with the size of the buffer to which OUT_BUF points; on
success, we'd fill *INOUT_SZ with the number of bytes we actually used.
If the output buffer is too small, we'll truncate the result, fail the
system call with E2BIG, and fill *INOUT_SZ with the number of needed
bytes, inviting the caller to try again. (If a caller supplies something
huge like a reusable 512KB buffer on the first call, no reallocation and
retries will be necessary in practice on a typically-sized system.)

The actual returned buffer is a collection of structures and data blocks
starting with a struct process_info. The structures in the returned
buffer sometimes contain "pointers" to other structures encoded as byte
offsets from the start of the information buffer. Using offsets instead
of actual pointers keeps the format the same across 32- and 64-bit
versions of process_getinfo and makes it possible to relocate the
returned information buffer with memcpy.

struct process_info {
  int first_procrec_offset;  // struct procrec*
  // Examples of system-wide things we could ask for
  int mem_total_kb;
  int mem_free_kb;
  int mem_available_kb;
  char reserved[];
};

struct procrec {
  int next_procrec_offset;  // struct procrec*
  // The following fields are self-explanatory and are only examples of
  // the kind of information we could provide.
  int tid;
  int tgid;
  char status;
  int oom_score_adj;
  struct { int real, effective, saved, fs; } uids;
  int prio;
  int comm_offset;  // char*
  uint64_t rss_file_kb;
  uint64_t rss_anon_kb;
  uint64_t vm_seq;
  int first_procrec_vma_offset;  // struct procrec_vma*
  char reserved[];
};

struct procrec_vma {
  int next_procrec_vma_offset;  // struct procrec_vma*
  unsigned long start;
  unsigned long end;
  int backing_file_name_offset;  // char*
  int prot;
  char reserved[];
};

Callers would use the returned buffer by casting it to a struct
process_info and following the internal "pointers". MASK would specify
which bits of information we wanted: for example, if we asked for
PROCESS_VM_MEMORY_MAP, the kernel would fill in each struct procrec's
first_procrec_vma_offset field and have it point to a struct procrec_vma
in the returned output buffer. If we omitted PROCESS_VM_MEMORY_MAP, we'd
leave that field as NULL (encoded as offset zero). The kernel would
embed any strings (like comm and VMA names) into the output buffer; the
precise locations would be unspecified so long as callers could find
these fields via output-buffer pointers.

Because all the structures are variable-length and are chained together
with explicit pointers (or offsets) instead of being stuffed into a
literal array, we can add additional fields to the output structures any
time we want without breaking binary compatibility. Callers would tell
the kernel that they're interested in the added struct fields by asking
for them via bits in MASK, and kernels that don't understand those
fields would just fail the system call with EINVAL or something.
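To see how the offset encoding would be consumed, here is a hypothetical
caller-side walk over the proposed layout. process_getinfo(2) is only a
sketch, and OFF_TO_PTR is an assumed helper, not part of any existing
interface; the struct names and fields come from the sketch above.

#include <stdio.h>

/* Turn a buffer-relative offset into a pointer; offset 0 means NULL. */
#define OFF_TO_PTR(buf, off, type) \
	((off) ? (type *)((char *)(buf) + (off)) : (type *)0)

static void dump_processes(struct process_info *info)
{
	struct procrec *p = OFF_TO_PTR(info, info->first_procrec_offset,
				       struct procrec);

	for (; p; p = OFF_TO_PTR(info, p->next_procrec_offset,
				 struct procrec)) {
		char *comm = OFF_TO_PTR(info, p->comm_offset, char);
		struct procrec_vma *v = OFF_TO_PTR(info,
				p->first_procrec_vma_offset,
				struct procrec_vma);

		printf("%d (%s) oom_score_adj=%d\n",
		       p->tid, comm ? comm : "?", p->oom_score_adj);

		/* Only present if PROCESS_VM_MEMORY_MAP was in MASK. */
		for (; v; v = OFF_TO_PTR(info, v->next_procrec_vma_offset,
					 struct procrec_vma))
			printf("  %lx-%lx prot=%x\n", v->start, v->end,
			       v->prot);
	}
}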
Depending on how we call it, we can use this API as a bunch of different
things:

1) Quick retrieval of system-wide memory counters, like /proc/meminfo

2) Quick snapshot of all process-thread identities (asking for the
   wildcard TID match via FLAGS)

3) Fast enumeration of one process's address space

4) Collecting process-summary VM counters (e.g., rss_anon and rss_file)
   for a set of processes

5) Retrieval of every VMA of every process on the system for debugging

We can do all of this with one entry into the kernel and without opening
any new file descriptors (unless we want to use pidfds as inputs). We
can also make this operation as atomic as we want, e.g., taking mmap_sem
while looking at each process and taking tasklist_lock so all the thread
IDs line up with their processes. We don't necessarily need to take the
mmap_sems of all processes we care about at the same time. Since this
isn't a filesystem-based API, we don't have to deal with seq_file or
deal with consistency issues arising from userspace programs doing
strange things like reading procfs files very slowly in small chunks.

Security-wise, we'd just use different access checks for different
requested information bits in MASK, maybe supplying "no access" struct
procrec entries if a caller doesn't happen to have access to a
particular process. I suppose we can talk about whether access check
failures should result in dummy values or syscall failure: maybe callers
should select which behavior they want.

Format-wise, we could also just return flatbuffer messages from the
kernel, but I suspect that we don't want flatbuffer in the kernel right
now. :-)

The API I'm proposing accepts an array of processes to inspect. We could
simplify it by accepting just one process and making the caller enter
the kernel once per process it wants to learn about, but this
simplification would make the API less useful for answering questions
like "What's the RSS of every process on the system right now?".

What do you think?
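For completeness, a hypothetical caller handling the proposed
*INOUT_SZ/E2BIG contract might look like the following. The
process_getinfo() prototype is copied from the sketch above and does not
exist in any kernel; everything here is an assumption about the proposal.

#include <errno.h>
#include <stdlib.h>

/* Hypothetical prototype, taken verbatim from the sketch above. */
int process_getinfo(int nr_proc, int *proc, int flags, unsigned long mask,
		    void *out_buf, size_t *inout_sz);

static struct process_info *getinfo_alloc(int nr_proc, int *proc, int flags,
					  unsigned long mask, size_t *used)
{
	size_t sz = 512 * 1024;	/* the "reusable 512KB buffer" case */
	void *buf = malloc(sz);

	while (buf) {
		*used = sz;
		if (process_getinfo(nr_proc, proc, flags, mask,
				    buf, used) == 0)
			return buf;	/* *used is the bytes filled in */
		if (errno != E2BIG)
			break;		/* real failure */
		sz = *used;		/* kernel reported the needed size */
		free(buf);
		buf = malloc(sz);
	}
	free(buf);
	return NULL;
}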
On Thu 30-05-19 17:02:14, Minchan Kim wrote:
> On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote:
> > On Thu 30-05-19 11:17:48, Minchan Kim wrote:
[...]
> > > The first time, I didn't think about atomicity for the address range
> > > race because MADV_COLD/PAGEOUT is not critical with respect to it.
> > > However, you raised the atomicity issue because people would extend
> > > hints to destructive ones easily. I agree with that, and that's why
> > > we discussed how to guard against the race, and Daniel came up with
> > > a good idea.
> >
> > Just for the clarification, I didn't really mean atomicity but rather
> > _consistency_ (essentially time-of-check to time-of-use consistency).
>
> What do you mean by *consistency*? Could you elaborate on it more?

That you operate on the object you obtained by some means. In other
words, that the range you want to call madvise on hasn't been remapped
or replaced by a different mmap operation in the meantime.
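Concretely, the check-to-use window looks like this from a monitor's
point of view. read_vm_seq() stands in for the vma-configuration
sequence number floated earlier in the thread; both helpers below are
assumed, not existing interfaces, so this is a sketch of the race, not
of any real API.

#include <stddef.h>

extern unsigned long read_vm_seq(int pidfd);		/* assumed */
extern long remote_hint_cold(int pidfd, void *addr, size_t len); /* assumed */

static long hint_checked(int pidfd, void *addr, size_t len)
{
	unsigned long seq = read_vm_seq(pidfd);	/* time of check */

	/*
	 * The target can munmap()/mmap() here, so ADDR may now name a
	 * different object at the same address. Re-reading the sequence
	 * number narrows the window but cannot close it from userspace;
	 * a race-free variant would pass SEQ into the kernel so the
	 * syscall itself can fail when the layout has changed.
	 */
	if (read_vm_seq(pidfd) != seq)
		return -1;	/* layout changed: re-read maps and retry */

	return remote_hint_cold(pidfd, addr, len);	/* time of use */
}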
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index b9b51eeb8e1a..b8e230de84a6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -74,4 +74,9 @@
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
				 PKEY_DISABLE_WRITE)
 
+struct pr_madvise_param {
+	int size;				/* the size of this structure */
+	const struct iovec __user *vec;		/* address range array */
+};
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index af02aa17e5c1..f4f569dac2bd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 		ptep_test_and_clear_young(vma, addr, pte);
 		deactivate_page(page);
+		nr_pages++;
+
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
+	*(long *)walk->private += nr_pages;
 	cond_resched();
 
 	return 0;
@@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 static void madvise_cool_page_range(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end)
+			unsigned long addr, unsigned long end,
+			long *nr_pages)
 {
 	struct mm_walk cool_walk = {
 		.pmd_entry = madvise_cool_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
 }
 
 static long madvise_cool(struct vm_area_struct *vma,
-		unsigned long start_addr, unsigned long end_addr)
+		unsigned long start_addr, unsigned long end_addr,
+		long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	int isolated = 0;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 		list_add(&page->lru, &page_list);
 		if (isolated >= SWAP_CLUSTER_MAX) {
 			pte_unmap_unlock(orig_pte, ptl);
-			reclaim_pages(&page_list);
+			nr_pages += reclaim_pages(&page_list);
 			isolated = 0;
 			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 			orig_pte = pte;
@@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
-	reclaim_pages(&page_list);
+	nr_pages += reclaim_pages(&page_list);
 
 	cond_resched();
+	*(long *)walk->private += nr_pages;
 	return 0;
 }
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
			struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end)
+			unsigned long addr, unsigned long end,
+			long *nr_pages)
 {
 	struct mm_walk warm_walk = {
 		.pmd_entry = madvise_cold_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages,
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
 }
 
 static long madvise_cold(struct vm_area_struct *vma,
-		unsigned long start_addr, unsigned long end_addr)
+		unsigned long start_addr, unsigned long end_addr,
+		long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
 static long
 madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
-		unsigned long end, int behavior)
+		unsigned long end, int behavior, long *nr_pages)
 {
 	switch (behavior) {
 	case MADV_REMOVE:
@@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COOL:
-		return madvise_cool(vma, start, end);
+		return madvise_cool(vma, start, end, nr_pages);
 	case MADV_COLD:
-		return madvise_cold(vma, start, end);
+		return madvise_cold(vma, start, end, nr_pages);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(tsk, vma, prev, start,
@@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
 }
 
 static int madvise_core(struct task_struct *tsk, unsigned long start,
-		size_t len_in, int behavior)
+		size_t len_in, int behavior, long *nr_pages)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
 
 	if (start & ~PAGE_MASK)
 		return error;
+
 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 
 	/* Check to see whether len was rounded up from small -ve to zero */
@@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
 	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
+		long pages = 0;
+
 		error = -ENOMEM;
 		if (!vma)
 			goto out;
@@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
			tmp = end;
 
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
+		error = madvise_vma(tsk, vma, &prev, start, tmp,
+					behavior, &pages);
 		if (error)
 			goto out;
+		*nr_pages += pages;
 		start = tmp;
 		if (prev && start < prev->vm_end)
			start = prev->vm_end;
@@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	return madvise_core(current, start, len_in, behavior);
+	long dummy;
+
+	return madvise_core(current, start, len_in, behavior, &dummy);
 }
 
-SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
-		size_t, len_in, int, behavior)
+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
+				struct pr_madvise_param *param)
+{
+	u32 size;
+	int ret;
+
+	memset(param, 0, sizeof(*param));
+
+	ret = get_user(size, &u_param->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	if (!size || size > sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	ret = copy_from_user(param, u_param, size);
+	if (ret)
+		return -EFAULT;
+
+	return ret;
+}
+
+static int process_madvise_core(struct task_struct *tsk, int *behaviors,
+				struct iov_iter *iter,
+				const struct iovec *range_vec,
+				unsigned long riovcnt,
+				unsigned long flags)
+{
+	int i;
+	long err;
+
+	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
+		long ret = 0;
+
+		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
+				range_vec[i].iov_len, behaviors[i],
+				&ret);
+		if (err)
+			ret = err;
+
+		if (copy_to_iter(&ret, sizeof(long), iter) !=
+				sizeof(long)) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = 0;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
+		const int __user *, hints,
+		struct pr_madvise_param __user *, results,
+		struct pr_madvise_param __user *, ranges,
+		unsigned long, flags)
 {
 	int ret;
 	struct fd f;
 	struct pid *pid;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
+	struct pr_madvise_param result_p, range_p;
+	const struct iovec __user *result_vec, __user *range_vec;
+	int *behaviors;
+	struct iovec iovstack_result[UIO_FASTIOV];
+	struct iovec iovstack_r[UIO_FASTIOV];
+	struct iovec *iov_l = iovstack_result;
+	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	ret = pr_madvise_copy_param(results, &result_p);
+	if (ret)
+		return ret;
+
+	ret = pr_madvise_copy_param(ranges, &range_p);
+	if (ret)
+		return ret;
+
+	result_vec = result_p.vec;
+	range_vec = range_p.vec;
+
+	if (result_p.size != sizeof(struct pr_madvise_param) ||
+			range_p.size != sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
+	if (!behaviors)
+		return -ENOMEM;
+
+	ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
+	if (ret < 0)
+		goto free_behavior_vec;
+
+	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
+				&iov_l, &iter);
+	if (ret < 0)
+		goto free_behavior_vec;
+
+	if (!iov_iter_count(&iter)) {
+		ret = -EINVAL;
+		goto free_iovecs;
+	}
+
+	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
+				UIO_FASTIOV, iovstack_r, &iov_r);
+	if (ret <= 0)
+		goto free_iovecs;
 
 	f = fdget(pidfd);
-	if (!f.file)
-		return -EBADF;
+	if (!f.file) {
+		ret = -EBADF;
+		goto free_iovecs;
+	}
 
 	pid = pidfd_to_pid(f.file);
 	if (IS_ERR(pid)) {
 		ret = PTR_ERR(pid);
-		goto err;
+		goto put_fd;
 	}
 
 	ret = -EINVAL;
@@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 	tsk = pid_task(pid, PIDTYPE_PID);
 	if (!tsk) {
 		rcu_read_unlock();
-		goto err;
+		goto put_fd;
 	}
 	get_task_struct(tsk);
 	rcu_read_unlock();
@@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
 		if (ret == -EACCES)
 			ret = -EPERM;
-		goto err;
+		goto put_task;
 	}
-	ret = madvise_core(tsk, start, len_in, behavior);
+
+	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
+				nr_elem, flags);
 	mmput(mm);
+put_task:
 	put_task_struct(tsk);
-err:
+put_fd:
 	fdput(f);
+free_iovecs:
+	if (iov_r != iovstack_r)
+		kfree(iov_r);
+	kfree(iov_l);
+free_behavior_vec:
+	kfree(behaviors);
+
	return ret;
 }
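Reading process_madvise_core() above: each result slot receives either
the page count that madvise_core() accumulated for that range
(deactivated pages for MADV_COOL, reclaimed pages for MADV_COLD) or, on
failure, the negative errno. A caller-side sketch of consuming those
results follows; the one-long-per-range layout is taken from this RFC
and is an assumption about an unmerged interface.

#include <stdio.h>
#include <sys/uio.h>

/* res[i] mirrors what process_madvise_core() wrote for range i. */
static void report_results(const long *res, const struct iovec *ranges,
			   int n)
{
	for (int i = 0; i < n; i++) {
		if (res[i] < 0)
			fprintf(stderr, "range %p+%zu: error %ld\n",
				ranges[i].iov_base, ranges[i].iov_len,
				-res[i]);
		else
			printf("range %p+%zu: %ld pages affected\n",
			       ranges[i].iov_base, ranges[i].iov_len,
			       res[i]);
	}
}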