
[RFC,6/7] mm: extend process_madvise syscall to support vector array

Message ID 20190520035254.57579-7-minchan@kernel.org (mailing list archive)
State New, archived
Series introduce memory hinting API for external process

Commit Message

Minchan Kim May 20, 2019, 3:52 a.m. UTC
Currently, the process_madvise syscall works on only one address range,
so the user must call the syscall several times to give hints to
multiple address ranges.

This patch extends the process_madvise syscall to support multiple
hints, address ranges and return values so the user can give hints
all at once.

struct pr_madvise_param {
    int size;                       /* the size of this structure */
    const struct iovec __user *vec; /* address range array */
};

int process_madvise(int pidfd, ssize_t nr_elem,
		    int *behavior,
		    struct pr_madvise_param *results,
		    struct pr_madvise_param *ranges,
		    unsigned long flags);

- pidfd

file descriptor referring to the target process

- nr_elem

the number of elements in the behavior, results, and ranges arrays

- behavior

hints for each address range of the remote process, so the user can
give a different hint for each range.

- results

array of buffers that receive the result of each associated remote
address range action.

- ranges

array of buffers holding the remote process's address ranges to be
processed.

- flags

extra argument reserved for the future; it must be zero for now.

Example)

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>

/* ALLOC_SIZE is arbitrary for this example; MADV_COOL/MADV_COLD and
 * __NR_process_madvise come from earlier patches in this series. */
#define ALLOC_SIZE (2 << 20)

struct pr_madvise_param {
        int size;
        const struct iovec *vec;
};

int main(int argc, char *argv[])
{
        struct pr_madvise_param retp, rangep;
        struct iovec result_vec[2], range_vec[2];
        int hints[2];
        long ret[2];
        void *addr[2];
        pid_t pid;
        char cmd[64] = {0,};

        addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
                          MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);

        if (MAP_FAILED == addr[0])
                return 1;

        addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
                          MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);

        if (MAP_FAILED == addr[1])
                return 1;

        hints[0] = MADV_COLD;
        range_vec[0].iov_base = addr[0];
        range_vec[0].iov_len = ALLOC_SIZE;
        result_vec[0].iov_base = &ret[0];
        result_vec[0].iov_len = sizeof(long);
        retp.vec = result_vec;
        retp.size = sizeof(struct pr_madvise_param);

        hints[1] = MADV_COOL;
        range_vec[1].iov_base = addr[1];
        range_vec[1].iov_len = ALLOC_SIZE;
        result_vec[1].iov_base = &ret[1];
        result_vec[1].iov_len = sizeof(long);
        rangep.vec = range_vec;
        rangep.size = sizeof(struct pr_madvise_param);

        pid = fork();
        if (!pid) {
                sleep(10);
        } else {
                int pidfd;

                /* in this series, a pidfd is obtained by opening /proc/<pid> */
                snprintf(cmd, sizeof(cmd), "/proc/%d", pid);
                pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);
                if (pidfd < 0)
                        return 1;

                /* munmap to make pages private for the child */
                munmap(addr[0], ALLOC_SIZE);
                munmap(addr[1], ALLOC_SIZE);
                system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
                if (syscall(__NR_process_madvise, pidfd, 2, hints,
                                                &retp, &rangep, 0))
                        perror("process_madvise fail");
                system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
        }

        return 0;
}

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/uapi/asm-generic/mman-common.h |   5 +
 mm/madvise.c                           | 184 +++++++++++++++++++++----
 2 files changed, 166 insertions(+), 23 deletions(-)

Comments

Michal Hocko May 20, 2019, 9:22 a.m. UTC | #1
[Cc linux-api]

On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> Currently, process_madvise syscall works for only one address range
> so user should call the syscall several times to give hints to
> multiple address range.

Is that a problem? How big of a problem? Any numbers?

> This patch extends process_madvise syscall to support multiple
> hints, address ranges and return vaules so user could give hints
> all at once.
> 
> struct pr_madvise_param {
>     int size;                       /* the size of this structure */
>     const struct iovec __user *vec; /* address range array */
> }
> 
> int process_madvise(int pidfd, ssize_t nr_elem,
> 		    int *behavior,
> 		    struct pr_madvise_param *results,
> 		    struct pr_madvise_param *ranges,
> 		    unsigned long flags);
> 
> - pidfd
> 
> target process fd
> 
> - nr_elem
> 
> the number of elemenent of array behavior, results, ranges
> 
> - behavior
> 
> hints for each address range in remote process so that user could
> give different hints for each range.

What is the guarantee of a single call? Do all hints get applied, or
does the first failure back off? What are the atomicity guarantees?

> 
> - results
> 
> array of buffers to get results for associated remote address range
> action.
> 
> - ranges
> 
> array to buffers to have remote process's address ranges to be
> processed
> 
> - flags
> 
> extra argument for the future. It should be zero this moment.
> 
> Example)
> 
> struct pr_madvise_param {
>         int size;
>         const struct iovec *vec;
> };
> 
> int main(int argc, char *argv[])
> {
>         struct pr_madvise_param retp, rangep;
>         struct iovec result_vec[2], range_vec[2];
>         int hints[2];
>         long ret[2];
>         void *addr[2];
> 
>         pid_t pid;
>         char cmd[64] = {0,};
>         addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[0])
>                 return 1;
> 
>         addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[1])
>                 return 1;
> 
>         hints[0] = MADV_COLD;
> 	range_vec[0].iov_base = addr[0];
>         range_vec[0].iov_len = ALLOC_SIZE;
>         result_vec[0].iov_base = &ret[0];
>         result_vec[0].iov_len = sizeof(long);
> 	retp.vec = result_vec;
>         retp.size = sizeof(struct pr_madvise_param);
> 
>         hints[1] = MADV_COOL;
>         range_vec[1].iov_base = addr[1];
>         range_vec[1].iov_len = ALLOC_SIZE;
>         result_vec[1].iov_base = &ret[1];
>         result_vec[1].iov_len = sizeof(long);
>         rangep.vec = range_vec;
>         rangep.size = sizeof(struct pr_madvise_param);
> 
>         pid = fork();
>         if (!pid) {
>                 sleep(10);
>         } else {
>                 int pidfd = open(cmd,  O_DIRECTORY | O_CLOEXEC);
>                 if (pidfd < 0)
>                         return 1;
> 
>                 /* munmap to make pages private for the child */
>                 munmap(addr[0], ALLOC_SIZE);
>                 munmap(addr[1], ALLOC_SIZE);
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>                 if (syscall(__NR_process_madvise, pidfd, 2, behaviors,
> 						&retp, &rangep, 0))
>                         perror("process_madvise fail\n");
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>         }
> 
>         return 0;
> }
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/uapi/asm-generic/mman-common.h |   5 +
>  mm/madvise.c                           | 184 +++++++++++++++++++++----
>  2 files changed, 166 insertions(+), 23 deletions(-)
> 
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index b9b51eeb8e1a..b8e230de84a6 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -74,4 +74,9 @@
>  #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
>  				 PKEY_DISABLE_WRITE)
>  
> +struct pr_madvise_param {
> +	int size;			/* the size of this structure */
> +	const struct iovec __user *vec;	/* address range array */
> +};
> +
>  #endif /* __ASM_GENERIC_MMAN_COMMON_H */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index af02aa17e5c1..f4f569dac2bd 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  	struct page *page;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  		ptep_test_and_clear_young(vma, addr, pte);
>  		deactivate_page(page);
> +		nr_pages++;
> +
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> +	*(long *)walk->private += nr_pages;
>  	cond_resched();
>  
>  	return 0;
> @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  static void madvise_cool_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk cool_walk = {
>  		.pmd_entry = madvise_cool_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
>  }
>  
>  static long madvise_cool(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	int isolated = 0;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  		list_add(&page->lru, &page_list);
>  		if (isolated >= SWAP_CLUSTER_MAX) {
>  			pte_unmap_unlock(orig_pte, ptl);
> -			reclaim_pages(&page_list);
> +			nr_pages += reclaim_pages(&page_list);
>  			isolated = 0;
>  			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  			orig_pte = pte;
> @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> -	reclaim_pages(&page_list);
> +	nr_pages += reclaim_pages(&page_list);
>  	cond_resched();
>  
> +	*(long *)walk->private += nr_pages;
>  	return 0;
>  }
>  
>  static void madvise_cold_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk warm_walk = {
>  		.pmd_entry = madvise_cold_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages,
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
>  
>  
>  static long madvise_cold(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
>  static long
>  madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  		struct vm_area_struct **prev, unsigned long start,
> -		unsigned long end, int behavior)
> +		unsigned long end, int behavior, long *nr_pages)
>  {
>  	switch (behavior) {
>  	case MADV_REMOVE:
> @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  	case MADV_WILLNEED:
>  		return madvise_willneed(vma, prev, start, end);
>  	case MADV_COOL:
> -		return madvise_cool(vma, start, end);
> +		return madvise_cool(vma, start, end, nr_pages);
>  	case MADV_COLD:
> -		return madvise_cold(vma, start, end);
> +		return madvise_cold(vma, start, end, nr_pages);
>  	case MADV_FREE:
>  	case MADV_DONTNEED:
>  		return madvise_dontneed_free(tsk, vma, prev, start,
> @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
>  }
>  
>  static int madvise_core(struct task_struct *tsk, unsigned long start,
> -			size_t len_in, int behavior)
> +			size_t len_in, int behavior, long *nr_pages)
>  {
>  	unsigned long end, tmp;
>  	struct vm_area_struct *vma, *prev;
> @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  
>  	if (start & ~PAGE_MASK)
>  		return error;
> +
>  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
>  
>  	/* Check to see whether len was rounded up from small -ve to zero */
> @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  	blk_start_plug(&plug);
>  	for (;;) {
>  		/* Still start < end. */
> +		long pages = 0;
> +
>  		error = -ENOMEM;
>  		if (!vma)
>  			goto out;
> @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  			tmp = end;
>  
>  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
> -		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
> +		error = madvise_vma(tsk, vma, &prev, start, tmp,
> +					behavior, &pages);
>  		if (error)
>  			goto out;
> +		*nr_pages += pages;
>  		start = tmp;
>  		if (prev && start < prev->vm_end)
>  			start = prev->vm_end;
> @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>   */
>  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>  {
> -	return madvise_core(current, start, len_in, behavior);
> +	unsigned long dummy;
> +
> +	return madvise_core(current, start, len_in, behavior, &dummy);
>  }
>  
> -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> -		size_t, len_in, int, behavior)
> +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
> +		struct pr_madvise_param *param)
> +{
> +	u32 size;
> +	int ret;
> +
> +	memset(param, 0, sizeof(*param));
> +
> +	ret = get_user(size, &u_param->size);
> +	if (ret)
> +		return ret;
> +
> +	if (size > PAGE_SIZE)
> +		return -E2BIG;
> +
> +	if (!size || size > sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	ret = copy_from_user(param, u_param, size);
> +	if (ret)
> +		return -EFAULT;
> +
> +	return ret;
> +}
> +
> +static int process_madvise_core(struct task_struct *tsk, int *behaviors,
> +				struct iov_iter *iter,
> +				const struct iovec *range_vec,
> +				unsigned long riovcnt,
> +				unsigned long flags)
> +{
> +	int i;
> +	long err;
> +
> +	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
> +		long ret = 0;
> +
> +		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
> +				range_vec[i].iov_len, behaviors[i],
> +				&ret);
> +		if (err)
> +			ret = err;
> +
> +		if (copy_to_iter(&ret, sizeof(long), iter) !=
> +				sizeof(long)) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		err = 0;
> +	}
> +
> +	return err;
> +}
> +
> +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
> +			const int __user *, hints,
> +			struct pr_madvise_param __user *, results,
> +			struct pr_madvise_param __user *, ranges,
> +			unsigned long, flags)
>  {
>  	int ret;
>  	struct fd f;
>  	struct pid *pid;
>  	struct task_struct *tsk;
>  	struct mm_struct *mm;
> +	struct pr_madvise_param result_p, range_p;
> +	const struct iovec __user *result_vec, __user *range_vec;
> +	int *behaviors;
> +	struct iovec iovstack_result[UIO_FASTIOV];
> +	struct iovec iovstack_r[UIO_FASTIOV];
> +	struct iovec *iov_l = iovstack_result;
> +	struct iovec *iov_r = iovstack_r;
> +	struct iov_iter iter;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	ret = pr_madvise_copy_param(results, &result_p);
> +	if (ret)
> +		return ret;
> +
> +	ret = pr_madvise_copy_param(ranges, &range_p);
> +	if (ret)
> +		return ret;
> +
> +	result_vec = result_p.vec;
> +	range_vec = range_p.vec;
> +
> +	if (result_p.size != sizeof(struct pr_madvise_param) ||
> +			range_p.size != sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
> +	if (!behaviors)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
> +				&iov_l, &iter);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	if (!iov_iter_count(&iter)) {
> +		ret = -EINVAL;
> +		goto free_iovecs;
> +	}
> +
> +	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
> +				UIO_FASTIOV, iovstack_r, &iov_r);
> +	if (ret <= 0)
> +		goto free_iovecs;
>  
>  	f = fdget(pidfd);
> -	if (!f.file)
> -		return -EBADF;
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto free_iovecs;
> +	}
>  
>  	pid = pidfd_to_pid(f.file);
>  	if (IS_ERR(pid)) {
>  		ret = PTR_ERR(pid);
> -		goto err;
> +		goto put_fd;
>  	}
>  
>  	ret = -EINVAL;
> @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  	tsk = pid_task(pid, PIDTYPE_PID);
>  	if (!tsk) {
>  		rcu_read_unlock();
> -		goto err;
> +		goto put_fd;
>  	}
>  	get_task_struct(tsk);
>  	rcu_read_unlock();
> @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
>  		if (ret == -EACCES)
>  			ret = -EPERM;
> -		goto err;
> +		goto put_task;
>  	}
> -	ret = madvise_core(tsk, start, len_in, behavior);
> +
> +	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
> +					nr_elem, flags);
>  	mmput(mm);
> +put_task:
>  	put_task_struct(tsk);
> -err:
> +put_fd:
>  	fdput(f);
> +free_iovecs:
> +	if (iov_r != iovstack_r)
> +		kfree(iov_r);
> +	kfree(iov_l);
> +free_behavior_vec:
> +	kfree(behaviors);
> +
>  	return ret;
>  }
> -- 
> 2.21.0.1020.gf2820cf01a-goog
>
Minchan Kim May 21, 2019, 2:48 a.m. UTC | #2
On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> [Cc linux-api]
> 
> On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > Currently, process_madvise syscall works for only one address range
> > so user should call the syscall several times to give hints to
> > multiple address range.
> 
> Is that a problem? How big of a problem? Any numbers?

We easily have 2000+ vmas, so it's not a trivial overhead. I will come
up with numbers in the description at the respin.

> 
> > This patch extends process_madvise syscall to support multiple
> > hints, address ranges and return vaules so user could give hints
> > all at once.
> > 
> > struct pr_madvise_param {
> >     int size;                       /* the size of this structure */
> >     const struct iovec __user *vec; /* address range array */
> > }
> > 
> > int process_madvise(int pidfd, ssize_t nr_elem,
> > 		    int *behavior,
> > 		    struct pr_madvise_param *results,
> > 		    struct pr_madvise_param *ranges,
> > 		    unsigned long flags);
> > 
> > - pidfd
> > 
> > target process fd
> > 
> > - nr_elem
> > 
> > the number of elemenent of array behavior, results, ranges
> > 
> > - behavior
> > 
> > hints for each address range in remote process so that user could
> > give different hints for each range.
> 
> What is the guarantee of a single call? Do all hints get applied or the
> first failure backs of? What are the atomicity guarantees?

All hints will be tried even if one of them fails. The user will see
the success or failure of each from the results parameter.
For a single call, there is no guarantee of atomicity.
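
A minimal sketch of how a caller could consume those per-range results
under this convention (assuming each slot receives either a negative
errno or the per-range page count; names follow the example program in
the commit message):

#include <stdio.h>

static void report_results(const long *ret, int nr)
{
        /* ret[i] < 0: per-range error; otherwise pages processed */
        for (int i = 0; i < nr; i++) {
                if (ret[i] < 0)
                        fprintf(stderr, "range %d failed: %ld\n", i, ret[i]);
                else
                        printf("range %d: %ld pages processed\n", i, ret[i]);
        }
}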

> [... rest of quoted patch snipped ...]
Michal Hocko May 21, 2019, 6:24 a.m. UTC | #3
On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > [Cc linux-api]
> > 
> > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > Currently, process_madvise syscall works for only one address range
> > > so user should call the syscall several times to give hints to
> > > multiple address range.
> > 
> > Is that a problem? How big of a problem? Any numbers?
> 
> We easily have 2000+ vma so it's not trivial overhead. I will come up
> with number in the description at respin.

Does this really have to be a fast operation? I would expect the monitor
is by no means a fast path. The system call overhead is not what it used
to be, sigh, but still for something that is not a hot path it should be
tolerable, especially when the whole operation is quite expensive on its
own (wrt. the syscall entry/exit).

I am not saying we do not need a multiplexing API, I am just not sure
we need it right away. Btw. there was some demand for other MM syscalls
to provide a multiplexing API (e.g. mprotect), maybe it would be better
to handle those in one go?
Minchan Kim May 21, 2019, 10:26 a.m. UTC | #4
On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > [Cc linux-api]
> > > 
> > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > Currently, process_madvise syscall works for only one address range
> > > > so user should call the syscall several times to give hints to
> > > > multiple address range.
> > > 
> > > Is that a problem? How big of a problem? Any numbers?
> > 
> > We easily have 2000+ vma so it's not trivial overhead. I will come up
> > with number in the description at respin.
> 
> Does this really have to be a fast operation? I would expect the monitor
> is by no means a fast path. The system call overhead is not what it used
> to be, sigh, but still for something that is not a hot path it should be
> tolerable, especially when the whole operation is quite expensive on its
> own (wrt. the syscall entry/exit).

How is this different from process_vm_[readv|writev] and vmsplice?
If a lot of ranges need to be covered, a vector operation makes sense
to me.

> 
> I am not saying we do not need a multiplexing API, I am just not sure
> we need it right away. Btw. there was some demand for other MM syscalls
> to provide a multiplexing API (e.g. mprotect), maybe it would be better
> to handle those in one go?

That's exactly what Daniel Colascione suggested in internal review.
That would be an interesting approach if we could aggregate all of
those system calls in one go.

> -- 
> Michal Hocko
> SUSE Labs
Michal Hocko May 21, 2019, 10:37 a.m. UTC | #5
On Tue 21-05-19 19:26:13, Minchan Kim wrote:
> On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> > On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > > [Cc linux-api]
> > > > 
> > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > > Currently, process_madvise syscall works for only one address range
> > > > > so user should call the syscall several times to give hints to
> > > > > multiple address range.
> > > > 
> > > > Is that a problem? How big of a problem? Any numbers?
> > > 
> > > We easily have 2000+ vma so it's not trivial overhead. I will come up
> > > with number in the description at respin.
> > 
> > Does this really have to be a fast operation? I would expect the monitor
> > is by no means a fast path. The system call overhead is not what it used
> > to be, sigh, but still for something that is not a hot path it should be
> > tolerable, especially when the whole operation is quite expensive on its
> > own (wrt. the syscall entry/exit).
> 
> What's different with process_vm_[readv|writev] and vmsplice?
> If the range needed to be covered is a lot, vector operation makes senese
> to me.

I am not saying that the vector API is wrong. All I am trying to say
is that the benefit is not really clear so far. If you want to push it
through then you had better get some supporting data.
Minchan Kim May 27, 2019, 7:49 a.m. UTC | #6
On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote:
> On Tue 21-05-19 19:26:13, Minchan Kim wrote:
> > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> > > On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > > > [Cc linux-api]
> > > > > 
> > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > > > Currently, process_madvise syscall works for only one address range
> > > > > > so user should call the syscall several times to give hints to
> > > > > > multiple address range.
> > > > > 
> > > > > Is that a problem? How big of a problem? Any numbers?
> > > > 
> > > > We easily have 2000+ vma so it's not trivial overhead. I will come up
> > > > with number in the description at respin.
> > > 
> > > Does this really have to be a fast operation? I would expect the monitor
> > > is by no means a fast path. The system call overhead is not what it used
> > > to be, sigh, but still for something that is not a hot path it should be
> > > tolerable, especially when the whole operation is quite expensive on its
> > > own (wrt. the syscall entry/exit).
> > 
> > What's different with process_vm_[readv|writev] and vmsplice?
> > If the range needed to be covered is a lot, vector operation makes senese
> > to me.
> 
> I am not saying that the vector API is wrong. All I am trying to say is
> that the benefit is not really clear so far. If you want to push it
> through then you should better get some supporting data.

I measured 1000 madvise syscalls vs. a single vector-range syscall
with 1000 ranges on a modern ARM64 device. I saw a 15% improvement,
but the absolute gain is just 1ms, so I don't think it's worth
supporting. I will drop vector support at the next revision.
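
A rough sketch of such a micro-benchmark harness (illustrative only,
not the exact code measured; wiring the callback to a loop of 1000
madvise() calls or to one vectored process_madvise() call is left to
the test environment, since the vectored call only exists with this
RFC applied):

#include <stdio.h>
#include <time.h>

/* time one invocation of the given syscall path in milliseconds */
static double run_ms(void (*path)(void))
{
        struct timespec t0, t1;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        path();
        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0.tv_sec) * 1e3 +
               (t1.tv_nsec - t0.tv_nsec) / 1e6;
}

static void noop(void) { }      /* stand-in for either syscall path */

int main(void)
{
        printf("1000 single-range calls: %.3f ms\n", run_ms(noop));
        printf("one vectored call:       %.3f ms\n", run_ms(noop));
        return 0;
}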

Thanks for the review, Michal!
Daniel Colascione May 29, 2019, 10:08 a.m. UTC | #7
On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote:
> > [... earlier thread snipped ...]
> >
> > I am not saying that the vector API is wrong. All I am trying to say is
> > that the benefit is not really clear so far. If you want to push it
> > through then you should better get some supporting data.
>
> I measured 1000 madvise syscall vs. a vector range syscall with 1000
> ranges on ARM64 mordern device. Even though I saw 15% improvement but
> absoluate gain is just 1ms so I don't think it's worth to support.
> I will drop vector support at next revision.

Please do keep the vector support. Absolute timing is misleading,
since in a tight loop you're not going to contend on mmap_sem. We've
seen tons of improvements in things like camera start that come from
coalescing mprotect calls, with the gains coming from taking and
releasing various locks a lot less often and bouncing around less on
the contended lock paths. Raw throughput doesn't tell the whole story,
especially on mobile.
Michal Hocko May 29, 2019, 10:33 a.m. UTC | #8
On Wed 29-05-19 03:08:32, Daniel Colascione wrote:
> [... earlier thread snipped ...]
> 
> Please do keep the vector support. Absolute timing is misleading,
> since in a tight loop, you're not going to contend on mmap_sem. We've
> seen tons of improvements in things like camera start come from
> coalescing mprotect calls, with the gains coming from taking and
> releasing various locks a lot less often and bouncing around less on
> the contended lock paths. Raw throughput doesn't tell the whole story,
> especially on mobile.

This will always be a double-edged sword. Taking a lock for longer can
improve the throughput of a single call, but it makes the latency for
anybody contending on the lock much worse.

Besides that, please do not overcomplicate the thing from the very
beginning. Let's start with a simple and well defined remote madvise
alternative first and build a vector API on top, with some numbers
based on _real_ workloads.
Minchan Kim May 30, 2019, 12:35 a.m. UTC | #9
On Wed, May 29, 2019 at 12:14:47PM +0800, Hillf Danton wrote:
> 
> On Mon, 20 May 2019 12:52:53 +0900 Minchan Kim wrote:
> > Example)
> > 
> Better if the following stuff is stored somewhere under the
> tools/testing directory.

Sure, I will do that once we get past the RFC stage.
Minchan Kim May 30, 2019, 2:17 a.m. UTC | #10
On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote:
> On Wed 29-05-19 03:08:32, Daniel Colascione wrote:
> > [... earlier thread snipped ...]
> > 
> > Please do keep the vector support. Absolute timing is misleading,
> > since in a tight loop, you're not going to contend on mmap_sem. We've
> > seen tons of improvements in things like camera start come from
> > coalescing mprotect calls, with the gains coming from taking and
> > releasing various locks a lot less often and bouncing around less on
> > the contended lock paths. Raw throughput doesn't tell the whole story,
> > especially on mobile.
> 
> This will always be a double edge sword. Taking a lock for longer can
> improve a throughput of a single call but it would make a latency for
> anybody contending on the lock much worse.
> 
> Besides that, please do not overcomplicate the thing from the early
> beginning please. Let's start with a simple and well defined remote
> madvise alternative first and build a vector API on top with some
> numbers based on _real_ workloads.

At first, I didn't think about atomicity of the address range race,
because MADV_COLD/PAGEOUT is not critical with respect to that race.
However, you raised the atomicity issue because people could easily
extend the hints to destructive ones. I agree with that, and that's
why we discussed how to handle the race; Daniel came up with a good
idea:

  - vma configuration seq number via process_getinfo(2).

We discussed the race issue without _real_ workloads/requests because
it's common sense that people might extend the syscall later.

It's the same here. For the current workload, based on my experiment,
we don't need to support vectors from a performance point of view.
However, it was a rather limited experiment; some configurations might
have 10000+ vmas or a really slow CPU.

Furthermore, I want to have vector support because of the atomicity
issue, if it's really one we should consider. With vector support in
the API plus the vma configuration sequence number from Daniel, we
could make operations on multiple address ranges atomic. However,
since we are not introducing vectors at this moment, we would need to
introduce *another syscall* later to handle multiple ranges all at
once atomically, if that's okay.

Other thought:
maybe we could extend an address-range batch syscall to cover other MM
syscalls like mmap/munmap/madvise/mprotect, because there are multiple
users that would benefit from this general batching mechanism.
Michal Hocko May 30, 2019, 6:57 a.m. UTC | #11
On Thu 30-05-19 11:17:48, Minchan Kim wrote:
> [... earlier thread snipped ...]
> 
> First time, I didn't think about atomicity about address range race
> because MADV_COLD/PAGEOUT is not critical for the race.
> However you raised the atomicity issue because people would extend
> hints to destructive ones easily. I agree with that and that's why
> we discussed how to guarantee the race and Daniel comes up with good idea.

Just for clarification, I didn't really mean atomicity but rather
_consistency_ (essentially time-of-check to time-of-use consistency).
 
>   - vma configuration seq number via process_getinfo(2).
> 
> We discussed the race issue without _read_ workloads/requests because
> it's common sense that people might extend the syscall later.
> 
> Here is same. For current workload, we don't need to support vector
> for perfomance point of view based on my experiment. However, it's
> rather limited experiment. Some configuration might have 10000+ vmas
> or really slow CPU. 
> 
> Furthermore, I want to have vector support due to atomicity issue
> if it's really the one we should consider.
> With vector support of the API and vma configuration sequence number
> from Daniel, we could support address ranges operations's atomicity.

I am not sure what you mean here. Perform all ranges atomically wrt.
other address space modifications? If yes, I am not sure we want that
semantic, because it can cause really long stalls for other
operations; but that is a discussion on its own and I would rather
focus on a simple interface first.

> However, since we don't introduce vector at this moment, we need to
> introduce *another syscall* later to be able to handle multile ranges
> all at once atomically if it's okay.

Agreed.

> Other thought:
> Maybe we could extend address range batch syscall covers other MM
> syscall like mmap/munmap/madvise/mprotect and so on because there
> are multiple users that would benefit from this general batching
> mechanism.

Again a discussion on its own ;)
Minchan Kim May 30, 2019, 8:02 a.m. UTC | #12
On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote:
> On Thu 30-05-19 11:17:48, Minchan Kim wrote:
> > On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote:
> > > On Wed 29-05-19 03:08:32, Daniel Colascione wrote:
> > > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote:
> > > > >
> > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote:
> > > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote:
> > > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> > > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > > > > > > > > [Cc linux-api]
> > > > > > > > > >
> > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > > > > > > > > Currently, process_madvise syscall works for only one address range
> > > > > > > > > > > so user should call the syscall several times to give hints to
> > > > > > > > > > > multiple address range.
> > > > > > > > > >
> > > > > > > > > > Is that a problem? How big of a problem? Any numbers?
> > > > > > > > >
> > > > > > > > > We easily have 2000+ vmas, so it's not trivial overhead. I will come up
> > > > > > > > > with numbers in the description at respin.
> > > > > > > >
> > > > > > > > Does this really have to be a fast operation? I would expect the monitor
> > > > > > > > is by no means a fast path. The system call overhead is not what it used
> > > > > > > > to be, sigh, but still for something that is not a hot path it should be
> > > > > > > > tolerable, especially when the whole operation is quite expensive on its
> > > > > > > > own (wrt. the syscall entry/exit).
> > > > > > >
> > > > > > > How is this different from process_vm_[readv|writev] and vmsplice?
> > > > > > > If many ranges need to be covered, a vector operation makes sense
> > > > > > > to me.
> > > > > >
> > > > > > I am not saying that the vector API is wrong. All I am trying to say is
> > > > > > that the benefit is not really clear so far. If you want to push it
> > > > > > through then you should get some supporting data.
> > > > >
> > > > > I measured 1000 madvise syscalls vs. a single vector-range syscall with
> > > > > 1000 ranges on a modern ARM64 device. Even though I saw a 15% improvement,
> > > > > the absolute gain was just 1ms, so I don't think it's worth supporting.
> > > > > I will drop vector support in the next revision.
> > > > 
> > > > Please do keep the vector support. Absolute timing is misleading,
> > > > since in a tight loop, you're not going to contend on mmap_sem. We've
> > > > seen tons of improvements in things like camera start come from
> > > > coalescing mprotect calls, with the gains coming from taking and
> > > > releasing various locks a lot less often and bouncing around less on
> > > > the contended lock paths. Raw throughput doesn't tell the whole story,
> > > > especially on mobile.
> > > 
> > > This will always be a double-edged sword. Taking a lock for longer can
> > > improve the throughput of a single call, but it would make the latency for
> > > anybody contending on the lock much worse.
> > > 
> > > Besides that, please do not overcomplicate things from the very
> > > beginning. Let's start with a simple and well defined remote
> > > madvise alternative first and build a vector API on top with some
> > > numbers based on _real_ workloads.
> > 
> > At first, I didn't think about the atomicity of the address range race
> > because MADV_COLD/PAGEOUT is not critical with respect to that race.
> > However, you raised the atomicity issue because people could easily extend
> > the hints to destructive ones. I agree with that, and that's why
> > we discussed how to close the race, and Daniel came up with a good idea.
> 
> Just for clarification, I didn't really mean atomicity but rather
> _consistency_ (essentially time-of-check to time-of-use consistency).

What do you mean by *consistency*? Could you elaborate?

>  
> >   - vma configuration seq number via process_getinfo(2).
> > 
> > We discussed the race issue without _real_ workloads/requests because
> > it's common sense that people might extend the syscall later.
> > 
> > It is the same here. For current workloads, we don't need vector support
> > from a performance point of view, based on my experiment. However, it was
> > a rather limited experiment. Some configurations might have 10000+ vmas
> > or a really slow CPU.
> > 
> > Furthermore, I want to have vector support due to the atomicity issue,
> > if it's really one we should consider.
> > With vector support in the API and the vma configuration sequence number
> > from Daniel, we could make operations on multiple address ranges atomic.
> 
> I am not sure what you mean here. Perform all ranges atomically wrt.
> other address space modifications? If so, I am not sure we want that

Yup, I think it's *necessary* if we want to support destructive hints
via process_madvise.

> semantic because it can cause really long stalls for other operations

It might or it might not.

For example, if we could multiplex several operations that each need to
walk the page tables, a single combined walk could be more effective
than doing a separate page table walk for each syscall.

> but that is a discussion on its own and I would rather focus on a simple
> interface first.

It seems it's time to send RFCv2 since we have discussed a lot, although we
don't have a clear conclusion yet. But I still want to understand what you
meant by _consistency_.

Thanks for the review, Michal! It's very helpful.

> 
> > However, since we don't introduce the vector at this moment, we need to
> > introduce *another syscall* later to be able to handle multiple ranges
> > all at once atomically, if that's okay.
> 
> Agreed.
> 
> > Other thought:
> > Maybe we could extend the address range batch syscall to cover other MM
> > syscalls like mmap/munmap/madvise/mprotect and so on because there
> > are multiple users that would benefit from this general batching
> > mechanism.
> 
> Again a discussion on its own ;)
> 
> -- 
> Michal Hocko
> SUSE Labs
Daniel Colascione May 30, 2019, 4:19 p.m. UTC | #13
On Thu, May 30, 2019 at 1:02 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote:
> > On Thu 30-05-19 11:17:48, Minchan Kim wrote:
> > > On Wed, May 29, 2019 at 12:33:52PM +0200, Michal Hocko wrote:
> > > > On Wed 29-05-19 03:08:32, Daniel Colascione wrote:
> > > > > On Mon, May 27, 2019 at 12:49 AM Minchan Kim <minchan@kernel.org> wrote:
> > > > > >
> > > > > > On Tue, May 21, 2019 at 12:37:26PM +0200, Michal Hocko wrote:
> > > > > > > On Tue 21-05-19 19:26:13, Minchan Kim wrote:
> > > > > > > > On Tue, May 21, 2019 at 08:24:21AM +0200, Michal Hocko wrote:
> > > > > > > > > On Tue 21-05-19 11:48:20, Minchan Kim wrote:
> > > > > > > > > > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> > > > > > > > > > > [Cc linux-api]
> > > > > > > > > > >
> > > > > > > > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > > > > > > > > > > > Currently, process_madvise syscall works for only one address range
> > > > > > > > > > > > so user should call the syscall several times to give hints to
> > > > > > > > > > > > multiple address range.
> > > > > > > > > > >
> > > > > > > > > > > Is that a problem? How big of a problem? Any numbers?
> > > > > > > > > >
> > > > > > > > > > We easily have 2000+ vmas, so it's not trivial overhead. I will come up
> > > > > > > > > > with numbers in the description at respin.
> > > > > > > > >
> > > > > > > > > Does this really have to be a fast operation? I would expect the monitor
> > > > > > > > > is by no means a fast path. The system call overhead is not what it used
> > > > > > > > > to be, sigh, but still for something that is not a hot path it should be
> > > > > > > > > tolerable, especially when the whole operation is quite expensive on its
> > > > > > > > > own (wrt. the syscall entry/exit).
> > > > > > > >
> > > > > > > > How is this different from process_vm_[readv|writev] and vmsplice?
> > > > > > > > If many ranges need to be covered, a vector operation makes sense
> > > > > > > > to me.
> > > > > > >
> > > > > > > I am not saying that the vector API is wrong. All I am trying to say is
> > > > > > > that the benefit is not really clear so far. If you want to push it
> > > > > > > through then you should get some supporting data.
> > > > > >
> > > > > > I measured 1000 madvise syscalls vs. a single vector-range syscall with
> > > > > > 1000 ranges on a modern ARM64 device. Even though I saw a 15% improvement,
> > > > > > the absolute gain was just 1ms, so I don't think it's worth supporting.
> > > > > > I will drop vector support in the next revision.
> > > > >
> > > > > Please do keep the vector support. Absolute timing is misleading,
> > > > > since in a tight loop, you're not going to contend on mmap_sem. We've
> > > > > seen tons of improvements in things like camera start come from
> > > > > coalescing mprotect calls, with the gains coming from taking and
> > > > > releasing various locks a lot less often and bouncing around less on
> > > > > the contended lock paths. Raw throughput doesn't tell the whole story,
> > > > > especially on mobile.
> > > >
> > > > This will always be a double-edged sword. Taking a lock for longer can
> > > > improve the throughput of a single call, but it would make the latency for
> > > > anybody contending on the lock much worse.
> > > >
> > > > Besides that, please do not overcomplicate things from the very
> > > > beginning. Let's start with a simple and well defined remote
> > > > madvise alternative first and build a vector API on top with some
> > > > numbers based on _real_ workloads.
> > >
> > > At first, I didn't think about the atomicity of the address range race
> > > because MADV_COLD/PAGEOUT is not critical with respect to that race.
> > > However, you raised the atomicity issue because people could easily extend
> > > the hints to destructive ones. I agree with that, and that's why
> > > we discussed how to close the race, and Daniel came up with a good idea.
> >
> > Just for clarification, I didn't really mean atomicity but rather
> > _consistency_ (essentially time-of-check to time-of-use consistency).
>
> What do you mean by *consistency*? Could you elaborate?
>
> >
> > >   - vma configuration seq number via process_getinfo(2).
> > >
> > > We discussed the race issue without _real_ workloads/requests because
> > > it's common sense that people might extend the syscall later.
> > >
> > > It is the same here. For current workloads, we don't need vector support
> > > from a performance point of view, based on my experiment. However, it was
> > > a rather limited experiment. Some configurations might have 10000+ vmas
> > > or a really slow CPU.
> > >
> > > Furthermore, I want to have vector support due to the atomicity issue,
> > > if it's really one we should consider.
> > > With vector support in the API and the vma configuration sequence number
> > > from Daniel, we could make operations on multiple address ranges atomic.
> >
> > I am not sure what you mean here. Perform all ranges atomically wrt.
> > other address space modifications? If so, I am not sure we want that
>
> Yup, I think it's *necessary* if we want to support destructive hints
> via process_madvise.

[Puts on flame-proof suit]

Here's a quick sketch of what I have in mind for process_getinfo(2).
Keep in mind that it's still just a rough idea.

We've had trouble efficiently learning about process and memory state
of the system via procfs. Android background memory-use scans (the
android.bg daemon) consume quite a bit of CPU time for PSS; Minchan's
done a lot of thinking about how we can specify desired page sets for
compaction as part of this patch set; and the full procfs walks that
some trace collection tools need to undertake take more than 200ms
(sometimes much more), due mostly to procfs iteration. ISTM we can
do better.

While procfs *works* on a functional level, it's inefficient due to
the splatting of information we want across several different files
(which need to be independently opened --- e.g.,
/proc/pid/oom_score_adj *and* /proc/pid/status), inefficient due to
the ad-hoc text formatting, inefficient due to information over-fetch,
and cumbersome due to the fundamental impedance mismatch between
filesystem APIs and process lifetimes. procfs itself is also optional,
which has caused various bits of awkwardness that you'll recall from
the pidfd discussions.

How about we solve the problem once and for all? I'm imagining a new
process_getinfo(2) that solves all of these problems at the same time.
I want something with a few properties:

1) if we want to learn M facts about N things, we enter the kernel
once and learn all M*N things,
2) the information we collect is self-consistent (which implies
atomicity most of the time),
3) we retrieve the information we want in an efficient binary format, and
4) we don't pay to learn anything not in M.

I've jotted down a quick sketch of the API below; I'm curious what
everyone else thinks. It'd basically look like this:

int process_getinfo(int nr_proc, int* proc, int flags, unsigned long
mask, void* out_buf, size_t* inout_sz)

We wouldn't use the return value for much: 0 on success and -1 on
error with errno set. NR_PROC and PROC together would specify the
objects we want to learn about, which would be either PIDs or PIDFDs
or maybe nothing at all if FLAGS tells us to inspect every process on
the system. MASK is a bitset of facts we want to learn, described
below. OUT_BUF and INOUT_SZ are for actually communicating the result.
On input, the caller would fill *INOUT_SZ with the size of the buffer
to which OUT_BUF points; on success, we'd fill *INOUT_SZ with the
number of bytes we actually used. If the output buffer is too small,
we'll truncate the result, fail the system call with E2BIG, and fill
*INOUT_SZ with the number of needed bytes, inviting the caller to try
again. (If a caller supplies something huge like a reusable 512KB
buffer on the first call, no reallocation and retries will be
necessary in practice on a typically-sized system.)
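
To make the sizing protocol concrete, a rough userspace sketch of the
retry loop might look like this (everything here is hypothetical: the
wrapper name, pid_a/pid_b, and the PROCESS_* mask bits are just the
proposal above spelled out):

#include <err.h>
#include <errno.h>
#include <stdlib.h>

/* Sketch only: fetch info for two pids, growing the buffer on E2BIG. */
void *fetch_info(int pid_a, int pid_b, size_t *szp)
{
  size_t sz = 512 * 1024;  /* usually big enough to avoid a retry */
  void *buf = malloc(sz);
  int pids[2] = { pid_a, pid_b };

  while (buf && process_getinfo(2, pids, 0,
                                PROCESS_BASIC | PROCESS_VM_MEMORY_MAP,
                                buf, &sz) < 0) {
    if (errno != E2BIG)
      err(1, "process_getinfo");
    buf = realloc(buf, sz);  /* kernel wrote the needed size into sz */
  }
  *szp = sz;
  return buf;  /* buffer starts with a struct process_info */
}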

The actual returned buffer is a collection of structures and data
blocks starting with a struct process_info. The structures in the
returned buffer sometimes contain "pointers" to other structures
encoded as byte offsets from the start of the information buffer.
Using offsets instead of actual pointers keeps the format the same
across 32- and 64-bit versions of process_getinfo and makes it
possible to relocate the returned information buffer with memcpy.

struct process_info {
  int first_procrec_offset;  // struct procrec*
  //  Examples of system-wide things we could ask for
  int mem_total_kb;
  int mem_free_kb;
  int mem_available_kb;
  char reserved[];
};

struct procrec {
  int next_procrec_offset;  // struct procrec*
  // Following fields are self-explanatory and are only examples of the
  // kind of information we could provide.
  int tid;
  int tgid;
  char status;
  int oom_score_adj;
  struct { int real, effective, saved, fs; } uids;
  int prio;
  int comm_offset;  // char*
  uint64_t rss_file_kb;
  uint64_t rss_anon_kb;
  uint64_t vm_seq;
  int first_procrec_vma_offset;  // struct procrec_vma*
  char reserved[];
};

struct procrec_vma {
  int next_procrec_vma_offset;  // struct procrec_vma*
  unsigned long start;
  unsigned long end;
  int backing_file_name_offset;  // char*
  int prot;
  char reserved[];
};

Callers would use the returned buffer by casting it to a struct
process_info and following the internal "pointers".
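
For illustration, walking that buffer could look roughly like this,
written against the hypothetical structs above (the INFO_PTR helper
just turns a stored offset back into a pointer, with offset zero
meaning NULL):

#include <stdio.h>

/* Resolve an offset stored in the buffer into a pointer (0 means NULL). */
#define INFO_PTR(base, off, type) \
  ((off) ? (type *)((char *)(base) + (off)) : (type *)0)

static void dump(struct process_info *info)
{
  struct procrec *p;
  struct procrec_vma *v;

  for (p = INFO_PTR(info, info->first_procrec_offset, struct procrec);
       p;
       p = INFO_PTR(info, p->next_procrec_offset, struct procrec)) {
    const char *comm = INFO_PTR(info, p->comm_offset, const char);

    printf("tid=%d comm=%s\n", p->tid, comm ? comm : "?");
    for (v = INFO_PTR(info, p->first_procrec_vma_offset,
                      struct procrec_vma);
         v;
         v = INFO_PTR(info, v->next_procrec_vma_offset,
                      struct procrec_vma))
      printf("  %lx-%lx\n", v->start, v->end);
  }
}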

MASK would specify which bits of information we wanted: for example,
if we asked for PROCESS_VM_MEMORY_MAP, the kernel would fill in each
struct procrec's memory_map field and have it point to a struct
procrec_vma in the returned output buffer. If we omitted
PROCESS_VM_MEMORY_MAP, we'd leave the memory_map field as NULL
(encoded as offset zero). The kernel would embed any strings (like
comm and VMA names) into the output buffer; the precise locations
would be unspecified so long as callers could find these fields via
output-buffer pointers.

Because all the structures are variable-length and are chained
together with explicit pointers (or offsets) instead of being stuffed
into a literal array, we can add additional fields to the output
structures any time we want without breaking binary compatibility.
Callers would tell the kernel that they're interested in the added
struct fields by asking for them via bits in MASK, and kernels that
don't understand those fields would just fail the system call with
EINVAL or something.
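
That EINVAL behavior also gives userspace a cheap feature probe,
roughly like this (PROCESS_SOME_NEW_FIELD is a made-up mask bit, and
other error handling is elided; same caveats as the sketch above):

  unsigned long mask = PROCESS_BASIC | PROCESS_SOME_NEW_FIELD;

  if (process_getinfo(1, &pid, 0, mask, buf, &sz) < 0 &&
      errno == EINVAL) {
    /* Older kernel: drop the bit it doesn't understand and retry. */
    mask &= ~PROCESS_SOME_NEW_FIELD;
    if (process_getinfo(1, &pid, 0, mask, buf, &sz) < 0)
      err(1, "process_getinfo");
  }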

Depending on how we call it, we can use this API as a bunch of different things:

1) Quick retrieval of system-wide memory counters, like /proc/meminfo
2) Quick snapshot of all process-thread identities (asking for the
wildcard TID match via FLAGS)
3) Fast enumeration of one process's address space
4) Collecting process-summary VM counters (e.g., rss_anon and
rss_file) for a set of processes
5) Retrieval of every VMA of every process on the system for debugging

We can do all of this with one entry into the kernel and without
opening any new file descriptors (unless we want to use pidfds as
inputs). We can also make this operation as atomic as we want, e.g.,
taking mmap_sem while looking at each process and taking tasklist_lock
so all the thread IDs line up with their processes. We don't
necessarily need to take the mmap_sems of all processes we care about
at the same time.

Since this isn't a filesystem-based API, we don't have to deal with
seq_file or deal with consistency issues arising from userspace
programs doing strange things like reading procfs files very slowly in
small chunks. Security-wise, we'd just use different access checks for
different requested information bits in MASK, maybe supplying "no
access" struct procrec entries if a caller doesn't happen to have
access to a particular process. I suppose we can talk about whether
access check failures should result in dummy values or syscall
failure: maybe callers should select which behavior they want.
Format-wise, we could also just return flatbuffer messages from the
kernel, but I suspect that we don't want flatbuffer in the kernel
right now. :-)

The API I'm proposing accepts an array of processes to inspect. We
could simplify it by accepting just one process and making the caller
enter the kernel once per process it wants to learn about, but this
simplification would make the API less useful for answering questions
like "What's the RSS of every process on the system right now?".

What do you think?
Michal Hocko May 30, 2019, 6:47 p.m. UTC | #14
On Thu 30-05-19 17:02:14, Minchan Kim wrote:
> On Thu, May 30, 2019 at 08:57:55AM +0200, Michal Hocko wrote:
> > On Thu 30-05-19 11:17:48, Minchan Kim wrote:
[...]
> > > At first, I didn't think about the atomicity of the address range race
> > > because MADV_COLD/PAGEOUT is not critical with respect to that race.
> > > However, you raised the atomicity issue because people could easily extend
> > > the hints to destructive ones. I agree with that, and that's why
> > > we discussed how to close the race, and Daniel came up with a good idea.
> > 
> > Just for clarification, I didn't really mean atomicity but rather
> > _consistency_ (essentially time-of-check to time-of-use consistency).
> 
> What do you mean by *consistency*? Could you elaborate?

That you operate on the object you obtained by some means. In other
words, that the range you want to call madvise on hasn't been
remapped/replaced by a different mmap operation.
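
To make that window concrete, a monitor built on the vm_seq field from
Daniel's sketch could at least detect a change between inspecting the
maps and acting on them. A minimal sketch, where get_vm_seq() and
issue_hints() are hypothetical helpers:

  do {
    uint64_t seq = get_vm_seq(pidfd);  /* e.g. read via process_getinfo */

    /* ...parse /proc/<pid>/maps and pick ranges to hint... */

    if (get_vm_seq(pidfd) != seq)
      continue;  /* address space changed under us: rescan */

    /* Note: a window remains unless the kernel itself rechecks
     * the sequence number inside process_madvise. */
    issue_hints(pidfd);
    break;
  } while (1);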

Patch

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index b9b51eeb8e1a..b8e230de84a6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -74,4 +74,9 @@ 
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
 				 PKEY_DISABLE_WRITE)
 
+struct pr_madvise_param {
+	int size;			/* the size of this structure */
+	const struct iovec __user *vec;	/* address range array */
+};
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index af02aa17e5c1..f4f569dac2bd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@  static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -380,9 +381,12 @@  static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 		ptep_test_and_clear_young(vma, addr, pte);
 		deactivate_page(page);
+		nr_pages++;
+
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
+	*(long *)walk->private += nr_pages;
 	cond_resched();
 
 	return 0;
@@ -390,11 +394,13 @@  static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 static void madvise_cool_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
+			     unsigned long addr, unsigned long end,
+			     long *nr_pages)
 {
 	struct mm_walk cool_walk = {
 		.pmd_entry = madvise_cool_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -403,7 +409,8 @@  static void madvise_cool_page_range(struct mmu_gather *tlb,
 }
 
 static long madvise_cool(struct vm_area_struct *vma,
-			unsigned long start_addr, unsigned long end_addr)
+			unsigned long start_addr, unsigned long end_addr,
+			long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -413,7 +420,7 @@  static long madvise_cool(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -429,6 +436,7 @@  static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	int isolated = 0;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -492,7 +500,7 @@  static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 		list_add(&page->lru, &page_list);
 		if (isolated >= SWAP_CLUSTER_MAX) {
 			pte_unmap_unlock(orig_pte, ptl);
-			reclaim_pages(&page_list);
+			nr_pages += reclaim_pages(&page_list);
 			isolated = 0;
 			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 			orig_pte = pte;
@@ -500,19 +508,22 @@  static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
-	reclaim_pages(&page_list);
+	nr_pages += reclaim_pages(&page_list);
 	cond_resched();
 
+	*(long *)walk->private += nr_pages;
 	return 0;
 }
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
+			     unsigned long addr, unsigned long end,
+			     long *nr_pages)
 {
 	struct mm_walk warm_walk = {
 		.pmd_entry = madvise_cold_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages,
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -522,7 +533,8 @@  static void madvise_cold_page_range(struct mmu_gather *tlb,
 
 
 static long madvise_cold(struct vm_area_struct *vma,
-			unsigned long start_addr, unsigned long end_addr)
+			unsigned long start_addr, unsigned long end_addr,
+			long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -532,7 +544,7 @@  static long madvise_cold(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -922,7 +934,7 @@  static int madvise_inject_error(int behavior,
 static long
 madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
 		struct vm_area_struct **prev, unsigned long start,
-		unsigned long end, int behavior)
+		unsigned long end, int behavior, long *nr_pages)
 {
 	switch (behavior) {
 	case MADV_REMOVE:
@@ -930,9 +942,9 @@  madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COOL:
-		return madvise_cool(vma, start, end);
+		return madvise_cool(vma, start, end, nr_pages);
 	case MADV_COLD:
-		return madvise_cold(vma, start, end);
+		return madvise_cold(vma, start, end, nr_pages);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(tsk, vma, prev, start,
@@ -981,7 +993,7 @@  madvise_behavior_valid(int behavior)
 }
 
 static int madvise_core(struct task_struct *tsk, unsigned long start,
-			size_t len_in, int behavior)
+			size_t len_in, int behavior, long *nr_pages)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -996,6 +1008,7 @@  static int madvise_core(struct task_struct *tsk, unsigned long start,
 
 	if (start & ~PAGE_MASK)
 		return error;
+
 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 
 	/* Check to see whether len was rounded up from small -ve to zero */
@@ -1035,6 +1048,8 @@  static int madvise_core(struct task_struct *tsk, unsigned long start,
 	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
+		long pages = 0;
+
 		error = -ENOMEM;
 		if (!vma)
 			goto out;
@@ -1053,9 +1068,11 @@  static int madvise_core(struct task_struct *tsk, unsigned long start,
 			tmp = end;
 
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
+		error = madvise_vma(tsk, vma, &prev, start, tmp,
+					behavior, &pages);
 		if (error)
 			goto out;
+		*nr_pages += pages;
 		start = tmp;
 		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
@@ -1140,26 +1157,137 @@  static int madvise_core(struct task_struct *tsk, unsigned long start,
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	return madvise_core(current, start, len_in, behavior);
+	long dummy;
+
+	return madvise_core(current, start, len_in, behavior, &dummy);
 }
 
-SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
-		size_t, len_in, int, behavior)
+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
+		struct pr_madvise_param *param)
+{
+	u32 size;
+	int ret;
+
+	memset(param, 0, sizeof(*param));
+
+	ret = get_user(size, &u_param->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	if (!size || size > sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	ret = copy_from_user(param, u_param, size);
+	if (ret)
+		return -EFAULT;
+
+	return ret;
+}
+
+static int process_madvise_core(struct task_struct *tsk, int *behaviors,
+				struct iov_iter *iter,
+				const struct iovec *range_vec,
+				unsigned long riovcnt,
+				unsigned long flags)
+{
+	int i;
+	long err;
+
+	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
+		long ret = 0;
+
+		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
+				range_vec[i].iov_len, behaviors[i],
+				&ret);
+		if (err)
+			ret = err;
+
+		if (copy_to_iter(&ret, sizeof(long), iter) !=
+				sizeof(long)) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = 0;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
+			const int __user *, hints,
+			struct pr_madvise_param __user *, results,
+			struct pr_madvise_param __user *, ranges,
+			unsigned long, flags)
 {
 	int ret;
 	struct fd f;
 	struct pid *pid;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
+	struct pr_madvise_param result_p, range_p;
+	const struct iovec __user *result_vec, __user *range_vec;
+	int *behaviors;
+	struct iovec iovstack_result[UIO_FASTIOV];
+	struct iovec iovstack_r[UIO_FASTIOV];
+	struct iovec *iov_l = iovstack_result;
+	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	ret = pr_madvise_copy_param(results, &result_p);
+	if (ret)
+		return ret;
+
+	ret = pr_madvise_copy_param(ranges, &range_p);
+	if (ret)
+		return ret;
+
+	result_vec = result_p.vec;
+	range_vec = range_p.vec;
+
+	if (result_p.size != sizeof(struct pr_madvise_param) ||
+			range_p.size != sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
+	if (!behaviors)
+		return -ENOMEM;
+
+	if (copy_from_user(behaviors, hints, sizeof(int) * nr_elem)) {
+		ret = -EFAULT;
+		goto free_behavior_vec;
+	}
+
+	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
+				&iov_l, &iter);
+	if (ret < 0)
+		goto free_behavior_vec;
+
+	if (!iov_iter_count(&iter)) {
+		ret = -EINVAL;
+		goto free_iovecs;
+	}
+
+	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
+				UIO_FASTIOV, iovstack_r, &iov_r);
+	if (ret <= 0)
+		goto free_iovecs;
 
 	f = fdget(pidfd);
-	if (!f.file)
-		return -EBADF;
+	if (!f.file) {
+		ret = -EBADF;
+		goto free_iovecs;
+	}
 
 	pid = pidfd_to_pid(f.file);
 	if (IS_ERR(pid)) {
 		ret = PTR_ERR(pid);
-		goto err;
+		goto put_fd;
 	}
 
 	ret = -EINVAL;
@@ -1167,7 +1295,7 @@  SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 	tsk = pid_task(pid, PIDTYPE_PID);
 	if (!tsk) {
 		rcu_read_unlock();
-		goto err;
+		goto put_fd;
 	}
 	get_task_struct(tsk);
 	rcu_read_unlock();
@@ -1176,12 +1304,22 @@  SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
 		if (ret == -EACCES)
 			ret = -EPERM;
-		goto err;
+		goto put_task;
 	}
-	ret = madvise_core(tsk, start, len_in, behavior);
+
+	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
+					nr_elem, flags);
 	mmput(mm);
+put_task:
 	put_task_struct(tsk);
-err:
+put_fd:
 	fdput(f);
+free_iovecs:
+	if (iov_r != iovstack_r)
+		kfree(iov_r);
+	kfree(iov_l);
+free_behavior_vec:
+	kfree(behaviors);
+
 	return ret;
 }