[2/9] mm: Provide new get_vaddr_frames() helper
diff mbox

Message ID 1430897296-5469-3-git-send-email-jack@suse.cz
State New
Headers show

Commit Message

Jan Kara May 6, 2015, 7:28 a.m. UTC
Provide new function get_vaddr_frames().  This function maps virtual
addresses from given start and fills given array with page frame numbers of
the corresponding pages. If given start belongs to a normal vma, the function
grabs reference to each of the pages to pin them in memory. If start
belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
must make sure pfns aren't reused for anything else while he is using
them.

This function is created for various drivers to simplify handling of
their buffers.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/mm.h |  44 +++++++++++
 mm/gup.c           | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)

Comments

Vlastimil Babka May 6, 2015, 10:45 a.m. UTC | #1
On 05/06/2015 09:28 AM, Jan Kara wrote:
> Provide new function get_vaddr_frames().  This function maps virtual
> addresses from given start and fills given array with page frame numbers of
> the corresponding pages. If given start belongs to a normal vma, the function
> grabs reference to each of the pages to pin them in memory. If start
> belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
> must make sure pfns aren't reused for anything else while he is using
> them.
>
> This function is created for various drivers to simplify handling of
> their buffers.
>
> Signed-off-by: Jan Kara <jack@suse.cz>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

With some nitpicks below...

> ---
>   include/linux/mm.h |  44 +++++++++++
>   mm/gup.c           | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 258 insertions(+)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0755b9fd03a7..dcd1f02a78e9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -20,6 +20,7 @@
>   #include <linux/shrinker.h>
>   #include <linux/resource.h>
>   #include <linux/page_ext.h>
> +#include <linux/err.h>
>
>   struct mempolicy;
>   struct anon_vma;
> @@ -1197,6 +1198,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
>   		    int write, int force, struct page **pages);
>   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>   			struct page **pages);
> +
> +/* Container for pinned pfns / pages */
> +struct frame_vector {
> +	unsigned int nr_allocated;	/* Number of frames we have space for */
> +	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
> +	bool got_ref;		/* Did we pin pages by getting page ref? */
> +	bool is_pfns;		/* Does array contain pages or pfns? */
> +	void *ptrs[0];		/* Array of pinned pfns / pages. Use
> +				 * pfns_vector_pages() or pfns_vector_pfns()
> +				 * for access */
> +};
> +
> +struct frame_vector *frame_vector_create(unsigned int nr_frames);
> +void frame_vector_destroy(struct frame_vector *vec);
> +int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
> +		     bool write, bool force, struct frame_vector *vec);
> +void put_vaddr_frames(struct frame_vector *vec);
> +int frame_vector_to_pages(struct frame_vector *vec);
> +void frame_vector_to_pfns(struct frame_vector *vec);
> +
> +static inline unsigned int frame_vector_count(struct frame_vector *vec)
> +{
> +	return vec->nr_frames;
> +}
> +
> +static inline struct page **frame_vector_pages(struct frame_vector *vec)
> +{
> +	if (vec->is_pfns) {
> +		int err = frame_vector_to_pages(vec);
> +
> +		if (err)
> +			return ERR_PTR(err);
> +	}
> +	return (struct page **)(vec->ptrs);
> +}
> +
> +static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
> +{
> +	if (!vec->is_pfns)
> +		frame_vector_to_pfns(vec);
> +	return (unsigned long *)(vec->ptrs);
> +}
> +
>   struct kvec;
>   int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
>   			struct page **pages);
> diff --git a/mm/gup.c b/mm/gup.c
> index 6297f6bccfb1..8db5c40e65c4 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -8,6 +8,7 @@
>   #include <linux/rmap.h>
>   #include <linux/swap.h>
>   #include <linux/swapops.h>
> +#include <linux/vmalloc.h>
>
>   #include <linux/sched.h>
>   #include <linux/rwsem.h>
> @@ -936,6 +937,219 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>   	return ret;	/* 0 or negative error code */
>   }
>
> +/*
> + * get_vaddr_frames() - map virtual addresses to pfns
> + * @start:	starting user address
> + * @nr_frames:	number of pages / pfns from start to map
> + * @write:	whether pages will be written to by the caller
> + * @force:	whether to force write access even if user mapping is
> + *		readonly. This will result in the page being COWed even
> + *		in MAP_SHARED mappings. You do not want this.

"You do not want this" and yet some of the drivers in later patches do. 
That looks weird. Explain better?

> + * @vec:	structure which receives pages / pfns of the addresses mapped.
> + *		It should have space for at least nr_frames entries.
> + *
> + * This function maps virtual addresses from @start and fills @vec structure
> + * with page frame numbers or page pointers to corresponding pages (choice
> + * depends on the type of the vma underlying the virtual address). If @start
> + * belongs to a normal vma, the function grabs reference to each of the pages
> + * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
> + * touch page structures and the caller must make sure pfns aren't reused for
> + * anything else while he is using them.
> + *
> + * The function returns number of pages mapped which may be less than
> + * @nr_frames. In particular we stop mapping if there are more vmas of
> + * different type underlying the specified range of virtual addresses.

The function can also return e.g. -EFAULT, shouldn't that be documented too?

> + *
> + * This function takes care of grabbing mmap_sem as necessary.
> + */
> +int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
> +		     bool write, bool force, struct frame_vector *vec)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	int ret = 0;
> +	int err;
> +	int locked = 1;

It looks strange that locked is set to 1 before taking the actual 
mmap_sem. Also future-proofing.

> +
> +	if (nr_frames == 0)
> +		return 0;
> +
> +	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
> +		nr_frames = vec->nr_allocated;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, start, start + 1);
> +	if (!vma) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
> +		vec->got_ref = 1;
> +		vec->is_pfns = 0;
> +		ret = get_user_pages_locked(current, mm, start, nr_frames,
> +			write, force, (struct page **)(vec->ptrs), &locked);
> +		goto out;
> +	}
> +
> +	vec->got_ref = 0;
> +	vec->is_pfns = 1;
> +	do {
> +		unsigned long *nums = frame_vector_pfns(vec);
> +
> +		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
> +			err = follow_pfn(vma, start, &nums[ret]);
> +			if (err) {
> +				if (ret == 0)
> +					ret = err;
> +				goto out;
> +			}
> +			start += PAGE_SIZE;
> +			ret++;
> +		}
> +		/*
> +		 * We stop if we have enough pages or if VMA doesn't completely
> +		 * cover the tail page.
> +		 */
> +		if (ret >= nr_frames || start < vma->vm_end)
> +			break;
> +		vma = find_vma_intersection(mm, start, start + 1);
> +	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
> +out:
> +	if (locked)
> +		up_read(&mm->mmap_sem);
> +	if (!ret)
> +		ret = -EFAULT;
> +	if (ret > 0)
> +		vec->nr_frames = ret;
> +	return ret;
> +}
> +EXPORT_SYMBOL(get_vaddr_frames);
> +
> +/**
> + * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
> + *			them
> + * @vec:	frame vector to put
> + *
> + * Drop references to pages if get_vaddr_frames() acquired them. We also
> + * invalidate the frame vector so that it is prepared for the next call into
> + * get_vaddr_frames().
> + */
> +void put_vaddr_frames(struct frame_vector *vec)
> +{
> +	int i;
> +	struct page **pages;
> +
> +	if (!vec->got_ref)
> +		goto out;
> +	pages = frame_vector_pages(vec);
> +	/*
> +	 * frame_vector_pages() might needed to do a conversion when we

                                                           remove "we" ^

> +	 * get_vaddr_frames() got pages but vec was later converted to pfns.
> +	 * But it shouldn't really fail to convert pfns back...
> +	 */
> +	BUG_ON(IS_ERR(pages));
> +	for (i = 0; i < vec->nr_frames; i++)
> +		put_page(pages[i]);
> +	vec->got_ref = 0;
> +out:
> +	vec->nr_frames = 0;
> +}
> +EXPORT_SYMBOL(put_vaddr_frames);
> +
> +/**
> + * frame_vector_to_pages - convert frame vector to contain page pointers
> + * @vec:	frame vector to convert
> + *
> + * Convert @vec to contain array of page pointers.  If the conversion is
> + * successful, return 0. Otherwise return an error.
> + */
> +int frame_vector_to_pages(struct frame_vector *vec)
> +{
> +	int i;
> +	unsigned long *nums;
> +	struct page **pages;
> +
> +	if (!vec->is_pfns)
> +		return 0;
> +	nums = frame_vector_pfns(vec);
> +	for (i = 0; i < vec->nr_frames; i++)
> +		if (!pfn_valid(nums[i]))
> +			return -EINVAL;
> +	pages = (struct page **)nums;
> +	for (i = 0; i < vec->nr_frames; i++)
> +		pages[i] = pfn_to_page(nums[i]);
> +	vec->is_pfns = 0;
> +	return 0;
> +}
> +EXPORT_SYMBOL(frame_vector_to_pages);
> +
> +/**
> + * frame_vector_to_pfns - convert frame vector to contain pfns
> + * @vec:	frame vector to convert
> + *
> + * Convert @vec to contain array of pfns.
> + */
> +void frame_vector_to_pfns(struct frame_vector *vec)
> +{
> +	int i;
> +	unsigned long *nums;
> +	struct page **pages;
> +
> +	if (vec->is_pfns)
> +		return;
> +	pages = (struct page **)(vec->ptrs);
> +	nums = (unsigned long *)pages;
> +	for (i = 0; i < vec->nr_frames; i++)
> +		nums[i] = page_to_pfn(pages[i]);
> +	vec->is_pfns = 1;
> +}
> +EXPORT_SYMBOL(frame_vector_to_pfns);
> +
> +/**
> + * frame_vector_create() - allocate & initialize structure for pinned pfns
> + * @nr_frames:	number of pfns slots we should reserve
> + *
> + * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
> + * pfns.
> + */
> +struct frame_vector *frame_vector_create(unsigned int nr_frames)
> +{
> +	struct frame_vector *vec;
> +	int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
> +
> +	if (WARN_ON_ONCE(nr_frames == 0))
> +		return NULL;
> +	/*
> +	 * Avoid higher order allocations, use vmalloc instead. It should
> +	 * be rare anyway.
> +	 */
> +	if (size <= PAGE_SIZE)
> +		vec = kmalloc(size, GFP_KERNEL);
> +	else
> +		vec = vmalloc(size);
> +	if (!vec)
> +		return NULL;
> +	vec->nr_allocated = nr_frames;
> +	vec->nr_frames = 0;
> +	return vec;
> +}
> +EXPORT_SYMBOL(frame_vector_create);
> +
> +/**
> + * frame_vector_destroy() - free memory allocated to carry frame vector
> + * @vec:	Frame vector to free
> + *
> + * Free structure allocated by frame_vector_create() to carry frames.
> + */
> +void frame_vector_destroy(struct frame_vector *vec)
> +{

Add some VM_BUG_ON()'s for nr_frames and got_ref perhaps?

> +	if (!is_vmalloc_addr(vec))
> +		kfree(vec);
> +	else
> +		vfree(vec);
> +}
> +EXPORT_SYMBOL(frame_vector_destroy);
> +
>   /**
>    * get_dump_page() - pin user page in memory while writing it to core dump
>    * @addr: user address
>
Jan Kara May 6, 2015, 2:58 p.m. UTC | #2
On Wed 06-05-15 12:45:51, Vlastimil Babka wrote:
> On 05/06/2015 09:28 AM, Jan Kara wrote:
> >Provide new function get_vaddr_frames().  This function maps virtual
> >addresses from given start and fills given array with page frame numbers of
> >the corresponding pages. If given start belongs to a normal vma, the function
> >grabs reference to each of the pages to pin them in memory. If start
> >belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
> >must make sure pfns aren't reused for anything else while he is using
> >them.
> >
> >This function is created for various drivers to simplify handling of
> >their buffers.
> >
> >Signed-off-by: Jan Kara <jack@suse.cz>
> 
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> 
> With some nitpicks below...
> 
> >---
> >  include/linux/mm.h |  44 +++++++++++
> >  mm/gup.c           | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 258 insertions(+)
> >
> >diff --git a/include/linux/mm.h b/include/linux/mm.h
> >index 0755b9fd03a7..dcd1f02a78e9 100644
> >--- a/include/linux/mm.h
> >+++ b/include/linux/mm.h
> >@@ -20,6 +20,7 @@
> >  #include <linux/shrinker.h>
> >  #include <linux/resource.h>
> >  #include <linux/page_ext.h>
> >+#include <linux/err.h>
> >
> >  struct mempolicy;
> >  struct anon_vma;
> >@@ -1197,6 +1198,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
> >  		    int write, int force, struct page **pages);
> >  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >  			struct page **pages);
> >+
> >+/* Container for pinned pfns / pages */
> >+struct frame_vector {
> >+	unsigned int nr_allocated;	/* Number of frames we have space for */
> >+	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
> >+	bool got_ref;		/* Did we pin pages by getting page ref? */
> >+	bool is_pfns;		/* Does array contain pages or pfns? */
> >+	void *ptrs[0];		/* Array of pinned pfns / pages. Use
> >+				 * pfns_vector_pages() or pfns_vector_pfns()
> >+				 * for access */
> >+};
> >+
> >+struct frame_vector *frame_vector_create(unsigned int nr_frames);
> >+void frame_vector_destroy(struct frame_vector *vec);
> >+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
> >+		     bool write, bool force, struct frame_vector *vec);
> >+void put_vaddr_frames(struct frame_vector *vec);
> >+int frame_vector_to_pages(struct frame_vector *vec);
> >+void frame_vector_to_pfns(struct frame_vector *vec);
> >+
> >+static inline unsigned int frame_vector_count(struct frame_vector *vec)
> >+{
> >+	return vec->nr_frames;
> >+}
> >+
> >+static inline struct page **frame_vector_pages(struct frame_vector *vec)
> >+{
> >+	if (vec->is_pfns) {
> >+		int err = frame_vector_to_pages(vec);
> >+
> >+		if (err)
> >+			return ERR_PTR(err);
> >+	}
> >+	return (struct page **)(vec->ptrs);
> >+}
> >+
> >+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
> >+{
> >+	if (!vec->is_pfns)
> >+		frame_vector_to_pfns(vec);
> >+	return (unsigned long *)(vec->ptrs);
> >+}
> >+
> >  struct kvec;
> >  int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
> >  			struct page **pages);
> >diff --git a/mm/gup.c b/mm/gup.c
> >index 6297f6bccfb1..8db5c40e65c4 100644
> >--- a/mm/gup.c
> >+++ b/mm/gup.c
> >@@ -8,6 +8,7 @@
> >  #include <linux/rmap.h>
> >  #include <linux/swap.h>
> >  #include <linux/swapops.h>
> >+#include <linux/vmalloc.h>
> >
> >  #include <linux/sched.h>
> >  #include <linux/rwsem.h>
> >@@ -936,6 +937,219 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> >  	return ret;	/* 0 or negative error code */
> >  }
> >
> >+/*
> >+ * get_vaddr_frames() - map virtual addresses to pfns
> >+ * @start:	starting user address
> >+ * @nr_frames:	number of pages / pfns from start to map
> >+ * @write:	whether pages will be written to by the caller
> >+ * @force:	whether to force write access even if user mapping is
> >+ *		readonly. This will result in the page being COWed even
> >+ *		in MAP_SHARED mappings. You do not want this.
> 
> "You do not want this" and yet some of the drivers in later patches
> do. That looks weird. Explain better?
  That was copied from get_user_pages() comment but that got changed in the
mean time. I've referenced that instead of copying it.

> >+ * @vec:	structure which receives pages / pfns of the addresses mapped.
> >+ *		It should have space for at least nr_frames entries.
> >+ *
> >+ * This function maps virtual addresses from @start and fills @vec structure
> >+ * with page frame numbers or page pointers to corresponding pages (choice
> >+ * depends on the type of the vma underlying the virtual address). If @start
> >+ * belongs to a normal vma, the function grabs reference to each of the pages
> >+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
> >+ * touch page structures and the caller must make sure pfns aren't reused for
> >+ * anything else while he is using them.
> >+ *
> >+ * The function returns number of pages mapped which may be less than
> >+ * @nr_frames. In particular we stop mapping if there are more vmas of
> >+ * different type underlying the specified range of virtual addresses.
> 
> The function can also return e.g. -EFAULT, shouldn't that be documented too?
  I've added a comment that the function can return error.

> >+ *
> >+ * This function takes care of grabbing mmap_sem as necessary.
> >+ */
> >+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
> >+		     bool write, bool force, struct frame_vector *vec)
> >+{
> >+	struct mm_struct *mm = current->mm;
> >+	struct vm_area_struct *vma;
> >+	int ret = 0;
> >+	int err;
> >+	int locked = 1;
> 
> It looks strange that locked is set to 1 before taking the actual
> mmap_sem. Also future-proofing.
  OK.

> >+/**
> >+ * frame_vector_destroy() - free memory allocated to carry frame vector
> >+ * @vec:	Frame vector to free
> >+ *
> >+ * Free structure allocated by frame_vector_create() to carry frames.
> >+ */
> >+void frame_vector_destroy(struct frame_vector *vec)
> >+{
> 
> Add some VM_BUG_ON()'s for nr_frames and got_ref perhaps?
  Good idea. Added:
VM_BUG_ON(vec->nr_frames > 0);
  Asserting got_ref when nr_frames is 0 is pointless... Thanks for review!

								Honza
Mel Gorman May 8, 2015, 2:49 p.m. UTC | #3
On Wed, May 06, 2015 at 09:28:09AM +0200, Jan Kara wrote:
> Provide new function get_vaddr_frames().  This function maps virtual
> addresses from given start and fills given array with page frame numbers of
> the corresponding pages. If given start belongs to a normal vma, the function
> grabs reference to each of the pages to pin them in memory. If start
> belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
> must make sure pfns aren't reused for anything else while he is using
> them.
> 
> This function is created for various drivers to simplify handling of
> their buffers.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

Trivial comments only;

> ---
>  include/linux/mm.h |  44 +++++++++++
>  mm/gup.c           | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 258 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0755b9fd03a7..dcd1f02a78e9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -20,6 +20,7 @@
>  #include <linux/shrinker.h>
>  #include <linux/resource.h>
>  #include <linux/page_ext.h>
> +#include <linux/err.h>
>  
>  struct mempolicy;
>  struct anon_vma;
> @@ -1197,6 +1198,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
>  		    int write, int force, struct page **pages);
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages);
> +
> +/* Container for pinned pfns / pages */
> +struct frame_vector {
> +	unsigned int nr_allocated;	/* Number of frames we have space for */
> +	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
> +	bool got_ref;		/* Did we pin pages by getting page ref? */
> +	bool is_pfns;		/* Does array contain pages or pfns? */
> +	void *ptrs[0];		/* Array of pinned pfns / pages. Use
> +				 * pfns_vector_pages() or pfns_vector_pfns()
> +				 * for access */
> +};
> +
> +struct frame_vector *frame_vector_create(unsigned int nr_frames);
> +void frame_vector_destroy(struct frame_vector *vec);
> +int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
> +		     bool write, bool force, struct frame_vector *vec);
> +void put_vaddr_frames(struct frame_vector *vec);
> +int frame_vector_to_pages(struct frame_vector *vec);
> +void frame_vector_to_pfns(struct frame_vector *vec);
> +
> +static inline unsigned int frame_vector_count(struct frame_vector *vec)
> +{
> +	return vec->nr_frames;
> +}
> +
> +static inline struct page **frame_vector_pages(struct frame_vector *vec)
> +{
> +	if (vec->is_pfns) {
> +		int err = frame_vector_to_pages(vec);
> +
> +		if (err)
> +			return ERR_PTR(err);
> +	}
> +	return (struct page **)(vec->ptrs);
> +}
> +
> +static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
> +{
> +	if (!vec->is_pfns)
> +		frame_vector_to_pfns(vec);
> +	return (unsigned long *)(vec->ptrs);
> +}
> +
>  struct kvec;
>  int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
>  			struct page **pages);
> diff --git a/mm/gup.c b/mm/gup.c
> index 6297f6bccfb1..8db5c40e65c4 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -8,6 +8,7 @@
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
>  #include <linux/swapops.h>
> +#include <linux/vmalloc.h>
>  
>  #include <linux/sched.h>
>  #include <linux/rwsem.h>
> @@ -936,6 +937,219 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>  	return ret;	/* 0 or negative error code */
>  }
>  
> +/*
> + * get_vaddr_frames() - map virtual addresses to pfns
> + * @start:	starting user address
> + * @nr_frames:	number of pages / pfns from start to map
> + * @write:	whether pages will be written to by the caller
> + * @force:	whether to force write access even if user mapping is
> + *		readonly. This will result in the page being COWed even
> + *		in MAP_SHARED mappings. You do not want this.

If they don't want it, why does it exist?

> + * @vec:	structure which receives pages / pfns of the addresses mapped.
> + *		It should have space for at least nr_frames entries.
> + *
> + * This function maps virtual addresses from @start and fills @vec structure
> + * with page frame numbers or page pointers to corresponding pages (choice
> + * depends on the type of the vma underlying the virtual address). If @start
> + * belongs to a normal vma, the function grabs reference to each of the pages
> + * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
> + * touch page structures and the caller must make sure pfns aren't reused for
> + * anything else while he is using them.
> + *
> + * The function returns number of pages mapped which may be less than
> + * @nr_frames. In particular we stop mapping if there are more vmas of
> + * different type underlying the specified range of virtual addresses.
> + *
> + * This function takes care of grabbing mmap_sem as necessary.
> + */
> +int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
> +		     bool write, bool force, struct frame_vector *vec)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	int ret = 0;
> +	int err;
> +	int locked = 1;
> +

bool locked.

> +	if (nr_frames == 0)
> +		return 0;
> +
> +	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
> +		nr_frames = vec->nr_allocated;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, start, start + 1);
> +	if (!vma) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
> +		vec->got_ref = 1;
> +		vec->is_pfns = 0;

They're bools and while correct, it looks weird. There are a few
instances of this but I won't comment on it again.

> +		ret = get_user_pages_locked(current, mm, start, nr_frames,
> +			write, force, (struct page **)(vec->ptrs), &locked);
> +		goto out;
> +	}
> +
> +	vec->got_ref = 0;
> +	vec->is_pfns = 1;
> +	do {
> +		unsigned long *nums = frame_vector_pfns(vec);
> +
> +		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
> +			err = follow_pfn(vma, start, &nums[ret]);
> +			if (err) {
> +				if (ret == 0)
> +					ret = err;
> +				goto out;
> +			}
> +			start += PAGE_SIZE;
> +			ret++;
> +		}
> +		/*
> +		 * We stop if we have enough pages or if VMA doesn't completely
> +		 * cover the tail page.
> +		 */
> +		if (ret >= nr_frames || start < vma->vm_end)
> +			break;
> +		vma = find_vma_intersection(mm, start, start + 1);
> +	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
> +out:
> +	if (locked)
> +		up_read(&mm->mmap_sem);
> +	if (!ret)
> +		ret = -EFAULT;
> +	if (ret > 0)
> +		vec->nr_frames = ret;
> +	return ret;
> +}
> +EXPORT_SYMBOL(get_vaddr_frames);
> +
> +/**
> + * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
> + *			them
> + * @vec:	frame vector to put
> + *
> + * Drop references to pages if get_vaddr_frames() acquired them. We also
> + * invalidate the frame vector so that it is prepared for the next call into
> + * get_vaddr_frames().
> + */
> +void put_vaddr_frames(struct frame_vector *vec)
> +{
> +	int i;
> +	struct page **pages;
> +
> +	if (!vec->got_ref)
> +		goto out;
> +	pages = frame_vector_pages(vec);
> +	/*
> +	 * frame_vector_pages() might needed to do a conversion when we
> +	 * get_vaddr_frames() got pages but vec was later converted to pfns.
> +	 * But it shouldn't really fail to convert pfns back...
> +	 */
> +	BUG_ON(IS_ERR(pages));

It's hard to see how it could but it is recoverable so BUG_ON is
overkill. WARN_ON and return even though we potentially leaked now.

> +	for (i = 0; i < vec->nr_frames; i++)
> +		put_page(pages[i]);
> +	vec->got_ref = 0;
> +out:
> +	vec->nr_frames = 0;
> +}
> +EXPORT_SYMBOL(put_vaddr_frames);
> +
> +/**
> + * frame_vector_to_pages - convert frame vector to contain page pointers
> + * @vec:	frame vector to convert
> + *
> + * Convert @vec to contain array of page pointers.  If the conversion is
> + * successful, return 0. Otherwise return an error.
> + */

If you do another version, it would not hurt to mention that a page
reference is not taken when doing this conversion.

> +int frame_vector_to_pages(struct frame_vector *vec)
> +{

I think it's probably best to make the relevant counters in frame_vector
signed and limit the maximum possible size of it. It's still not putting
any practical limit on the size of the frame_vector.
Jan Kara May 11, 2015, 2 p.m. UTC | #4
On Fri 08-05-15 15:49:22, Mel Gorman wrote:
> On Wed, May 06, 2015 at 09:28:09AM +0200, Jan Kara wrote:
> > Provide new function get_vaddr_frames().  This function maps virtual
> > addresses from given start and fills given array with page frame numbers of
> > the corresponding pages. If given start belongs to a normal vma, the function
> > grabs reference to each of the pages to pin them in memory. If start
> > belongs to VM_IO | VM_PFNMAP vma, we don't touch page structures. Caller
> > must make sure pfns aren't reused for anything else while he is using
> > them.
> > 
> > This function is created for various drivers to simplify handling of
> > their buffers.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> 
> Trivial comments only;
...
> > @@ -936,6 +937,219 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> >  	return ret;	/* 0 or negative error code */
> >  }
> >  
> > +/*
> > + * get_vaddr_frames() - map virtual addresses to pfns
> > + * @start:	starting user address
> > + * @nr_frames:	number of pages / pfns from start to map
> > + * @write:	whether pages will be written to by the caller
> > + * @force:	whether to force write access even if user mapping is
> > + *		readonly. This will result in the page being COWed even
> > + *		in MAP_SHARED mappings. You do not want this.
> 
> If they don't want it, why does it exist?
  This was copied from get_user_pages(). The original comment changed in
the mean time so I've updated this one somewhat and referenced comment at
get_user_pages() for details.

> > + * @vec:	structure which receives pages / pfns of the addresses mapped.
> > + *		It should have space for at least nr_frames entries.
> > + *
> > + * This function maps virtual addresses from @start and fills @vec structure
> > + * with page frame numbers or page pointers to corresponding pages (choice
> > + * depends on the type of the vma underlying the virtual address). If @start
> > + * belongs to a normal vma, the function grabs reference to each of the pages
> > + * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
> > + * touch page structures and the caller must make sure pfns aren't reused for
> > + * anything else while he is using them.
> > + *
> > + * The function returns number of pages mapped which may be less than
> > + * @nr_frames. In particular we stop mapping if there are more vmas of
> > + * different type underlying the specified range of virtual addresses.
> > + *
> > + * This function takes care of grabbing mmap_sem as necessary.
> > + */
> > +int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
> > +		     bool write, bool force, struct frame_vector *vec)
> > +{
> > +	struct mm_struct *mm = current->mm;
> > +	struct vm_area_struct *vma;
> > +	int ret = 0;
> > +	int err;
> > +	int locked = 1;
> > +
> 
> bool locked.
  It cannot be bool. It is passed to get_user_pages_locked() which expects
int *.

> > +	if (nr_frames == 0)
> > +		return 0;
> > +
> > +	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
> > +		nr_frames = vec->nr_allocated;
> > +
> > +	down_read(&mm->mmap_sem);
> > +	vma = find_vma_intersection(mm, start, start + 1);
> > +	if (!vma) {
> > +		ret = -EFAULT;
> > +		goto out;
> > +	}
> > +	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
> > +		vec->got_ref = 1;
> > +		vec->is_pfns = 0;
> 
> They're bools and while correct, it looks weird. There are a few
> instances of this but I won't comment on it again.
  Thanks. Fixed.

> > +		ret = get_user_pages_locked(current, mm, start, nr_frames,
> > +			write, force, (struct page **)(vec->ptrs), &locked);
> > +		goto out;
> > +	}
> > +
> > +	vec->got_ref = 0;
> > +	vec->is_pfns = 1;
> > +	do {
> > +		unsigned long *nums = frame_vector_pfns(vec);
> > +
> > +		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
> > +			err = follow_pfn(vma, start, &nums[ret]);
> > +			if (err) {
> > +				if (ret == 0)
> > +					ret = err;
> > +				goto out;
> > +			}
> > +			start += PAGE_SIZE;
> > +			ret++;
> > +		}
> > +		/*
> > +		 * We stop if we have enough pages or if VMA doesn't completely
> > +		 * cover the tail page.
> > +		 */
> > +		if (ret >= nr_frames || start < vma->vm_end)
> > +			break;
> > +		vma = find_vma_intersection(mm, start, start + 1);
> > +	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
> > +out:
> > +	if (locked)
> > +		up_read(&mm->mmap_sem);
> > +	if (!ret)
> > +		ret = -EFAULT;
> > +	if (ret > 0)
> > +		vec->nr_frames = ret;
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(get_vaddr_frames);
> > +
> > +/**
> > + * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
> > + *			them
> > + * @vec:	frame vector to put
> > + *
> > + * Drop references to pages if get_vaddr_frames() acquired them. We also
> > + * invalidate the frame vector so that it is prepared for the next call into
> > + * get_vaddr_frames().
> > + */
> > +void put_vaddr_frames(struct frame_vector *vec)
> > +{
> > +	int i;
> > +	struct page **pages;
> > +
> > +	if (!vec->got_ref)
> > +		goto out;
> > +	pages = frame_vector_pages(vec);
> > +	/*
> > +	 * frame_vector_pages() might needed to do a conversion when we
> > +	 * get_vaddr_frames() got pages but vec was later converted to pfns.
> > +	 * But it shouldn't really fail to convert pfns back...
> > +	 */
> > +	BUG_ON(IS_ERR(pages));
> 
> It's hard to see how it could but it is recoverable so BUG_ON is
> overkill. WARN_ON and return even though we potentially leaked now.
  OK.

> > +	for (i = 0; i < vec->nr_frames; i++)
> > +		put_page(pages[i]);
> > +	vec->got_ref = 0;
> > +out:
> > +	vec->nr_frames = 0;
> > +}
> > +EXPORT_SYMBOL(put_vaddr_frames);
> > +
> > +/**
> > + * frame_vector_to_pages - convert frame vector to contain page pointers
> > + * @vec:	frame vector to convert
> > + *
> > + * Convert @vec to contain array of page pointers.  If the conversion is
> > + * successful, return 0. Otherwise return an error.
> > + */
> 
> If you do another version, it would not hurt to mention that a page
> reference is not taken when doing this conversion.
  OK, done.

> > +int frame_vector_to_pages(struct frame_vector *vec)
> > +{
> 
> I think it's probably best to make the relevant counters in frame_vector
> signed and limit the maximum possible size of it. It's still not putting
> any practical limit on the size of the frame_vector.
  I don't see a reason why counters in frame_vector should be signed... Can
you share your reason?  I've added a check into frame_vector_create() to
limit number of frames to INT_MAX / sizeof(void *) / 2 to avoid arithmetics
overflow. Thanks for review!

								Honza
Mel Gorman May 11, 2015, 2:47 p.m. UTC | #5
On Mon, May 11, 2015 at 04:00:19PM +0200, Jan Kara wrote:
> > > +int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
> > > +		     bool write, bool force, struct frame_vector *vec)
> > > +{
> > > +	struct mm_struct *mm = current->mm;
> > > +	struct vm_area_struct *vma;
> > > +	int ret = 0;
> > > +	int err;
> > > +	int locked = 1;
> > > +
> > 
> > bool locked.
>   It cannot be bool. It is passed to get_user_pages_locked() which expects
> int *.
> 

My bad.

> > > +int frame_vector_to_pages(struct frame_vector *vec)
> > > +{
> > 
> > I think it's probably best to make the relevant counters in frame_vector
> > signed and limit the maximum possible size of it. It's still not putting
> > any practical limit on the size of the frame_vector.
>
>   I don't see a reason why counters in frame_vector should be signed... Can
> you share your reason?  I've added a check into frame_vector_create() to
> limit number of frames to INT_MAX / sizeof(void *) / 2 to avoid arithmetics
> overflow. Thanks for review!
> 

Only that the return value of frame_vector_to_pages() returns int where
as the potential range that is converted is unsigned int. I don't think
there are any mistakes dealing with signed/unsigned but I don't see any
advantage of using unsigned either and limiting it to INT_MAX either.
It's not a big deal.

Patch
diff mbox

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..dcd1f02a78e9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -20,6 +20,7 @@ 
 #include <linux/shrinker.h>
 #include <linux/resource.h>
 #include <linux/page_ext.h>
+#include <linux/err.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1197,6 +1198,49 @@  long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 		    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+
+/* Container for pinned pfns / pages */
+struct frame_vector {
+	unsigned int nr_allocated;	/* Number of frames we have space for */
+	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
+	bool got_ref;		/* Did we pin pages by getting page ref? */
+	bool is_pfns;		/* Does array contain pages or pfns? */
+	void *ptrs[0];		/* Array of pinned pfns / pages. Use
+				 * pfns_vector_pages() or pfns_vector_pfns()
+				 * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+		     bool write, bool force, struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+	return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+	if (vec->is_pfns) {
+		int err = frame_vector_to_pages(vec);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+	return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+	if (!vec->is_pfns)
+		frame_vector_to_pfns(vec);
+	return (unsigned long *)(vec->ptrs);
+}
+
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
 			struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6bccfb1..8db5c40e65c4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -8,6 +8,7 @@ 
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/vmalloc.h>
 
 #include <linux/sched.h>
 #include <linux/rwsem.h>
@@ -936,6 +937,219 @@  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 	return ret;	/* 0 or negative error code */
 }
 
+/*
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start:	starting user address
+ * @nr_frames:	number of pages / pfns from start to map
+ * @write:	whether pages will be written to by the caller
+ * @force:	whether to force write access even if user mapping is
+ *		readonly. This will result in the page being COWed even
+ *		in MAP_SHARED mappings. You do not want this.
+ * @vec:	structure which receives pages / pfns of the addresses mapped.
+ *		It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ *
+ * This function takes care of grabbing mmap_sem as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+		     bool write, bool force, struct frame_vector *vec)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int ret = 0;
+	int err;
+	int locked = 1;
+
+	if (nr_frames == 0)
+		return 0;
+
+	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+		nr_frames = vec->nr_allocated;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma_intersection(mm, start, start + 1);
+	if (!vma) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+		vec->got_ref = 1;
+		vec->is_pfns = 0;
+		ret = get_user_pages_locked(current, mm, start, nr_frames,
+			write, force, (struct page **)(vec->ptrs), &locked);
+		goto out;
+	}
+
+	vec->got_ref = 0;
+	vec->is_pfns = 1;
+	do {
+		unsigned long *nums = frame_vector_pfns(vec);
+
+		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
+			err = follow_pfn(vma, start, &nums[ret]);
+			if (err) {
+				if (ret == 0)
+					ret = err;
+				goto out;
+			}
+			start += PAGE_SIZE;
+			ret++;
+		}
+		/*
+		 * We stop if we have enough pages or if VMA doesn't completely
+		 * cover the tail page.
+		 */
+		if (ret >= nr_frames || start < vma->vm_end)
+			break;
+		vma = find_vma_intersection(mm, start, start + 1);
+	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+out:
+	if (locked)
+		up_read(&mm->mmap_sem);
+	if (!ret)
+		ret = -EFAULT;
+	if (ret > 0)
+		vec->nr_frames = ret;
+	return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ *			them
+ * @vec:	frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+	int i;
+	struct page **pages;
+
+	if (!vec->got_ref)
+		goto out;
+	pages = frame_vector_pages(vec);
+	/*
+	 * frame_vector_pages() might needed to do a conversion when we
+	 * get_vaddr_frames() got pages but vec was later converted to pfns.
+	 * But it shouldn't really fail to convert pfns back...
+	 */
+	BUG_ON(IS_ERR(pages));
+	for (i = 0; i < vec->nr_frames; i++)
+		put_page(pages[i]);
+	vec->got_ref = 0;
+out:
+	vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers.  If the conversion is
+ * successful, return 0. Otherwise return an error.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (!vec->is_pfns)
+		return 0;
+	nums = frame_vector_pfns(vec);
+	for (i = 0; i < vec->nr_frames; i++)
+		if (!pfn_valid(nums[i]))
+			return -EINVAL;
+	pages = (struct page **)nums;
+	for (i = 0; i < vec->nr_frames; i++)
+		pages[i] = pfn_to_page(nums[i]);
+	vec->is_pfns = 0;
+	return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (vec->is_pfns)
+		return;
+	pages = (struct page **)(vec->ptrs);
+	nums = (unsigned long *)pages;
+	for (i = 0; i < vec->nr_frames; i++)
+		nums[i] = page_to_pfn(pages[i]);
+	vec->is_pfns = 1;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames:	number of pfns slots we should reserve
+ *
+ * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+	struct frame_vector *vec;
+	int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+	if (WARN_ON_ONCE(nr_frames == 0))
+		return NULL;
+	/*
+	 * Avoid higher order allocations, use vmalloc instead. It should
+	 * be rare anyway.
+	 */
+	if (size <= PAGE_SIZE)
+		vec = kmalloc(size, GFP_KERNEL);
+	else
+		vec = vmalloc(size);
+	if (!vec)
+		return NULL;
+	vec->nr_allocated = nr_frames;
+	vec->nr_frames = 0;
+	return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec:	Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+	if (!is_vmalloc_addr(vec))
+		kfree(vec);
+	else
+		vfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
+
 /**
  * get_dump_page() - pin user page in memory while writing it to core dump
  * @addr: user address