diff mbox series

[2/6] iov_iter: new iov_iter_pin_pages*(), for FOLL_PIN pages

Message ID 20220227093434.2889464-3-jhubbard@nvidia.com (mailing list archive)
State New
Headers show
Series block, fs: convert most Direct IO cases to FOLL_PIN | expand

Commit Message

jhubbard.send.patches@gmail.com Feb. 27, 2022, 9:34 a.m. UTC
From: John Hubbard <jhubbard@nvidia.com>

Provide two new specialized routines that only handle user space pages,
and invoke pin_user_pages_fast() on them: iov_iter_pin_pages() and
iov_iter_pin_pages_alloc().

This allows subsequent patches to convert various callers of
iov_iter_get_pages*(), to the new calls, without having to attempt a
mass conversion all at once.

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/uio.h |  4 +++
 lib/iov_iter.c      | 78 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

Comments

Jens Axboe Feb. 27, 2022, 9:57 p.m. UTC | #1
> +ssize_t iov_iter_pin_pages(struct iov_iter *i,
> +		   struct page **pages, size_t maxsize, unsigned int maxpages,
> +		   size_t *start)
> +{
> +	size_t len;
> +	int n, res;
> +
> +	if (maxsize > i->count)
> +		maxsize = i->count;
> +	if (!maxsize)
> +		return 0;
> +
> +	WARN_ON_ONCE(!iter_is_iovec(i));
> +
> +	if (likely(iter_is_iovec(i))) {
> +		unsigned int gup_flags = 0;
> +		unsigned long addr;
> +
> +		if (iov_iter_rw(i) != WRITE)
> +			gup_flags |= FOLL_WRITE;
> +		if (i->nofault)
> +			gup_flags |= FOLL_NOFAULT;
> +
> +		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
> +		n = DIV_ROUND_UP(len, PAGE_SIZE);
> +		res = pin_user_pages_fast(addr, n, gup_flags, pages);
> +		if (unlikely(res <= 0))
> +			return res;
> +		return (res == n ? len : res * PAGE_SIZE) - *start;

Trying to be clever like that just makes the code a lot less readable. I
should not have to reason about a return value. Same in the other
function.
John Hubbard Feb. 27, 2022, 10:09 p.m. UTC | #2
On 2/27/22 13:57, Jens Axboe wrote:
...
>> +			return res;
>> +		return (res == n ? len : res * PAGE_SIZE) - *start;
> 
> Trying to be clever like that just makes the code a lot less readable. I
> should not have to reason about a return value. Same in the other
> function.
> 

No argument there. This was shamelessly lifted from
iov_iter_get_pages(), and I initially calculated that keeping it
identical to that known-good code, where possible, was better than
fixing it up.

However, I'll go ahead and simplify it--with pleasure--based on this
feedback.


thanks,
John Hubbard Feb. 28, 2022, 10:49 p.m. UTC | #3
On 2/27/22 13:57, Jens Axboe wrote:
>> +ssize_t iov_iter_pin_pages(struct iov_iter *i,
>> +		   struct page **pages, size_t maxsize, unsigned int maxpages,
>> +		   size_t *start)
>> +{
>> +	size_t len;
>> +	int n, res;
>> +
>> +	if (maxsize > i->count)
>> +		maxsize = i->count;
>> +	if (!maxsize)
>> +		return 0;
>> +
>> +	WARN_ON_ONCE(!iter_is_iovec(i));
>> +
>> +	if (likely(iter_is_iovec(i))) {
>> +		unsigned int gup_flags = 0;
>> +		unsigned long addr;
>> +
>> +		if (iov_iter_rw(i) != WRITE)
>> +			gup_flags |= FOLL_WRITE;
>> +		if (i->nofault)
>> +			gup_flags |= FOLL_NOFAULT;
>> +
>> +		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
>> +		n = DIV_ROUND_UP(len, PAGE_SIZE);
>> +		res = pin_user_pages_fast(addr, n, gup_flags, pages);
>> +		if (unlikely(res <= 0))
>> +			return res;
>> +		return (res == n ? len : res * PAGE_SIZE) - *start;
> 
> Trying to be clever like that just makes the code a lot less readable. I
> should not have to reason about a return value. Same in the other
> function.
> 

Here is a differential patch on top of this one, and only showing one of
the two routines. How does this direction look to you?


diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e64e8e4edd0c..8e96f1e9ebc6 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1588,7 +1588,17 @@ ssize_t iov_iter_pin_pages(struct iov_iter *i,
  		res = pin_user_pages_fast(addr, n, gup_flags, pages);
  		if (unlikely(res <= 0))
  			return res;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
+
+		/* Cap len at the number of pages that were actually pinned: */
+		if (res < n)
+			len = res * PAGE_SIZE;
+
+		/*
+		 * The return value is the amount pinned in bytes that the
+		 * caller will actually use. So, reduce it by the offset into
+		 * the first page:
+		 */
+		return len - *start;
  	}

  	return -EFAULT;

thanks,
diff mbox series

Patch

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 739285fe5a2f..208020c2b75a 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -236,6 +236,10 @@  ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
+ssize_t iov_iter_pin_pages(struct iov_iter *i, struct page **pages,
+			size_t maxsize, unsigned int maxpages, size_t *start);
+ssize_t iov_iter_pin_pages_alloc(struct iov_iter *i, struct page ***pages,
+			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 6dd5330f7a99..e64e8e4edd0c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1560,6 +1560,41 @@  ssize_t iov_iter_get_pages(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
+ssize_t iov_iter_pin_pages(struct iov_iter *i,
+		   struct page **pages, size_t maxsize, unsigned int maxpages,
+		   size_t *start)
+{
+	size_t len;
+	int n, res;
+
+	if (maxsize > i->count)
+		maxsize = i->count;
+	if (!maxsize)
+		return 0;
+
+	WARN_ON_ONCE(!iter_is_iovec(i));
+
+	if (likely(iter_is_iovec(i))) {
+		unsigned int gup_flags = 0;
+		unsigned long addr;
+
+		if (iov_iter_rw(i) != WRITE)
+			gup_flags |= FOLL_WRITE;
+		if (i->nofault)
+			gup_flags |= FOLL_NOFAULT;
+
+		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		res = pin_user_pages_fast(addr, n, gup_flags, pages);
+		if (unlikely(res <= 0))
+			return res;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+	}
+
+	return -EFAULT;
+}
+EXPORT_SYMBOL(iov_iter_pin_pages);
+
 static struct page **get_pages_array(size_t n)
 {
 	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
@@ -1696,6 +1731,49 @@  ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
+ssize_t iov_iter_pin_pages_alloc(struct iov_iter *i,
+		   struct page ***pages, size_t maxsize,
+		   size_t *start)
+{
+	struct page **p;
+	size_t len;
+	int n, res;
+
+	if (maxsize > i->count)
+		maxsize = i->count;
+	if (!maxsize)
+		return 0;
+
+	WARN_ON_ONCE(!iter_is_iovec(i));
+
+	if (likely(iter_is_iovec(i))) {
+		unsigned int gup_flags = 0;
+		unsigned long addr;
+
+		if (iov_iter_rw(i) != WRITE)
+			gup_flags |= FOLL_WRITE;
+		if (i->nofault)
+			gup_flags |= FOLL_NOFAULT;
+
+		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		p = get_pages_array(n);
+		if (!p)
+			return -ENOMEM;
+		res = pin_user_pages_fast(addr, n, gup_flags, p);
+		if (unlikely(res <= 0)) {
+			kvfree(p);
+			*pages = NULL;
+			return res;
+		}
+		*pages = p;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+	}
+
+	return -EFAULT;
+}
+EXPORT_SYMBOL(iov_iter_pin_pages_alloc);
+
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
 {