
[v2] ceph/iov_iter: fix bad iov_iter handling in ceph splice codepaths

Message ID 1484053051-23685-1-git-send-email-jlayton@redhat.com (mailing list archive)
State New, archived

Commit Message

Jeff Layton Jan. 10, 2017, 12:57 p.m. UTC
v2: fix bug in offset handling in iov_iter_pvec_size

xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
it to pick up the wrong offset and get stuck in an infinite loop while
trying to populate the page array. dio_get_pagev_size has a similar problem.

To fix the first problem, add a new iov_iter helper to determine the
offset into the page for the current segment and have ceph call that.
I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
but that will only return a single page at a time for ITER_BVEC and
it's better to make larger requests when possible.

For the second problem, we simply replace dio_get_pagev_size with a new
helper that does the same thing, but properly for all iov_iter types.

Since we're moving that into generic code, we can also utilize the
iterate_all_kinds macro to simplify this. That means that we need to
rework the logic a bit since we can't advance to the next vector while
checking the current one.

Link: http://tracker.ceph.com/issues/18130
Cc: "Zhu, Caifeng" <zhucaifeng@unissoft-nj.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/file.c      | 28 ++-------------------
 include/linux/uio.h |  2 ++
 lib/iov_iter.c      | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 26 deletions(-)

Comments

Yan, Zheng Jan. 11, 2017, 2:42 a.m. UTC | #1
> On 10 Jan 2017, at 20:57, Jeff Layton <jlayton@redhat.com> wrote:
> 
> v2: fix bug in offset handling in iov_iter_pvec_size
> 
> xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
> fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
> it to pick up the wrong offset and get stuck in an infinite loop while
> trying to populate the page array. dio_get_pagev_size has a similar problem.
> 
> To fix the first problem, add a new iov_iter helper to determine the
> offset into the page for the current segment and have ceph call that.
> I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> but that will only return a single page at a time for ITER_BVEC and
> it's better to make larger requests when possible.
> 
> For the second problem, we simply replace dio_get_pagev_size with a new
> helper that does the same thing, but properly for all iov_iter types.
> 
> Since we're moving that into generic code, we can also utilize the
> iterate_all_kinds macro to simplify this. That means that we need to
> rework the logic a bit since we can't advance to the next vector while
> checking the current one.
> 
> Link: http://tracker.ceph.com/issues/18130
> Cc: "Zhu, Caifeng" <zhucaifeng@unissoft-nj.com>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/file.c      | 28 ++-------------------
> include/linux/uio.h |  2 ++
> lib/iov_iter.c      | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 75 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f633165f3fdc..0cd9fc68a04e 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -35,29 +35,6 @@
>  */
> 
> /*
> - * Calculate the length sum of direct io vectors that can
> - * be combined into one page vector.
> - */
> -static size_t dio_get_pagev_size(const struct iov_iter *it)
> -{
> -    const struct iovec *iov = it->iov;
> -    const struct iovec *iovend = iov + it->nr_segs;
> -    size_t size;
> -
> -    size = iov->iov_len - it->iov_offset;
> -    /*
> -     * An iov can be page vectored when both the current tail
> -     * and the next base are page aligned.
> -     */
> -    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
> -           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
> -        size += iov->iov_len;
> -    }
> -    dout("dio_get_pagevlen len = %zu\n", size);
> -    return size;
> -}
> -
> -/*
>  * Allocate a page vector based on (@it, @nbytes).
>  * The return value is the tuple describing a page vector,
>  * that is (@pages, @page_align, @num_pages).
> @@ -71,8 +48,7 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
> 	struct page **pages;
> 	int ret = 0, idx, npages;
> 
> -	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
> -		(PAGE_SIZE - 1);
> +	align = iov_iter_single_seg_page_offset(it);
> 	npages = calc_pages_for(align, nbytes);
> 	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
> 	if (!pages) {
> @@ -927,7 +903,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
> 	}
> 
> 	while (iov_iter_count(iter) > 0) {
> -		u64 size = dio_get_pagev_size(iter);
> +		u64 size = iov_iter_pvec_size(iter);
> 		size_t start = 0;
> 		ssize_t len;
> 
> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 6e22b544d039..46fdfff3d7d6 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -83,6 +83,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
> void iov_iter_advance(struct iov_iter *i, size_t bytes);
> int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
> size_t iov_iter_single_seg_count(const struct iov_iter *i);
> +size_t iov_iter_single_seg_page_offset(const struct iov_iter *i);
> size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
> 			 struct iov_iter *i);
> size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
> @@ -93,6 +94,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
> size_t iov_iter_zero(size_t bytes, struct iov_iter *);
> unsigned long iov_iter_alignment(const struct iov_iter *i);
> unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
> +size_t iov_iter_pvec_size(const struct iov_iter *i);
> void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
> 			unsigned long nr_segs, size_t count);
> void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 6b415b5a100d..b8fa377b0cef 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -745,6 +745,24 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
> }
> EXPORT_SYMBOL(iov_iter_single_seg_count);
> 
> +/* Return offset into page for current iov_iter segment */
> +size_t iov_iter_single_seg_page_offset(const struct iov_iter *i)
> +{
> +	size_t	base;
> +
> +	if (i->type & ITER_PIPE)
> +		base = i->pipe->bufs[i->idx].offset;
> +	else if (i->type & ITER_BVEC)
> +		base = i->bvec->bv_offset;
> +	else if (i->type & ITER_KVEC)
> +		base = (size_t)i->kvec->iov_base;
> +	else
> +		base = (size_t)i->iov->iov_base;
> +
> +	return (base + i->iov_offset) & (PAGE_SIZE - 1);
> +}
> +EXPORT_SYMBOL(iov_iter_single_seg_page_offset);
> +
> void iov_iter_kvec(struct iov_iter *i, int direction,
> 			const struct kvec *kvec, unsigned long nr_segs,
> 			size_t count)
> @@ -830,6 +848,59 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
> }
> EXPORT_SYMBOL(iov_iter_gap_alignment);
> 
> +/**
> + * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
> + * @i: iov_iter in which to find the size
> + *
> + * Some filesystems can stitch together multiple iovecs into a single
> + * page vector when both the previous tail and current base are page
> + * aligned. This function discovers the length that can fit in a single
> + * pagevec and returns it.
> + */
> +size_t iov_iter_pvec_size(const struct iov_iter *i)
> +{
> +	size_t size = i->count;
> +	size_t pv_size = 0;
> +	bool contig = false, first = true;
> +
> +	if (!size)
> +		return 0;
> +
> +	/* Pipes are naturally aligned for this */
> +	if (unlikely(i->type & ITER_PIPE))
> +		return size;
> +
> +	/*
> +	 * An iov can be page vectored when the current base and previous
> +	 * tail are both page aligned. Note that we don't require that the
> +	 * initial base in the first iovec also be page aligned.
> +	 */
> +	iterate_all_kinds(i, size, v,
> +		({
> +		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +			pv_size += v.iov_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +		 }; 0;
> +		 }),
> +		({
> +		 if (first || (contig && v.bv_offset == 0)) {
> +			pv_size += v.bv_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
> +		 }
> +		 }),
> +		({
> +		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +			pv_size += v.iov_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +		 }
> +		 }))
> +	return pv_size;
> +}
> +EXPORT_SYMBOL(iov_iter_pvec_size);
> +
> static inline size_t __pipe_get_pages(struct iov_iter *i,
> 				size_t maxsize,
> 				struct page **pages,

Reviewed-by: Yan, Zheng <zyan@redhat.com>
> -- 
> 2.7.4
> 

Al Viro Jan. 12, 2017, 7:59 a.m. UTC | #2
On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
> v2: fix bug in offset handling in iov_iter_pvec_size
> 
> xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
> fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
> it to pick up the wrong offset and get stuck in an infinite loop while
> trying to populate the page array. dio_get_pagev_size has a similar problem.
> 
> To fix the first problem, add a new iov_iter helper to determine the
> offset into the page for the current segment and have ceph call that.
> I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> but that will only return a single page at a time for ITER_BVEC and
> it's better to make larger requests when possible.
> 
> For the second problem, we simply replace dio_get_pagev_size with a new
> helper that does the same thing, but properly for all iov_iter types.
> 
> Since we're moving that into generic code, we can also utilize the
> iterate_all_kinds macro to simplify this. That means that we need to
> rework the logic a bit since we can't advance to the next vector while
> checking the current one.

Yecchhh...  That really looks like exposing way too low-level stuff instead
of coming up with saner primitive ;-/

Is page vector + offset in the first page + number of bytes really what
ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
would make a lot more natural iov_iter_get_pages_alloc() analogue...

And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
how painful would it be to have it switched to struct bio_vec array instead?
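
A rough sketch of that kind of primitive, for the sake of discussion -- the
name iov_iter_get_bvecs_alloc and everything below is hypothetical, not an
existing API. The idea is to pin pages from an iovec- or bvec-backed iter
and hand them back as a bio_vec array that the caller releases with
put_page() on each entry and kfree() on the array:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/bvec.h>

static ssize_t iov_iter_get_bvecs_alloc(struct iov_iter *i, size_t maxsize,
					struct bio_vec **bvecs, int *num_bvecs)
{
	struct bio_vec *bv;
	size_t bytes = 0;
	int n = 0;

	/* upper bound on the number of bio_vecs we can produce */
	bv = kmalloc_array(iov_iter_npages(i, INT_MAX), sizeof(*bv),
			   GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	while (iov_iter_count(i) && bytes < maxsize) {
		struct page *pages[16];
		size_t start, left;
		ssize_t len;
		int j, npages;

		/* take page references for (part of) the current segment */
		len = iov_iter_get_pages(i, pages, maxsize - bytes,
					 ARRAY_SIZE(pages), &start);
		if (len <= 0)
			break;

		/* describe what we just pinned as bio_vecs */
		npages = DIV_ROUND_UP(len + start, PAGE_SIZE);
		left = len;
		for (j = 0; j < npages; j++) {
			size_t seg = min_t(size_t, left, PAGE_SIZE - start);

			bv[n].bv_page = pages[j];
			bv[n].bv_offset = start;
			bv[n].bv_len = seg;
			left -= seg;
			start = 0;
			n++;
		}
		iov_iter_advance(i, len);
		bytes += len;
	}

	if (!bytes) {
		kfree(bv);
		return -EFAULT;	/* a real version would propagate the error */
	}
	*bvecs = bv;
	*num_bvecs = n;
	return bytes;
}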
Ilya Dryomov Jan. 12, 2017, 11:13 a.m. UTC | #3
On Thu, Jan 12, 2017 at 8:59 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
>> v2: fix bug in offset handling in iov_iter_pvec_size
>>
>> xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
>> fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
>> to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
>> it to pick up the wrong offset and get stuck in an infinite loop while
>> trying to populate the page array. dio_get_pagev_size has a similar problem.
>>
>> To fix the first problem, add a new iov_iter helper to determine the
>> offset into the page for the current segment and have ceph call that.
>> I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
>> but that will only return a single page at a time for ITER_BVEC and
>> it's better to make larger requests when possible.
>>
>> For the second problem, we simply replace dio_get_pagev_size with a new
>> helper that does the same thing, but properly for all iov_iter types.
>>
>> Since we're moving that into generic code, we can also utilize the
>> iterate_all_kinds macro to simplify this. That means that we need to
>> rework the logic a bit since we can't advance to the next vector while
>> checking the current one.
>
> Yecchhh...  That really looks like exposing way too low-level stuff instead
> of coming up with saner primitive ;-/
>
> Is page vector + offset in the first page + number of bytes really what
> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> would make a lot more natural iov_iter_get_pages_alloc() analogue...
>
> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> how painful would it be to have it switched to struct bio_vec array instead?

It would be a significant and wide-reaching change, but I've been
meaning to look into switching to iov_iter for a couple of releases
now.  There is a lot of ugly code in net/ceph/messenger.c to handle
iteration over "page vectors", "page lists" and "bio lists".  All of it
predates iov_iter proliferation and is mostly incomplete anyway: IIRC
you can send out of a pagelist but can't recv into a pagelist, etc.
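
For context, the data-item variants the messenger juggles today look
roughly like this (paraphrased from include/linux/ceph/messenger.h, with
the per-variant cursor state and union members elided):

enum ceph_msg_data_type {
	CEPH_MSG_DATA_NONE,	/* message carries no data payload */
	CEPH_MSG_DATA_PAGES,	/* bare page array + length + alignment */
	CEPH_MSG_DATA_PAGELIST,	/* struct ceph_pagelist */
#ifdef CONFIG_BLOCK
	CEPH_MSG_DATA_BIO,	/* chain of bios */
#endif
};

Each of those has its own cursor and its own iteration code in the
messenger, which is exactly the duplication an iov_iter-based interface
could collapse.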

That said, Jeff's patch doesn't look too bad to me...

Thanks,

                Ilya
Jeff Layton Jan. 12, 2017, 11:27 a.m. UTC | #4
On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
> On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
> > 
> > v2: fix bug in offset handling in iov_iter_pvec_size
> > 
> > xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
> > fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> > to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
> > it to pick up the wrong offset and get stuck in an infinite loop while
> > trying to populate the page array. dio_get_pagev_size has a similar problem.
> > 
> > To fix the first problem, add a new iov_iter helper to determine the
> > offset into the page for the current segment and have ceph call that.
> > I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> > but that will only return a single page at a time for ITER_BVEC and
> > it's better to make larger requests when possible.
> > 
> > For the second problem, we simply replace dio_get_pagev_size with a new
> > helper that does the same thing, but properly for all iov_iter types.
> > 
> > Since we're moving that into generic code, we can also utilize the
> > iterate_all_kinds macro to simplify this. That means that we need to
> > rework the logic a bit since we can't advance to the next vector while
> > checking the current one.
> 
> Yecchhh...  That really looks like exposing way too low-level stuff instead
> of coming up with saner primitive ;-/
> 

Fair point. That said, I'm not terribly thrilled with how
iov_iter_get_pages* works right now.

Note that it only ever touches the first vector. Would it not be better
to keep getting page references if the bvec/iov elements are aligned
properly? It seems quite plausible that they often would be, and being
able to hand back a larger list of pages in most cases would be
advantageous.

IOW, should we have iov_iter_get_pages basically do what
dio_get_pages_alloc does -- try to build as long an array of pages as
possible before returning, provided that the alignment works out?

The NFS DIO code, for instance, could also benefit there. I know we've
had reports there in the past that sending down a bunch of small iovecs
causes a lot of small-sized requests on the wire.
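
Roughly this sort of thing, as a sketch of the idea -- the wrapper below is
hypothetical (and ignores ITER_PIPE); a real version would presumably just
be folded into iov_iter_get_pages() itself:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/uio.h>

/*
 * Keep taking page references across segment boundaries for as long as
 * the previous tail and the next base stay page aligned, instead of
 * stopping after the first segment. Like iov_iter_get_pages(), this does
 * not advance the caller's iter.
 */
static ssize_t iov_iter_get_pages_contig(struct iov_iter *i,
					 struct page **pages, size_t maxsize,
					 unsigned int maxpages, size_t *pstart)
{
	struct iov_iter tmp = *i;	/* iterate over a copy */
	ssize_t total = 0;
	unsigned int got = 0;

	while (iov_iter_count(&tmp) && (size_t)total < maxsize &&
	       got < maxpages) {
		size_t start;
		ssize_t len;
		unsigned int n, j;

		len = iov_iter_get_pages(&tmp, pages + got, maxsize - total,
					 maxpages - got, &start);
		if (len <= 0)
			break;
		n = DIV_ROUND_UP(len + start, PAGE_SIZE);
		if (total && start) {
			/* this chunk starts mid-page: it can't be glued on */
			for (j = 0; j < n; j++)
				put_page(pages[got + j]);
			break;
		}
		if (!total)
			*pstart = start;	/* offset into the first page */
		got += n;
		total += len;
		iov_iter_advance(&tmp, len);
		/* stop once a chunk ends short of a page boundary */
		if (offset_in_page(start + len))
			break;
	}
	return total ? total : -EFAULT;
}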

> Is page vector + offset in the first page + number of bytes really what
> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> would make a lot more natural iov_iter_get_pages_alloc() analogue...
> 
> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> how painful would it be to have it switched to struct bio_vec array instead?

Actually...it looks like that might not be too hard. The low-level OSD
handling code can already handle bio_vec arrays in order to service RBD.
It looks like we could switch cephfs to use
osd_req_op_extent_osd_data_bio instead of
osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
on CONFIG_BLOCK, but I think we could probably live with that.
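
Very rough sketch of what I mean for the call site, assuming the current
osd_req_op_extent_osd_data_bio(req, which, bio, length) signature stays as
it is and that the pages were already pinned by dio_get_pages_alloc() (or
a bio_vec-based replacement); the helper name here is made up:

#ifdef CONFIG_BLOCK
#include <linux/bio.h>
#include <linux/ceph/osd_client.h>

static int ceph_osd_data_pages_to_bio(struct ceph_osd_request *req,
				      struct page **pages, int num_pages,
				      size_t off, size_t len)
{
	struct bio *bio;
	int i;

	bio = bio_kmalloc(GFP_KERNEL, num_pages);
	if (!bio)
		return -ENOMEM;

	for (i = 0; i < num_pages && len; i++) {
		size_t seg = min_t(size_t, len, PAGE_SIZE - off);

		if (bio_add_page(bio, pages[i], seg, off) != seg) {
			bio_put(bio);
			return -EIO;
		}
		off = 0;	/* only the first page has a nonzero offset */
		len -= seg;
	}

	/* stands in for the osd_req_op_extent_osd_data_pages() call */
	osd_req_op_extent_osd_data_bio(req, 0, bio, bio->bi_iter.bi_size);
	return 0;	/* completion path would bio_put() the bio */
}
#endif /* CONFIG_BLOCK */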
Ilya Dryomov Jan. 12, 2017, 11:37 a.m. UTC | #5
On Thu, Jan 12, 2017 at 12:27 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
>> On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
>> >
>> > v2: fix bug in offset handling in iov_iter_pvec_size
>> >
>> > xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
>> > fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
>> > to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
>> > it to pick up the wrong offset and get stuck in an infinite loop while
>> > trying to populate the page array. dio_get_pagev_size has a similar problem.
>> >
>> > To fix the first problem, add a new iov_iter helper to determine the
>> > offset into the page for the current segment and have ceph call that.
>> > I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
>> > but that will only return a single page at a time for ITER_BVEC and
>> > it's better to make larger requests when possible.
>> >
>> > For the second problem, we simply replace dio_get_pagev_size with a new
>> > helper that does the same thing, but properly for all iov_iter types.
>> >
>> > Since we're moving that into generic code, we can also utilize the
>> > iterate_all_kinds macro to simplify this. That means that we need to
>> > rework the logic a bit since we can't advance to the next vector while
>> > checking the current one.
>>
>> Yecchhh...  That really looks like exposing way too low-level stuff instead
>> of coming up with saner primitive ;-/
>>
>
> Fair point. That said, I'm not terribly thrilled with how
> iov_iter_get_pages* works right now.
>
> Note that it only ever touches the first vector. Would it not be better
> to keep getting page references if the bvec/iov elements are aligned
> properly? It seems quite plausible that they often would be, and being
> able to hand back a larger list of pages in most cases would be
> advantageous.
>
> IOW, should we have iov_iter_get_pages basically do what
> dio_get_pages_alloc does -- try to build as long an array of pages as
> possible before returning, provided that the alignment works out?
>
> The NFS DIO code, for instance, could also benefit there. I know we've
> had reports there in the past that sending down a bunch of small iovecs
> causes a lot of small-sized requests on the wire.
>
>> Is page vector + offset in the first page + number of bytes really what
>> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
>> would make a lot more natural iov_iter_get_pages_alloc() analogue...
>>
>> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
>> how painful would it be to have it switched to struct bio_vec array instead?
>
> Actually...it looks like that might not be too hard. The low-level OSD
> handling code can already handle bio_vec arrays in order to service RBD.
> It looks like we could switch cephfs to use
> osd_req_op_extent_osd_data_bio instead of
> osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
> on CONFIG_BLOCK, but I think we could probably live with that.

Ah, just that part might be easy enough ;)

Thanks,

                Ilya
Al Viro Jan. 12, 2017, 11:37 a.m. UTC | #6
On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:

> It would be a significant and wide-reaching change, but I've been
> meaning to look into switching to iov_iter for a couple of releases
> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> iteration over "page vectors", "page lists" and "bio lists".  All of it
> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> you can send out of a pagelist but can't recv into a pagelist, etc.

Wait a sec...  Is it done from the same thread that has issued a syscall?
If so, we certainly could just pass iov_iter without bothering with any
form of ..._get_pages(); if not, we'll need at least to get from iovec
to bio_vec, since userland addresses make sense only in the caller's
context...
Ilya Dryomov Jan. 12, 2017, 11:46 a.m. UTC | #7
On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
>
>> It would be a significant and wide-reaching change, but I've been
>> meaning to look into switching to iov_iter for a couple of releases
>> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
>> iteration over "page vectors", "page lists" and "bio lists".  All of it
>> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
>> you can send out of a pagelist but can't recv into a pagelist, etc.
>
> Wait a sec...  Is it done from the same thread that has issued a syscall?
> If so, we certainly could just pass iov_iter without bothering with any
> form of ..._get_pages(); if not, we'll need at least to get from iovec
> to bio_vec, since userland addresses make sense only in the caller's
> context...

No, not necessarily - it's also used by rbd (all of net/ceph has two
users: fs/ceph and drivers/block/rbd.c).

Thanks,

                Ilya
Al Viro Jan. 12, 2017, 11:53 a.m. UTC | #8
On Thu, Jan 12, 2017 at 12:46:42PM +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
> >
> >> It would be a significant and wide-reaching change, but I've been
> >> meaning to look into switching to iov_iter for a couple of releases
> >> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> >> iteration over "page vectors", "page lists" and "bio lists".  All of it
> >> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> >> you can send out of a pagelist but can't recv into a pagelist, etc.
> >
> > Wait a sec...  Is it done from the same thread that has issued a syscall?
> > If so, we certainly could just pass iov_iter without bothering with any
> > form of ..._get_pages(); if not, we'll need at least to get from iovec
> > to bio_vec, since userland addresses make sense only in the caller's
> > context...
> 
> No, not necessarily - it's also used by rbd (all of net/ceph has two
> users: fs/ceph and drivers/block/rbd.c).

Yes, but rbd doesn't deal with userland-pointing iovecs at all, does it?
Jeff Layton Jan. 12, 2017, 11:57 a.m. UTC | #9
On Thu, 2017-01-12 at 12:46 +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > 
> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
> > 
> > > 
> > > It would be a significant and wide-reaching change, but I've been
> > > meaning to look into switching to iov_iter for a couple of releases
> > > now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> > > iteration over "page vectors", "page lists" and "bio lists".  All of it
> > > predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> > > you can send out of a pagelist but can't recv into a pagelist, etc.
> > 
> > Wait a sec...  Is it done from the same thread that has issued a syscall?
> > If so, we certainly could just pass iov_iter without bothering with any
> > form of ..._get_pages(); if not, we'll need at least to get from iovec
> > to bio_vec, since userland addresses make sense only in the caller's
> > context...
> 
> No, not necessarily - it's also used by rbd (all of net/ceph has two
> users: fs/ceph and drivers/block/rbd.c).
> 
> 

...and note that the actual send/receive is done from workqueue context
(AFAICT), so we might be operating over the array from a completely
different thread context than the one it was submitted from. I think we
do need to take the page references at the point of submission, as this
patch does.
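
Conceptually the split has to look something like the sketch below (the
names are hypothetical); the point is just that the pinning happens in the
submitter's context, because the kworker can never walk user addresses:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/uio.h>

struct ceph_dio_pages {			/* hypothetical container */
	struct page **pages;		/* pinned in the submitter's context */
	int num_pages;
	size_t page_off;		/* offset into pages[0] */
	size_t len;
};

/* Runs in the syscall context, before anything is handed to the kworker */
static int ceph_dio_pin_pages(struct iov_iter *iter, struct ceph_dio_pages *d)
{
	size_t start;
	ssize_t got;

	got = iov_iter_get_pages_alloc(iter, &d->pages,
				       iov_iter_count(iter), &start);
	if (got < 0)
		return got;

	d->num_pages = DIV_ROUND_UP(got + start, PAGE_SIZE);
	d->page_off = start;
	d->len = got;
	iov_iter_advance(iter, got);

	/*
	 * From here on, the messenger workqueue only ever sees struct pages
	 * (or bio_vecs), never the original userland iovecs. The completion
	 * path put_page()s each page and kvfree()s d->pages.
	 */
	return 0;
}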
Ilya Dryomov Jan. 12, 2017, 12:17 p.m. UTC | #10
On Thu, Jan 12, 2017 at 12:53 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Jan 12, 2017 at 12:46:42PM +0100, Ilya Dryomov wrote:
>> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
>> >
>> >> It would be a significant and wide-reaching change, but I've been
>> >> meaning to look into switching to iov_iter for a couple of releases
>> >> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
>> >> iteration over "page vectors", "page lists" and "bio lists".  All of it
>> >> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
>> >> you can send out of a pagelist but can't recv into a pagelist, etc.
>> >
>> > Wait a sec...  Is it done from the same thread that has issued a syscall?
>> > If so, we certainly could just pass iov_iter without bothering with any
>> > form of ..._get_pages(); if not, we'll need at least to get from iovec
>> > to bio_vec, since userland addresses make sense only in the caller's
>> > context...
>>
>> No, not necessarily - it's also used by rbd (all of net/ceph has two
>> users: fs/ceph and drivers/block/rbd.c).
>
> Yes, but rbd doesn't deal with userland-pointing iovecs at all, does it?

Correct.  Normal I/O is all bios + currently it uses some of that
custom page vector/list machinery for maintenance operations (rbd
metadata, locks, etc).  No userland pointers whatsoever for rbd, but
the messenger (checksumming + {send,recv}{msg,page}) runs out of the
kworker for both rbd and cephfs.

Thanks,

                Ilya
Jeff Layton Jan. 18, 2017, 12:14 p.m. UTC | #11
On Thu, 2017-01-12 at 12:37 +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:27 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
> > > On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
> > > > 
> > > > v2: fix bug in offset handling in iov_iter_pvec_size
> > > > 
> > > > xfstest generic/095 triggers soft lockups in kcephfs. Basically, it uses
> > > > fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> > > > to access an ITER_BVEC-type iov_iter as an ITER_IOVEC one. That causes
> > > > it to pick up the wrong offset and get stuck in an infinite loop while
> > > > trying to populate the page array. dio_get_pagev_size has a similar problem.
> > > > 
> > > > To fix the first problem, add a new iov_iter helper to determine the
> > > > offset into the page for the current segment and have ceph call that.
> > > > I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> > > > but that will only return a single page at a time for ITER_BVEC and
> > > > it's better to make larger requests when possible.
> > > > 
> > > > For the second problem, we simply replace dio_get_pagev_size with a new
> > > > helper that does the same thing, but properly for all iov_iter types.
> > > > 
> > > > Since we're moving that into generic code, we can also utilize the
> > > > iterate_all_kinds macro to simplify this. That means that we need to
> > > > rework the logic a bit since we can't advance to the next vector while
> > > > checking the current one.
> > > 
> > > Yecchhh...  That really looks like exposing way too low-level stuff instead
> > > of coming up with saner primitive ;-/
> > > 
> > 
> > Fair point. That said, I'm not terribly thrilled with how
> > iov_iter_get_pages* works right now.
> > 
> > Note that it only ever touches the first vector. Would it not be better
> > to keep getting page references if the bvec/iov elements are aligned
> > properly? It seems quite plausible that they often would be, and being
> > able to hand back a larger list of pages in most cases would be
> > advantageous.
> > 
> > IOW, should we have iov_iter_get_pages basically do what
> > dio_get_pages_alloc does -- try to build as long an array of pages as
> > possible before returning, provided that the alignment works out?
> > 
> > The NFS DIO code, for instance, could also benefit there. I know we've
> > had reports there in the past that sending down a bunch of small iovecs
> > causes a lot of small-sized requests on the wire.
> > 
> > > Is page vector + offset in the first page + number of bytes really what
> > > ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> > > would make a lot more natural iov_iter_get_pages_alloc() analogue...
> > > 
> > > And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> > > how painful would it be to have it switched to struct bio_vec array instead?
> > 
> > Actually...it looks like that might not be too hard. The low-level OSD
> > handling code can already handle bio_vec arrays in order to service RBD.
> > It looks like we could switch cephfs to use
> > osd_req_op_extent_osd_data_bio instead of
> > osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
> > on CONFIG_BLOCK, but I think we could probably live with that.
> 
> Ah, just that part might be easy enough ;)
> 
> 

Yeah, that part doesn't look too bad. Regardless though, I think we need
to get a fix in for this sooner rather than later as it's trivial to get
the kernel stuck in this loop today, by any user with write access to a
ceph mount.

Al, when you mentioned switching this over to a bio_vec based interface,
were you planning to roll up the iov_iter->bio_vec array helper for
this, or should I be looking into doing that?

Thanks,

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f633165f3fdc..0cd9fc68a04e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -35,29 +35,6 @@ 
  */
 
 /*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
- */
-static size_t dio_get_pagev_size(const struct iov_iter *it)
-{
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
-    size_t size;
-
-    size = iov->iov_len - it->iov_offset;
-    /*
-     * An iov can be page vectored when both the current tail
-     * and the next base are page aligned.
-     */
-    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
-    }
-    dout("dio_get_pagevlen len = %zu\n", size);
-    return size;
-}
-
-/*
  * Allocate a page vector based on (@it, @nbytes).
  * The return value is the tuple describing a page vector,
  * that is (@pages, @page_align, @num_pages).
@@ -71,8 +48,7 @@  dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
 	struct page **pages;
 	int ret = 0, idx, npages;
 
-	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
-		(PAGE_SIZE - 1);
+	align = iov_iter_single_seg_page_offset(it);
 	npages = calc_pages_for(align, nbytes);
 	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
 	if (!pages) {
@@ -927,7 +903,7 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = dio_get_pagev_size(iter);
+		u64 size = iov_iter_pvec_size(iter);
 		size_t start = 0;
 		ssize_t len;
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6e22b544d039..46fdfff3d7d6 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -83,6 +83,7 @@  size_t iov_iter_copy_from_user_atomic(struct page *page,
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
+size_t iov_iter_single_seg_page_offset(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i);
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
@@ -93,6 +94,7 @@  size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
+size_t iov_iter_pvec_size(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
 void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 6b415b5a100d..b8fa377b0cef 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -745,6 +745,24 @@  size_t iov_iter_single_seg_count(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_single_seg_count);
 
+/* Return offset into page for current iov_iter segment */
+size_t iov_iter_single_seg_page_offset(const struct iov_iter *i)
+{
+	size_t	base;
+
+	if (i->type & ITER_PIPE)
+		base = i->pipe->bufs[i->idx].offset;
+	else if (i->type & ITER_BVEC)
+		base = i->bvec->bv_offset;
+	else if (i->type & ITER_KVEC)
+		base = (size_t)i->kvec->iov_base;
+	else
+		base = (size_t)i->iov->iov_base;
+
+	return (base + i->iov_offset) & (PAGE_SIZE - 1);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_page_offset);
+
 void iov_iter_kvec(struct iov_iter *i, int direction,
 			const struct kvec *kvec, unsigned long nr_segs,
 			size_t count)
@@ -830,6 +848,59 @@  unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);
 
+/**
+ * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
+ * @i: iov_iter in which to find the size
+ *
+ * Some filesystems can stitch together multiple iovecs into a single
+ * page vector when both the previous tail and current base are page
+ * aligned. This function discovers the length that can fit in a single
+ * pagevec and returns it.
+ */
+size_t iov_iter_pvec_size(const struct iov_iter *i)
+{
+	size_t size = i->count;
+	size_t pv_size = 0;
+	bool contig = false, first = true;
+
+	if (!size)
+		return 0;
+
+	/* Pipes are naturally aligned for this */
+	if (unlikely(i->type & ITER_PIPE))
+		return size;
+
+	/*
+	 * An iov can be page vectored when the current base and previous
+	 * tail are both page aligned. Note that we don't require that the
+	 * initial base in the first iovec also be page aligned.
+	 */
+	iterate_all_kinds(i, size, v,
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }; 0;
+		 }),
+		({
+		 if (first || (contig && v.bv_offset == 0)) {
+			pv_size += v.bv_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
+		 }
+		 }),
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }
+		 }))
+	return pv_size;
+}
+EXPORT_SYMBOL(iov_iter_pvec_size);
+
 static inline size_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,