diff mbox series

ceph: refactor ceph_sync_read()

Message ID 20180930015619.3136-2-zyan@redhat.com (mailing list archive)
State New, archived
Headers show
Series ceph: refactor ceph_sync_read() | expand

Commit Message

Yan, Zheng Sept. 30, 2018, 1:56 a.m. UTC
Make ceph_sync_read() do OSD request per object. This avoid large memory
allocation during large read.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
 1 file changed, 101 insertions(+), 112 deletions(-)

Comments

Ilya Dryomov Oct. 3, 2018, 8:31 a.m. UTC | #1
On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@redhat.com> wrote:
>
> Make ceph_sync_read() do OSD request per object. This avoid large memory
> allocation during large read.
>
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
>  1 file changed, 101 insertions(+), 112 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 92ab20433682..79c610cb5f54 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -556,75 +556,6 @@ enum {
>         READ_INLINE =  3,
>  };
>
> -/*
> - * Read a range of bytes striped over one or more objects.  Iterate over
> - * objects we stripe over.  (That's not atomic, but good enough for now.)
> - *
> - * If we get a short result from the OSD, check against i_size; we need to
> - * only return a short read to the caller if we hit EOF.
> - */
> -static int striped_read(struct inode *inode,
> -                       u64 pos, u64 len,
> -                       struct page **pages, int num_pages,
> -                       int page_align, int *checkeof)
> -{
> -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> -       struct ceph_inode_info *ci = ceph_inode(inode);
> -       u64 this_len;
> -       loff_t i_size;
> -       int page_idx;
> -       int ret, read = 0;
> -       bool hit_stripe, was_short;
> -
> -       /*
> -        * we may need to do multiple reads.  not atomic, unfortunately.
> -        */
> -more:
> -       this_len = len;
> -       page_idx = (page_align + read) >> PAGE_SHIFT;
> -       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
> -                                 &ci->i_layout, pos, &this_len,
> -                                 ci->i_truncate_seq, ci->i_truncate_size,
> -                                 pages + page_idx, num_pages - page_idx,
> -                                 ((page_align + read) & ~PAGE_MASK));
> -       if (ret == -ENOENT)
> -               ret = 0;
> -       hit_stripe = this_len < len;
> -       was_short = ret >= 0 && ret < this_len;
> -       dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
> -            ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
> -
> -       i_size = i_size_read(inode);
> -       if (ret >= 0) {
> -               if (was_short && (pos + ret < i_size)) {
> -                       int zlen = min(this_len - ret, i_size - pos - ret);
> -                       int zoff = page_align + read + ret;
> -                       dout(" zero gap %llu to %llu\n",
> -                            pos + ret, pos + ret + zlen);
> -                       ceph_zero_page_vector_range(zoff, zlen, pages);
> -                       ret += zlen;
> -               }
> -
> -               read += ret;
> -               pos += ret;
> -               len -= ret;
> -
> -               /* hit stripe and need continue*/
> -               if (len && hit_stripe && pos < i_size)
> -                       goto more;
> -       }
> -
> -       if (read > 0) {
> -               ret = read;
> -               /* did we bounce off eof? */
> -               if (pos + len > i_size)
> -                       *checkeof = CHECK_EOF;
> -       }
> -
> -       dout("striped_read returns %d\n", ret);
> -       return ret;
> -}
> -
>  /*
>   * Completely synchronous read and write methods.  Direct from __user
>   * buffer to osd, or directly to user pages (if O_DIRECT).
> @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode,
>   * If the read spans object boundary, just do multiple reads.
>   */
>  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> -                             int *checkeof)
> +                             int *retry_op)
>  {
>         struct file *file = iocb->ki_filp;
>         struct inode *inode = file_inode(file);
> -       struct page **pages;
> -       u64 off = iocb->ki_pos;
> -       int num_pages;
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> +       struct ceph_osd_client *osdc = &fsc->client->osdc;
>         ssize_t ret;
> -       size_t len = iov_iter_count(to);
> +       u64 off = iocb->ki_pos;
> +       u64 len = iov_iter_count(to);
>
>         dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>              (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>          * but it will at least behave sensibly when they are
>          * in sequence.
>          */
> -       ret = filemap_write_and_wait_range(inode->i_mapping, off,
> -                                               off + len);
> +       ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
>         if (ret < 0)
>                 return ret;
>
> -       if (unlikely(to->type & ITER_PIPE)) {
> +       ret = 0;
> +       while ((len = iov_iter_count(to)) > 0) {
> +               struct ceph_osd_request *req;
> +               struct page **pages;
> +               int num_pages;
>                 size_t page_off;
> -               ret = iov_iter_get_pages_alloc(to, &pages, len,
> -                                              &page_off);
> -               if (ret <= 0)
> -                       return -ENOMEM;
> -               num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +               u64 i_size;
> +               bool more;
> +
> +               req = ceph_osdc_new_request(osdc, &ci->i_layout,
> +                                       ci->i_vino, off, &len, 0, 1,
> +                                       CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> +                                       NULL, ci->i_truncate_seq,
> +                                       ci->i_truncate_size, false);
> +               if (IS_ERR(req)) {
> +                       ret = PTR_ERR(req);
> +                       break;
> +               }
>
> -               ret = striped_read(inode, off, ret, pages, num_pages,
> -                                  page_off, checkeof);
> -               if (ret > 0) {
> -                       iov_iter_advance(to, ret);
> -                       off += ret;
> +               more = len < iov_iter_count(to);
> +
> +               if (unlikely(to->type & ITER_PIPE)) {
> +                       ret = iov_iter_get_pages_alloc(to, &pages, len,
> +                                                      &page_off);
> +                       if (ret <= 0) {
> +                               ceph_osdc_put_request(req);
> +                               ret = -ENOMEM;
> +                               break;
> +                       }
> +                       num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +                       if (ret < len) {
> +                               len = ret;
> +                               osd_req_op_extent_update(req, 0, len);
> +                               more = false;
> +                       }
>                 } else {
> -                       iov_iter_advance(to, 0);
> +                       num_pages = calc_pages_for(off, len);
> +                       page_off = off & ~PAGE_MASK;
> +                       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> +                       if (IS_ERR(pages)) {
> +                               ceph_osdc_put_request(req);
> +                               ret = PTR_ERR(pages);
> +                               break;
> +                       }
>                 }
> -               ceph_put_page_vector(pages, num_pages, false);
> -       } else {
> -               num_pages = calc_pages_for(off, len);
> -               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> -               if (IS_ERR(pages))
> -                       return PTR_ERR(pages);
> -
> -               ret = striped_read(inode, off, len, pages, num_pages,
> -                                  (off & ~PAGE_MASK), checkeof);
> -               if (ret > 0) {
> -                       int l, k = 0;
> -                       size_t left = ret;
> -
> -                       while (left) {
> -                               size_t page_off = off & ~PAGE_MASK;
> -                               size_t copy = min_t(size_t, left,
> -                                                   PAGE_SIZE - page_off);
> -                               l = copy_page_to_iter(pages[k++], page_off,
> -                                                     copy, to);
> -                               off += l;
> -                               left -= l;
> -                               if (l < copy)
> +
> +               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> +                                                false, false);
> +               ret = ceph_osdc_start_request(osdc, req, false);
> +               if (!ret)
> +                       ret = ceph_osdc_wait_request(osdc, req);
> +               ceph_osdc_put_request(req);
> +
> +               i_size = i_size_read(inode);
> +               dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
> +                    off, len, ret, i_size, (more ? " MORE" : ""));
> +
> +               if (ret == -ENOENT)
> +                       ret = 0;
> +               if (ret >= 0 && ret < len && (off + ret < i_size)) {
> +                       int zlen = min(len - ret, i_size - off - ret);
> +                       int zoff = page_off + ret;
> +                       dout("sync_read zero gap %llu~%llu\n",
> +                             off + ret, off + ret + zlen);
> +                       ceph_zero_page_vector_range(zoff, zlen, pages);
> +                       ret += zlen;
> +               }
> +
> +               if (unlikely(to->type & ITER_PIPE)) {
> +                       if (ret > 0) {
> +                               iov_iter_advance(to, ret);
> +                               off += ret;
> +                       } else {
> +                               iov_iter_advance(to, 0);
> +                       }
> +                       ceph_put_page_vector(pages, num_pages, false);
> +               } else {
> +                       int idx = 0;
> +                       size_t left = ret > 0 ? ret : 0;
> +                       while (left > 0) {
> +                               size_t len, copied;
> +                               page_off = off & ~PAGE_MASK;
> +                               len = min_t(size_t, left, PAGE_SIZE - page_off);
> +                               copied = copy_page_to_iter(pages[idx++],
> +                                                          page_off, len, to);
> +                               off += copied;
> +                               left -= copied;
> +                               if (copied < len) {
> +                                       ret = -EFAULT;
>                                         break;
> +                               }
>                         }
> +                       ceph_release_page_vector(pages, num_pages);
>                 }
> -               ceph_release_page_vector(pages, num_pages);
> +
> +               if (ret <= 0 || off >= i_size || !more)
> +                       break;
>         }
>
>         if (off > iocb->ki_pos) {
> +               if (ret >= 0 &&
> +                   iov_iter_count(to) > 0 && off >= i_size_read(inode))
> +                       *retry_op = CHECK_EOF;
>                 ret = off - iocb->ki_pos;
>                 iocb->ki_pos = off;
>         }
>
> -       dout("sync_read result %zd\n", ret);
> +       dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
>         return ret;
>  }

This is a step forward -- a fixed size allocation vs something
arbitrary up to 2G worth of pages (MAX_RW_COUNT).  But upon deeper
look, why allocate any temporary pages at all?  Why not just read
directly into iov_iter instead, like you do in the ITER_PIPE case?

I don't see a reason to keep special casing ITER_PIPE.

Thanks,

                Ilya
Ilya Dryomov Oct. 8, 2018, 4:46 p.m. UTC | #2
On Mon, Oct 8, 2018 at 3:16 AM Yan, Zheng <zyan@redhat.com> wrote:
>
>
>
> On Oct 3, 2018, at 16:31, Ilya Dryomov <idryomov@gmail.com> wrote:
>
> On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@redhat.com> wrote:
>
>
> Make ceph_sync_read() do OSD request per object. This avoid large memory
> allocation during large read.
>
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
> fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
> 1 file changed, 101 insertions(+), 112 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 92ab20433682..79c610cb5f54 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -556,75 +556,6 @@ enum {
>        READ_INLINE =  3,
> };
>
> -/*
> - * Read a range of bytes striped over one or more objects.  Iterate over
> - * objects we stripe over.  (That's not atomic, but good enough for now.)
> - *
> - * If we get a short result from the OSD, check against i_size; we need to
> - * only return a short read to the caller if we hit EOF.
> - */
> -static int striped_read(struct inode *inode,
> -                       u64 pos, u64 len,
> -                       struct page **pages, int num_pages,
> -                       int page_align, int *checkeof)
> -{
> -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> -       struct ceph_inode_info *ci = ceph_inode(inode);
> -       u64 this_len;
> -       loff_t i_size;
> -       int page_idx;
> -       int ret, read = 0;
> -       bool hit_stripe, was_short;
> -
> -       /*
> -        * we may need to do multiple reads.  not atomic, unfortunately.
> -        */
> -more:
> -       this_len = len;
> -       page_idx = (page_align + read) >> PAGE_SHIFT;
> -       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
> -                                 &ci->i_layout, pos, &this_len,
> -                                 ci->i_truncate_seq, ci->i_truncate_size,
> -                                 pages + page_idx, num_pages - page_idx,
> -                                 ((page_align + read) & ~PAGE_MASK));
> -       if (ret == -ENOENT)
> -               ret = 0;
> -       hit_stripe = this_len < len;
> -       was_short = ret >= 0 && ret < this_len;
> -       dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
> -            ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
> -
> -       i_size = i_size_read(inode);
> -       if (ret >= 0) {
> -               if (was_short && (pos + ret < i_size)) {
> -                       int zlen = min(this_len - ret, i_size - pos - ret);
> -                       int zoff = page_align + read + ret;
> -                       dout(" zero gap %llu to %llu\n",
> -                            pos + ret, pos + ret + zlen);
> -                       ceph_zero_page_vector_range(zoff, zlen, pages);
> -                       ret += zlen;
> -               }
> -
> -               read += ret;
> -               pos += ret;
> -               len -= ret;
> -
> -               /* hit stripe and need continue*/
> -               if (len && hit_stripe && pos < i_size)
> -                       goto more;
> -       }
> -
> -       if (read > 0) {
> -               ret = read;
> -               /* did we bounce off eof? */
> -               if (pos + len > i_size)
> -                       *checkeof = CHECK_EOF;
> -       }
> -
> -       dout("striped_read returns %d\n", ret);
> -       return ret;
> -}
> -
> /*
>  * Completely synchronous read and write methods. Direct from __user
>  * buffer to osd, or directly to user pages (if O_DIRECT).
> @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode,
>  * If the read spans object boundary, just do multiple reads.
>  */
> static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> -                             int *checkeof)
> +                             int *retry_op)
> {
>        struct file *file = iocb->ki_filp;
>        struct inode *inode = file_inode(file);
> -       struct page **pages;
> -       u64 off = iocb->ki_pos;
> -       int num_pages;
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> +       struct ceph_osd_client *osdc = &fsc->client->osdc;
>        ssize_t ret;
> -       size_t len = iov_iter_count(to);
> +       u64 off = iocb->ki_pos;
> +       u64 len = iov_iter_count(to);
>
>        dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>         * but it will at least behave sensibly when they are
>         * in sequence.
>         */
> -       ret = filemap_write_and_wait_range(inode->i_mapping, off,
> -                                               off + len);
> +       ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
>        if (ret < 0)
>                return ret;
>
> -       if (unlikely(to->type & ITER_PIPE)) {
> +       ret = 0;
> +       while ((len = iov_iter_count(to)) > 0) {
> +               struct ceph_osd_request *req;
> +               struct page **pages;
> +               int num_pages;
>                size_t page_off;
> -               ret = iov_iter_get_pages_alloc(to, &pages, len,
> -                                              &page_off);
> -               if (ret <= 0)
> -                       return -ENOMEM;
> -               num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +               u64 i_size;
> +               bool more;
> +
> +               req = ceph_osdc_new_request(osdc, &ci->i_layout,
> +                                       ci->i_vino, off, &len, 0, 1,
> +                                       CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> +                                       NULL, ci->i_truncate_seq,
> +                                       ci->i_truncate_size, false);
> +               if (IS_ERR(req)) {
> +                       ret = PTR_ERR(req);
> +                       break;
> +               }
>
> -               ret = striped_read(inode, off, ret, pages, num_pages,
> -                                  page_off, checkeof);
> -               if (ret > 0) {
> -                       iov_iter_advance(to, ret);
> -                       off += ret;
> +               more = len < iov_iter_count(to);
> +
> +               if (unlikely(to->type & ITER_PIPE)) {
> +                       ret = iov_iter_get_pages_alloc(to, &pages, len,
> +                                                      &page_off);
> +                       if (ret <= 0) {
> +                               ceph_osdc_put_request(req);
> +                               ret = -ENOMEM;
> +                               break;
> +                       }
> +                       num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +                       if (ret < len) {
> +                               len = ret;
> +                               osd_req_op_extent_update(req, 0, len);
> +                               more = false;
> +                       }
>                } else {
> -                       iov_iter_advance(to, 0);
> +                       num_pages = calc_pages_for(off, len);
> +                       page_off = off & ~PAGE_MASK;
> +                       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> +                       if (IS_ERR(pages)) {
> +                               ceph_osdc_put_request(req);
> +                               ret = PTR_ERR(pages);
> +                               break;
> +                       }
>                }
> -               ceph_put_page_vector(pages, num_pages, false);
> -       } else {
> -               num_pages = calc_pages_for(off, len);
> -               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> -               if (IS_ERR(pages))
> -                       return PTR_ERR(pages);
> -
> -               ret = striped_read(inode, off, len, pages, num_pages,
> -                                  (off & ~PAGE_MASK), checkeof);
> -               if (ret > 0) {
> -                       int l, k = 0;
> -                       size_t left = ret;
> -
> -                       while (left) {
> -                               size_t page_off = off & ~PAGE_MASK;
> -                               size_t copy = min_t(size_t, left,
> -                                                   PAGE_SIZE - page_off);
> -                               l = copy_page_to_iter(pages[k++], page_off,
> -                                                     copy, to);
> -                               off += l;
> -                               left -= l;
> -                               if (l < copy)
> +
> +               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> +                                                false, false);
> +               ret = ceph_osdc_start_request(osdc, req, false);
> +               if (!ret)
> +                       ret = ceph_osdc_wait_request(osdc, req);
> +               ceph_osdc_put_request(req);
> +
> +               i_size = i_size_read(inode);
> +               dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
> +                    off, len, ret, i_size, (more ? " MORE" : ""));
> +
> +               if (ret == -ENOENT)
> +                       ret = 0;
> +               if (ret >= 0 && ret < len && (off + ret < i_size)) {
> +                       int zlen = min(len - ret, i_size - off - ret);
> +                       int zoff = page_off + ret;
> +                       dout("sync_read zero gap %llu~%llu\n",
> +                             off + ret, off + ret + zlen);
> +                       ceph_zero_page_vector_range(zoff, zlen, pages);
> +                       ret += zlen;
> +               }
> +
> +               if (unlikely(to->type & ITER_PIPE)) {
> +                       if (ret > 0) {
> +                               iov_iter_advance(to, ret);
> +                               off += ret;
> +                       } else {
> +                               iov_iter_advance(to, 0);
> +                       }
> +                       ceph_put_page_vector(pages, num_pages, false);
> +               } else {
> +                       int idx = 0;
> +                       size_t left = ret > 0 ? ret : 0;
> +                       while (left > 0) {
> +                               size_t len, copied;
> +                               page_off = off & ~PAGE_MASK;
> +                               len = min_t(size_t, left, PAGE_SIZE - page_off);
> +                               copied = copy_page_to_iter(pages[idx++],
> +                                                          page_off, len, to);
> +                               off += copied;
> +                               left -= copied;
> +                               if (copied < len) {
> +                                       ret = -EFAULT;
>                                        break;
> +                               }
>                        }
> +                       ceph_release_page_vector(pages, num_pages);
>                }
> -               ceph_release_page_vector(pages, num_pages);
> +
> +               if (ret <= 0 || off >= i_size || !more)
> +                       break;
>        }
>
>        if (off > iocb->ki_pos) {
> +               if (ret >= 0 &&
> +                   iov_iter_count(to) > 0 && off >= i_size_read(inode))
> +                       *retry_op = CHECK_EOF;
>                ret = off - iocb->ki_pos;
>                iocb->ki_pos = off;
>        }
>
> -       dout("sync_read result %zd\n", ret);
> +       dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
>        return ret;
> }
>
>
> This is a step forward -- a fixed size allocation vs something
> arbitrary up to 2G worth of pages (MAX_RW_COUNT).  But upon deeper
> look, why allocate any temporary pages at all?  Why not just read
> directly into iov_iter instead, like you do in the ITER_PIPE case?
>
> I don't see a reason to keep special casing ITER_PIPE.
>
>
> Sorry for the delay.  This is not direct-IO , I think reading directly into iov_iter is not good.

Ah, I missed that, although it looks like it might get called in the
O_DIRECT case too, on a retry after EOF?

Thanks,

                Ilya
Yan, Zheng Oct. 9, 2018, 1:31 a.m. UTC | #3
On Tue, Oct 9, 2018 at 12:48 AM Ilya Dryomov <idryomov@gmail.com> wrote:
>
> On Mon, Oct 8, 2018 at 3:16 AM Yan, Zheng <zyan@redhat.com> wrote:
> >
> >
> >
> > On Oct 3, 2018, at 16:31, Ilya Dryomov <idryomov@gmail.com> wrote:
> >
> > On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@redhat.com> wrote:
> >
> >
> > Make ceph_sync_read() do OSD request per object. This avoid large memory
> > allocation during large read.
> >
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> > fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
> > 1 file changed, 101 insertions(+), 112 deletions(-)
> >
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 92ab20433682..79c610cb5f54 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -556,75 +556,6 @@ enum {
> >        READ_INLINE =  3,
> > };
> >
> > -/*
> > - * Read a range of bytes striped over one or more objects.  Iterate over
> > - * objects we stripe over.  (That's not atomic, but good enough for now.)
> > - *
> > - * If we get a short result from the OSD, check against i_size; we need to
> > - * only return a short read to the caller if we hit EOF.
> > - */
> > -static int striped_read(struct inode *inode,
> > -                       u64 pos, u64 len,
> > -                       struct page **pages, int num_pages,
> > -                       int page_align, int *checkeof)
> > -{
> > -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > -       struct ceph_inode_info *ci = ceph_inode(inode);
> > -       u64 this_len;
> > -       loff_t i_size;
> > -       int page_idx;
> > -       int ret, read = 0;
> > -       bool hit_stripe, was_short;
> > -
> > -       /*
> > -        * we may need to do multiple reads.  not atomic, unfortunately.
> > -        */
> > -more:
> > -       this_len = len;
> > -       page_idx = (page_align + read) >> PAGE_SHIFT;
> > -       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
> > -                                 &ci->i_layout, pos, &this_len,
> > -                                 ci->i_truncate_seq, ci->i_truncate_size,
> > -                                 pages + page_idx, num_pages - page_idx,
> > -                                 ((page_align + read) & ~PAGE_MASK));
> > -       if (ret == -ENOENT)
> > -               ret = 0;
> > -       hit_stripe = this_len < len;
> > -       was_short = ret >= 0 && ret < this_len;
> > -       dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
> > -            ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
> > -
> > -       i_size = i_size_read(inode);
> > -       if (ret >= 0) {
> > -               if (was_short && (pos + ret < i_size)) {
> > -                       int zlen = min(this_len - ret, i_size - pos - ret);
> > -                       int zoff = page_align + read + ret;
> > -                       dout(" zero gap %llu to %llu\n",
> > -                            pos + ret, pos + ret + zlen);
> > -                       ceph_zero_page_vector_range(zoff, zlen, pages);
> > -                       ret += zlen;
> > -               }
> > -
> > -               read += ret;
> > -               pos += ret;
> > -               len -= ret;
> > -
> > -               /* hit stripe and need continue*/
> > -               if (len && hit_stripe && pos < i_size)
> > -                       goto more;
> > -       }
> > -
> > -       if (read > 0) {
> > -               ret = read;
> > -               /* did we bounce off eof? */
> > -               if (pos + len > i_size)
> > -                       *checkeof = CHECK_EOF;
> > -       }
> > -
> > -       dout("striped_read returns %d\n", ret);
> > -       return ret;
> > -}
> > -
> > /*
> >  * Completely synchronous read and write methods. Direct from __user
> >  * buffer to osd, or directly to user pages (if O_DIRECT).
> > @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode,
> >  * If the read spans object boundary, just do multiple reads.
> >  */
> > static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> > -                             int *checkeof)
> > +                             int *retry_op)
> > {
> >        struct file *file = iocb->ki_filp;
> >        struct inode *inode = file_inode(file);
> > -       struct page **pages;
> > -       u64 off = iocb->ki_pos;
> > -       int num_pages;
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > +       struct ceph_osd_client *osdc = &fsc->client->osdc;
> >        ssize_t ret;
> > -       size_t len = iov_iter_count(to);
> > +       u64 off = iocb->ki_pos;
> > +       u64 len = iov_iter_count(to);
> >
> >        dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
> >             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> > @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> >         * but it will at least behave sensibly when they are
> >         * in sequence.
> >         */
> > -       ret = filemap_write_and_wait_range(inode->i_mapping, off,
> > -                                               off + len);
> > +       ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
> >        if (ret < 0)
> >                return ret;
> >
> > -       if (unlikely(to->type & ITER_PIPE)) {
> > +       ret = 0;
> > +       while ((len = iov_iter_count(to)) > 0) {
> > +               struct ceph_osd_request *req;
> > +               struct page **pages;
> > +               int num_pages;
> >                size_t page_off;
> > -               ret = iov_iter_get_pages_alloc(to, &pages, len,
> > -                                              &page_off);
> > -               if (ret <= 0)
> > -                       return -ENOMEM;
> > -               num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> > +               u64 i_size;
> > +               bool more;
> > +
> > +               req = ceph_osdc_new_request(osdc, &ci->i_layout,
> > +                                       ci->i_vino, off, &len, 0, 1,
> > +                                       CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> > +                                       NULL, ci->i_truncate_seq,
> > +                                       ci->i_truncate_size, false);
> > +               if (IS_ERR(req)) {
> > +                       ret = PTR_ERR(req);
> > +                       break;
> > +               }
> >
> > -               ret = striped_read(inode, off, ret, pages, num_pages,
> > -                                  page_off, checkeof);
> > -               if (ret > 0) {
> > -                       iov_iter_advance(to, ret);
> > -                       off += ret;
> > +               more = len < iov_iter_count(to);
> > +
> > +               if (unlikely(to->type & ITER_PIPE)) {
> > +                       ret = iov_iter_get_pages_alloc(to, &pages, len,
> > +                                                      &page_off);
> > +                       if (ret <= 0) {
> > +                               ceph_osdc_put_request(req);
> > +                               ret = -ENOMEM;
> > +                               break;
> > +                       }
> > +                       num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> > +                       if (ret < len) {
> > +                               len = ret;
> > +                               osd_req_op_extent_update(req, 0, len);
> > +                               more = false;
> > +                       }
> >                } else {
> > -                       iov_iter_advance(to, 0);
> > +                       num_pages = calc_pages_for(off, len);
> > +                       page_off = off & ~PAGE_MASK;
> > +                       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> > +                       if (IS_ERR(pages)) {
> > +                               ceph_osdc_put_request(req);
> > +                               ret = PTR_ERR(pages);
> > +                               break;
> > +                       }
> >                }
> > -               ceph_put_page_vector(pages, num_pages, false);
> > -       } else {
> > -               num_pages = calc_pages_for(off, len);
> > -               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> > -               if (IS_ERR(pages))
> > -                       return PTR_ERR(pages);
> > -
> > -               ret = striped_read(inode, off, len, pages, num_pages,
> > -                                  (off & ~PAGE_MASK), checkeof);
> > -               if (ret > 0) {
> > -                       int l, k = 0;
> > -                       size_t left = ret;
> > -
> > -                       while (left) {
> > -                               size_t page_off = off & ~PAGE_MASK;
> > -                               size_t copy = min_t(size_t, left,
> > -                                                   PAGE_SIZE - page_off);
> > -                               l = copy_page_to_iter(pages[k++], page_off,
> > -                                                     copy, to);
> > -                               off += l;
> > -                               left -= l;
> > -                               if (l < copy)
> > +
> > +               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> > +                                                false, false);
> > +               ret = ceph_osdc_start_request(osdc, req, false);
> > +               if (!ret)
> > +                       ret = ceph_osdc_wait_request(osdc, req);
> > +               ceph_osdc_put_request(req);
> > +
> > +               i_size = i_size_read(inode);
> > +               dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
> > +                    off, len, ret, i_size, (more ? " MORE" : ""));
> > +
> > +               if (ret == -ENOENT)
> > +                       ret = 0;
> > +               if (ret >= 0 && ret < len && (off + ret < i_size)) {
> > +                       int zlen = min(len - ret, i_size - off - ret);
> > +                       int zoff = page_off + ret;
> > +                       dout("sync_read zero gap %llu~%llu\n",
> > +                             off + ret, off + ret + zlen);
> > +                       ceph_zero_page_vector_range(zoff, zlen, pages);
> > +                       ret += zlen;
> > +               }
> > +
> > +               if (unlikely(to->type & ITER_PIPE)) {
> > +                       if (ret > 0) {
> > +                               iov_iter_advance(to, ret);
> > +                               off += ret;
> > +                       } else {
> > +                               iov_iter_advance(to, 0);
> > +                       }
> > +                       ceph_put_page_vector(pages, num_pages, false);
> > +               } else {
> > +                       int idx = 0;
> > +                       size_t left = ret > 0 ? ret : 0;
> > +                       while (left > 0) {
> > +                               size_t len, copied;
> > +                               page_off = off & ~PAGE_MASK;
> > +                               len = min_t(size_t, left, PAGE_SIZE - page_off);
> > +                               copied = copy_page_to_iter(pages[idx++],
> > +                                                          page_off, len, to);
> > +                               off += copied;
> > +                               left -= copied;
> > +                               if (copied < len) {
> > +                                       ret = -EFAULT;
> >                                        break;
> > +                               }
> >                        }
> > +                       ceph_release_page_vector(pages, num_pages);
> >                }
> > -               ceph_release_page_vector(pages, num_pages);
> > +
> > +               if (ret <= 0 || off >= i_size || !more)
> > +                       break;
> >        }
> >
> >        if (off > iocb->ki_pos) {
> > +               if (ret >= 0 &&
> > +                   iov_iter_count(to) > 0 && off >= i_size_read(inode))
> > +                       *retry_op = CHECK_EOF;
> >                ret = off - iocb->ki_pos;
> >                iocb->ki_pos = off;
> >        }
> >
> > -       dout("sync_read result %zd\n", ret);
> > +       dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
> >        return ret;
> > }
> >
> >
> > This is a step forward -- a fixed size allocation vs something
> > arbitrary up to 2G worth of pages (MAX_RW_COUNT).  But upon deeper
> > look, why allocate any temporary pages at all?  Why not just read
> > directly into iov_iter instead, like you do in the ITER_PIPE case?
> >
> > I don't see a reason to keep special casing ITER_PIPE.
> >
> >
> > Sorry for the delay.  This is not direct-IO , I think reading directly into iov_iter is not good.
>
> Ah, I missed that, although it looks like it might get called in the
> O_DIRECT case too, on a retry after EOF?
>

Yes. But we need to use bio vector for direct read (see commit
fc218544fbc80). The problem is there is no helper function that copies
data bio vector into iov_iter.

Regards
Yan, Zheng


> Thanks,
>
>                 Ilya
diff mbox series

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 92ab20433682..79c610cb5f54 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -556,75 +556,6 @@  enum {
 	READ_INLINE =  3,
 };
 
-/*
- * Read a range of bytes striped over one or more objects.  Iterate over
- * objects we stripe over.  (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- */
-static int striped_read(struct inode *inode,
-			u64 pos, u64 len,
-			struct page **pages, int num_pages,
-			int page_align, int *checkeof)
-{
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 this_len;
-	loff_t i_size;
-	int page_idx;
-	int ret, read = 0;
-	bool hit_stripe, was_short;
-
-	/*
-	 * we may need to do multiple reads.  not atomic, unfortunately.
-	 */
-more:
-	this_len = len;
-	page_idx = (page_align + read) >> PAGE_SHIFT;
-	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
-				  &ci->i_layout, pos, &this_len,
-				  ci->i_truncate_seq, ci->i_truncate_size,
-				  pages + page_idx, num_pages - page_idx,
-				  ((page_align + read) & ~PAGE_MASK));
-	if (ret == -ENOENT)
-		ret = 0;
-	hit_stripe = this_len < len;
-	was_short = ret >= 0 && ret < this_len;
-	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
-	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
-	i_size = i_size_read(inode);
-	if (ret >= 0) {
-		if (was_short && (pos + ret < i_size)) {
-			int zlen = min(this_len - ret, i_size - pos - ret);
-			int zoff = page_align + read + ret;
-			dout(" zero gap %llu to %llu\n",
-			     pos + ret, pos + ret + zlen);
-			ceph_zero_page_vector_range(zoff, zlen, pages);
-			ret += zlen;
-		}
-
-		read += ret;
-		pos += ret;
-		len -= ret;
-
-		/* hit stripe and need continue*/
-		if (len && hit_stripe && pos < i_size)
-			goto more;
-	}
-
-	if (read > 0) {
-		ret = read;
-		/* did we bounce off eof? */
-		if (pos + len > i_size)
-			*checkeof = CHECK_EOF;
-	}
-
-	dout("striped_read returns %d\n", ret);
-	return ret;
-}
-
 /*
  * Completely synchronous read and write methods.  Direct from __user
  * buffer to osd, or directly to user pages (if O_DIRECT).
@@ -632,15 +563,16 @@  static int striped_read(struct inode *inode,
  * If the read spans object boundary, just do multiple reads.
  */
 static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
-			      int *checkeof)
+			      int *retry_op)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
-	struct page **pages;
-	u64 off = iocb->ki_pos;
-	int num_pages;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	ssize_t ret;
-	size_t len = iov_iter_count(to);
+	u64 off = iocb->ki_pos;
+	u64 len = iov_iter_count(to);
 
 	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -653,61 +585,118 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 	 * but it will at least behave sensibly when they are
 	 * in sequence.
 	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, off,
-						off + len);
+	ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
 	if (ret < 0)
 		return ret;
 
-	if (unlikely(to->type & ITER_PIPE)) {
+	ret = 0;
+	while ((len = iov_iter_count(to)) > 0) {
+		struct ceph_osd_request *req;
+		struct page **pages;
+		int num_pages;
 		size_t page_off;
-		ret = iov_iter_get_pages_alloc(to, &pages, len,
-					       &page_off);
-		if (ret <= 0)
-			return -ENOMEM;
-		num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+		u64 i_size;
+		bool more;
+
+		req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					ci->i_vino, off, &len, 0, 1,
+					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+					NULL, ci->i_truncate_seq,
+					ci->i_truncate_size, false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			break;
+		}
 
-		ret = striped_read(inode, off, ret, pages, num_pages,
-				   page_off, checkeof);
-		if (ret > 0) {
-			iov_iter_advance(to, ret);
-			off += ret;
+		more = len < iov_iter_count(to);
+
+		if (unlikely(to->type & ITER_PIPE)) {
+			ret = iov_iter_get_pages_alloc(to, &pages, len,
+						       &page_off);
+			if (ret <= 0) {
+				ceph_osdc_put_request(req);
+				ret = -ENOMEM;
+				break;
+			}
+			num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+			if (ret < len) {
+				len = ret;
+				osd_req_op_extent_update(req, 0, len);
+				more = false;
+			}
 		} else {
-			iov_iter_advance(to, 0);
+			num_pages = calc_pages_for(off, len);
+			page_off = off & ~PAGE_MASK;
+			pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+			if (IS_ERR(pages)) {
+				ceph_osdc_put_request(req);
+				ret = PTR_ERR(pages);
+				break;
+			}
 		}
-		ceph_put_page_vector(pages, num_pages, false);
-	} else {
-		num_pages = calc_pages_for(off, len);
-		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-		if (IS_ERR(pages))
-			return PTR_ERR(pages);
-
-		ret = striped_read(inode, off, len, pages, num_pages,
-				   (off & ~PAGE_MASK), checkeof);
-		if (ret > 0) {
-			int l, k = 0;
-			size_t left = ret;
-
-			while (left) {
-				size_t page_off = off & ~PAGE_MASK;
-				size_t copy = min_t(size_t, left,
-						    PAGE_SIZE - page_off);
-				l = copy_page_to_iter(pages[k++], page_off,
-						      copy, to);
-				off += l;
-				left -= l;
-				if (l < copy)
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+						 false, false);
+		ret = ceph_osdc_start_request(osdc, req, false);
+		if (!ret)
+			ret = ceph_osdc_wait_request(osdc, req);
+		ceph_osdc_put_request(req);
+
+		i_size = i_size_read(inode);
+		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
+		     off, len, ret, i_size, (more ? " MORE" : ""));
+
+		if (ret == -ENOENT)
+			ret = 0;
+		if (ret >= 0 && ret < len && (off + ret < i_size)) {
+			int zlen = min(len - ret, i_size - off - ret);
+			int zoff = page_off + ret;
+			dout("sync_read zero gap %llu~%llu\n",
+                             off + ret, off + ret + zlen);
+			ceph_zero_page_vector_range(zoff, zlen, pages);
+			ret += zlen;
+		}
+
+		if (unlikely(to->type & ITER_PIPE)) {
+			if (ret > 0) {
+				iov_iter_advance(to, ret);
+				off += ret;
+			} else {
+				iov_iter_advance(to, 0);
+			}
+			ceph_put_page_vector(pages, num_pages, false);
+		} else {
+			int idx = 0;
+			size_t left = ret > 0 ? ret : 0;
+			while (left > 0) {
+				size_t len, copied;
+				page_off = off & ~PAGE_MASK;
+				len = min_t(size_t, left, PAGE_SIZE - page_off);
+				copied = copy_page_to_iter(pages[idx++],
+							   page_off, len, to);
+				off += copied;
+				left -= copied;
+				if (copied < len) {
+					ret = -EFAULT;
 					break;
+				}
 			}
+			ceph_release_page_vector(pages, num_pages);
 		}
-		ceph_release_page_vector(pages, num_pages);
+
+		if (ret <= 0 || off >= i_size || !more)
+			break;
 	}
 
 	if (off > iocb->ki_pos) {
+		if (ret >= 0 &&
+		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
+			*retry_op = CHECK_EOF;
 		ret = off - iocb->ki_pos;
 		iocb->ki_pos = off;
 	}
 
-	dout("sync_read result %zd\n", ret);
+	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
 	return ret;
 }