[RFC,v2,09/11] ceph: convert readpages to fscache_read_helper

Message ID 20200731130421.127022-10-jlayton@kernel.org (mailing list archive)
State New, archived
Series ceph: convert to new FSCache API

Commit Message

Jeff Layton July 31, 2020, 1:04 p.m. UTC
Convert ceph_readpages to use the fscache_read_helper. With this we can
rip out a lot of the old readpage/readpages infrastructure.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 209 +++++++------------------------------------------
 1 file changed, 28 insertions(+), 181 deletions(-)

Comments

David Wysochanski Aug. 9, 2020, 3:09 p.m. UTC | #1
On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> Convert ceph_readpages to use the fscache_read_helper. With this we can
> rip out a lot of the old readpage/readpages infrastructure.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> [...]
> +       int max = fsc->mount_options->rsize >> PAGE_SHIFT;

Have you run tests with different values of rsize?
Specifically, rsize < readahead_size == size_of_readpages

I'm seeing a lot of problems with NFS when varying rsize values are
used wrt readahead.  Specifically I'm seeing panics because fscache
expects a 1:1 mapping of issue_op() to io_done() calls, and multiple
read completions end up trying to unlock the same pages inside
fscache_read_done().

My understanding is afs does not have such an 'rsize' limitation, so it
may not be an area that is well tested.  It could be my implementation
of the NFS conversion though, as I think what needs to happen is to
respect the above 1:1 mapping of issue_op() to io_done() calls, and my
initial implementation did not do that.
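
To make the contract concrete: if rsize forces the netfs to split one
fscache-level request into several wire RPCs, the completions have to
be aggregated so that io_done() still fires exactly once.  A minimal
sketch of what I think is needed (the tracker struct and helper name
are hypothetical; the io_done() hook and the request's error field are
the ones named in this thread):

#include <linux/atomic.h>
#include <linux/slab.h>

/*
 * One tracker per fscache_io_request, shared by the several wire
 * reads needed when rsize < the shaped request length.  Set
 * "outstanding" to the number of RPCs before issuing any of them.
 */
struct netfs_subread_tracker {
	struct fscache_io_request *req;	/* the single fscache-level request */
	atomic_t outstanding;		/* wire reads still in flight */
	int error;			/* first error seen, if any */
};

static void netfs_one_subread_done(struct netfs_subread_tracker *t, int error)
{
	if (error)
		cmpxchg(&t->error, 0, error);	/* keep only the first error */

	/* Only the final completion may report back to fscache. */
	if (atomic_dec_and_test(&t->outstanding)) {
		t->req->error = t->error;
		t->req->io_done(t->req);	/* exactly one io_done() call */
		kfree(t);
	}
}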

FWIW, specifically this unit test was originally failing for me with a panic.
Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable cachefilesd
Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1
Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero of=/mnt/dir1/file1.bin bs=65536 count=1
Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 > /proc/sys/vm/drop_caches
Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh set /mnt/dir1 65536
Sun 09 Aug 2020 11:03:23 AM EDT: 6. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null
Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 > /proc/sys/vm/drop_caches
Sun 09 Aug 2020 11:03:23 AM EDT: 9. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null



David Wysochanski Aug. 9, 2020, 6:06 p.m. UTC | #2
On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote:
> +static int ceph_readpages(struct file *file, struct address_space *mapping,
> +                         struct list_head *page_list, unsigned nr_pages)
>  {
...
> +       int max = fsc->mount_options->rsize >> PAGE_SHIFT;
...
> +               ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max);

Looks like the root of my problems is that the 'max_pages' parameter
given to fscache_read_helper_page_list() does not work for purposes of
limiting the IO to the 'rsize'.  That is, the fscache_io_request.nr_pages
exceeds 'max_pages' and becomes readahead_size.  So even though
max_pages is based on 'rsize', when issue_op() is called, it is for a
fscache_io_request that exceeds 'rsize', resulting in multiple NFS
reads that go over the wire and multiple completions, each of
which ends up calling back into io_done(), which blows up
because fscache does not expect this.  Looks like
fscache_shape_request() overrides any 'max_pages'
value (actually it is cachefiles_shape_request), so it's
unclear why the netfs would pass in a 'max_pages' if it is
not honored - seems like a bug maybe, or it's not obvious
what the purpose is there.  I tried a custom 'shape' method
and got further, but it blew up on another test, so I'm not sure.

It would be good to know if this somehow works for you, but my guess is
you'll see similar failures when rsize < readahead_size == size_of_readpages.
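
For reference, the custom 'shape' method I tried amounted to clamping
the shaped extent to the rsize-derived limit, roughly as below.  This
is only a sketch: the shape field names match the traces later in this
thread, but the hook signature is a guess on my part, not the RFC's
actual API.

/*
 * Hypothetical netfs-side shape hook: never let one issue_op() span
 * more pages than a single rsize-sized wire read can fill.
 */
static void nfs_fscache_shape_request(struct fscache_io_request *req,
				      struct fscache_request_shape *shape)
{
	struct nfs_server *server = NFS_SERVER(req->mapping->host);
	unsigned int max_pages = server->rsize >> PAGE_SHIFT;

	if (shape->actual_nr_pages > max_pages)
		shape->actual_nr_pages = max_pages;
}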
David Howells Aug. 10, 2020, 10:09 a.m. UTC | #3
David Wysochanski <dwysocha@redhat.com> wrote:

> Looks like fscache_shape_request() overrides any 'max_pages' value (actually
> it is cachefiles_shape_request), so it's unclear why the netfs would pass
> in a 'max_pages' if it is not honored - seems like a bug maybe, or it's not
> obvious

I think the problem is that cachefiles_shape_request() is applying the limit
too early.  It's using it to cut down the number of pages in the original
request (only applicable to readpages), but then the shaping to fit cache
granules can exceed that, so it needs to be applied later also.

Does the attached patch help?

David
---
diff --git a/fs/cachefiles/content-map.c b/fs/cachefiles/content-map.c
index 2bfba2e41c39..ce05cf1d9a6e 100644
--- a/fs/cachefiles/content-map.c
+++ b/fs/cachefiles/content-map.c
@@ -134,7 +134,8 @@ void cachefiles_shape_request(struct fscache_object *obj,
 	_enter("{%lx,%lx,%x},%llx,%d",
 	       start, end, max_pages, i_size, shape->for_write);
 
-	if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE) {
+	if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE ||
+	    max_pages < CACHEFILES_GRAN_PAGES) {
 		shape->to_be_done = FSCACHE_READ_FROM_SERVER;
 		return;
 	}
@@ -144,10 +145,6 @@ void cachefiles_shape_request(struct fscache_object *obj,
 	if (shape->i_size > CACHEFILES_SIZE_LIMIT)
 		i_size = CACHEFILES_SIZE_LIMIT;
 
-	max_pages = round_down(max_pages, CACHEFILES_GRAN_PAGES);
-	if (end - start > max_pages)
-		end = start + max_pages;
-
 	granule = start / CACHEFILES_GRAN_PAGES;
 	if (granule / 8 >= object->content_map_size) {
 		cachefiles_expand_content_map(object, i_size);
@@ -185,6 +182,10 @@ void cachefiles_shape_request(struct fscache_object *obj,
 		start = round_down(start, CACHEFILES_GRAN_PAGES);
 		end   = round_up(end, CACHEFILES_GRAN_PAGES);
 
+		/* Trim to the maximum size the netfs supports */
+		if (end - start > max_pages)
+			end = round_down(start + max_pages, CACHEFILES_GRAN_PAGES);
+
 		/* But trim to the end of the file and the starting page */
 		eof = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		if (eof <= shape->proposed_start)
Jeff Layton Aug. 10, 2020, 11:09 a.m. UTC | #4
On Sun, 2020-08-09 at 11:09 -0400, David Wysochanski wrote:
> On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote:
> > Convert ceph_readpages to use the fscache_read_helper. With this we can
> > rip out a lot of the old readpage/readpages infrastructure.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > [...]
> > +       int max = fsc->mount_options->rsize >> PAGE_SHIFT;
> 
> Have you run tests with different values of rsize?
> Specifically, rsize < readahead_size == size_of_readpages
> 
> I'm seeing a lot of problems with NFS when varying rsize values are
> used wrt readahead.  Specifically I'm seeing panics because fscache
> expects a 1:1 mapping of issue_op() to io_done() calls, and multiple
> read completions end up trying to unlock the same pages inside
> fscache_read_done().
> 
> My understanding is afs does not have such an 'rsize' limitation, so it
> may not be an area that is well tested.  It could be my implementation
> of the NFS conversion though, as I think what needs to happen is to
> respect the above 1:1 mapping of issue_op() to io_done() calls, and my
> initial implementation did not do that.
> 
> FWIW, specifically this unit test was originally failing for me with a panic.
> Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable cachefilesd
> Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1
> Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero of=/mnt/dir1/file1.bin bs=65536 count=1
> Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 > /proc/sys/vm/drop_caches
> Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh set /mnt/dir1 65536
> Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 > /proc/sys/vm/drop_caches
> Sun 09 Aug 2020 11:03:23 AM EDT: 9. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null
> 
> 

I haven't tested much with varying rsize and wsize (setting them on
cephfs is pretty rare), but I'll plan to. What's in nfs-readahead.sh?


David Wysochanski Aug. 10, 2020, 12:24 p.m. UTC | #5
On Mon, Aug 10, 2020 at 7:09 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sun, 2020-08-09 at 11:09 -0400, David Wysochanski wrote:
> > [...]
>
> I haven't tested much with varying rsize and wsize (setting them on
> cephfs is pretty rare), but I'll plan to. What's in nfs-readahead.sh?
>
>

See attached.
David Wysochanski Aug. 10, 2020, 1:50 p.m. UTC | #6
On Mon, Aug 10, 2020 at 6:09 AM David Howells <dhowells@redhat.com> wrote:
>
> David Wysochanski <dwysocha@redhat.com> wrote:
>
> > Looks like fscache_shape_request() overrides any 'max_pages' value (actually
> > it is cachefiles_shape_request), so it's unclear why the netfs would pass
> > in a 'max_pages' if it is not honored - seems like a bug maybe, or it's not
> > obvious
>
> I think the problem is that cachefiles_shape_request() is applying the limit
> too early.  It's using it to cut down the number of pages in the original
> request (only applicable to readpages), but then the shaping to fit cache
> granules can exceed that, so it needs to be applied later also.
>
> Does the attached patch help?
>
> David
> [...]

I tried this and got the same panic - I think i_size is the culprit
(it is larger than max_pages).  I'll send you a larger trace offline
with cachefiles/fscache debugging enabled if that helps, but below is
some custom tracing that may be enough because it shows before / after
shaping values.
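
Walking the arithmetic in the trace below: rsize=8192 gives max_pages 2,
while the 16384-byte readahead proposes 4 pages.  The shaper still
returns actual_nr_pages 4, so a single issue_op() covers cache.len 16384,
NFS fills it with two 8192-byte wire reads, and each completion calls
back into io_done().  The second completion then unlocks pages the first
one already unlocked, which is the VM_BUG_ON_PAGE(!PageLocked(page))
panic at the end of the trace.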

Here's an outline of the test (smaller rsize and readahead for simplicity):
# ./t1_rsize_lt_read.sh 4.1
Setting NFS vers=4.1
Mon 10 Aug 2020 09:34:18 AM EDT: 1. On NFS client, install and enable cachefilesd
Mon 10 Aug 2020 09:34:18 AM EDT: 2. On NFS client, mount -o vers=4.1,fsc,rsize=8192 127.0.0.1:/export/dir1 /mnt/dir1
Mon 10 Aug 2020 09:34:18 AM EDT: 3. On NFS client, dd if=/dev/zero of=/mnt/dir1/file1.bin bs=16384 count=1
Mon 10 Aug 2020 09:34:18 AM EDT: 4. On NFS client, echo 3 > /proc/sys/vm/drop_caches
Mon 10 Aug 2020 09:34:19 AM EDT: 5. On NFS client, ./nfs-readahead.sh set /mnt/dir1 16384
Mon 10 Aug 2020 09:34:19 AM EDT: 6. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null
Mon 10 Aug 2020 09:34:19 AM EDT: 7. On NFS client, echo 3 > /proc/sys/vm/drop_caches
Mon 10 Aug 2020 09:34:19 AM EDT: 8. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null


Console with custom nfs tracing
[   62.955355] t1_rsize_lt_rea (4840): drop_caches: 3
[   63.028786] fs/nfs/fscache.c:480 before read_helper_page_list pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 pages
ffffb4b4c0fafca8 max_pages 2
[   63.028804] fs/fscache/read_helper.c:347 pid 4882
fscache_read_helper before shape req ffff8902f50b5800 req->nr_pages 0
shape.actual_nr_pages 48 shape.proposed_nr_pages 4
[   63.037231] fs/fscache/read_helper.c:353 pid 4882
fscache_read_helper after shape req ffff8902f50b5800 req->nr_pages 0
shape.actual_nr_pages 4 shape.proposed_nr_pages 4
[   63.043421] fs/fscache/read_helper.c:531 pid 4882
fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 1
shape.actual_nr_pages 4 shape.proposed_nr_pages 4
[   63.049498] fs/fscache/read_helper.c:531 pid 4882
fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 2
shape.actual_nr_pages 4 shape.proposed_nr_pages 4
[   63.063708] fs/fscache/read_helper.c:531 pid 4882
fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 3
shape.actual_nr_pages 4 shape.proposed_nr_pages 4
[   63.070114] fs/fscache/read_helper.c:531 pid 4882
fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 4
shape.actual_nr_pages 4 shape.proposed_nr_pages 4
[   63.076438] fs/nfs/fscache.c:369 enter nfs_issue_op pid 4882 inode
ffff8902b4a2a828 cache ffff8902f50b5800 start 0 last 3
[   63.082964] fs/nfs/fscache.c:379 before readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f08741a00
[   63.087591] fs/nfs/fscache.c:382 after readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f08741a00 cache.error 0
[   63.093058] fs/nfs/fscache.c:379 before readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f08288680
[   63.098927] fs/nfs/fscache.c:382 after readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f08288680 cache.error 0
[   63.104507] fs/nfs/fscache.c:379 before readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f082816c0
[   63.110922] fs/nfs/fscache.c:382 after readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f082816c0 cache.error 0
[   63.111973] fs/nfs/fscache.c:523 pid 233 before io_done inode
ffff8902b4a2a828 bytes 8192 &req->cache ffff8902f50b5800 cache.pos 0
cache.len 16384
[   63.115407] fs/nfs/fscache.c:379 before readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f067e8f40
[   63.126337] fs/nfs/fscache.c:382 after readpage_async_filler pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page
fffff42f067e8f40 cache.error 0
[   63.131411] fs/nfs/fscache.c:388 exit nfs_issue_op pid 4882 inode
ffff8902b4a2a828 cache ffff8902f50b5800
[   63.131955] fs/nfs/fscache.c:523 pid 233 before io_done inode
ffff8902b4a2a828 bytes 8192 &req->cache ffff8902f50b5800 cache.pos 0
cache.len 16384
[   63.137012] fs/nfs/fscache.c:484 after read_helper_page_list pid
4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 cache.pos 0
cache.len 16384 cache.nr_pages 4 pages ffffb4b4c0fafca8 ret 0
[   63.140922] page:fffff42f08741a00 refcount:2 mapcount:0
mapping:00000000727f3adc index:0x0
[   63.141091] mapping->aops:nfs_file_aops [nfs] dentry name:"file1.bin"
[   63.146475] fs/nfs/fscache.c:490 outside while(!list_empty(pages))
read_helper_page_list pid 4882 inode ffff8902b4a2a828 cache
ffff8902f50b5800 cache.pos 0 cache.len 16384 cache.nr_pages 4
[   63.146740] fs/fscache/read_helper.c:347 pid 4882
fscache_read_helper before shape req ffff8902f50b5800 req->nr_pages 0
shape.actual_nr_pages 3227832042 shape.proposed_nr_pages 1
[   63.153662] flags: 0x17ffffc0000006(referenced|uptodate)
[   63.153699] raw: 0017ffffc0000006 dead000000000100 dead000000000122
ffff8902b4a2a9a0
[   63.168174] fs/fscache/read_helper.c:353 pid 4882
fscache_read_helper after shape req ffff8902f50b5800 req->nr_pages 0
shape.actual_nr_pages 5 shape.proposed_nr_pages 1
[   63.193131] raw: 0000000000000000 0000000000000000 00000001ffffffff
ffff8902ecfe8000
[   63.203785] page dumped because: VM_BUG_ON_PAGE(!PageLocked(page))
[   63.206372] page->mem_cgroup:ffff8902ecfe8000
[   63.208333] ------------[ cut here ]------------
[   63.211081] kernel BUG at mm/filemap.c:1290!
[   63.213152] invalid opcode: 0000 [#1] SMP PTI
David Wysochanski Aug. 10, 2020, 6:55 p.m. UTC | #7
On Mon, Aug 10, 2020 at 9:50 AM David Wysochanski <dwysocha@redhat.com> wrote:
>
> On Mon, Aug 10, 2020 at 6:09 AM David Howells <dhowells@redhat.com> wrote:
> > [...]
>
> I tried this and got the same panic - I think i_size is the culprit
> (it is larger than max_pages).  I'll send you a larger trace offline
> with cachefiles/fscache debugging enabled if that helps, but below is
> some custom tracing that may be enough because it shows before / after
> shaping values.
>

FWIW, after testing the aforementioned patch and tracing it,
it is not i_size after all.  I added this small patch on top of the
cachefiles_shape_request() patch and got no more panics.  Though
this may not address the full underlying issues, it at least gets
past this point and max_pages seems to be honored.

---
diff --git a/fs/fscache/read_helper.c b/fs/fscache/read_helper.c
index a464c3e3188a..fa67339e7304 100644
--- a/fs/fscache/read_helper.c
+++ b/fs/fscache/read_helper.c
@@ -318,8 +318,8 @@ static int fscache_read_helper(struct fscache_io_request *req,
        switch (type) {
        case FSCACHE_READ_PAGE_LIST:
                shape.proposed_start = lru_to_page(pages)->index;
-               shape.proposed_nr_pages =
-                       lru_to_last_page(pages)->index - shape.proposed_start + 1;
+               shape.proposed_nr_pages = min_t(unsigned int, max_pages,
+                       lru_to_last_page(pages)->index - shape.proposed_start + 1);
                break;

        case FSCACHE_READ_LOCKED_PAGE:
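
Assuming fscache_read_helper() passes shape.proposed_nr_pages straight
through to the cache backend, clamping it here means cachefiles never
sees a proposal above the netfs limit, which complements the
trim-after-rounding change above.  I have not verified whether other
paths can still push actual_nr_pages past max_pages.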

Patch

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index cee497c108bb..8905fe4a0930 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -377,76 +377,23 @@  static int ceph_readpage(struct file *filp, struct page *page)
 	return err;
 }
 
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req)
-{
-	struct inode *inode = req->r_inode;
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_osd_data *osd_data;
-	int rc = req->r_result <= 0 ? req->r_result : 0;
-	int bytes = req->r_result >= 0 ? req->r_result : 0;
-	int num_pages;
-	int i;
-
-	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
-	if (rc == -EBLACKLISTED)
-		ceph_inode_to_client(inode)->blacklisted = true;
-
-	/* unlock all pages, zeroing any data we didn't read */
-	osd_data = osd_req_op_extent_osd_data(req, 0);
-	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-	num_pages = calc_pages_for((u64)osd_data->alignment,
-					(u64)osd_data->length);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = osd_data->pages[i];
-
-		if (rc < 0 && rc != -ENOENT)
-			goto unlock;
-		if (bytes < (int)PAGE_SIZE) {
-			/* zero (remainder of) page */
-			int s = bytes < 0 ? 0 : bytes;
-			zero_user_segment(page, s, PAGE_SIZE);
-		}
- 		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
-		     page->index);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-unlock:
-		unlock_page(page);
-		put_page(page);
-		bytes -= PAGE_SIZE;
-	}
-
-	ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-				 req->r_end_latency, rc);
-
-	kfree(osd_data->pages);
-}
-
-/*
- * start an async read(ahead) operation.  return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
-		      struct list_head *page_list, int max)
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *page_list, unsigned nr_pages)
 {
-	struct ceph_osd_client *osdc =
-		&ceph_inode_to_client(inode)->client->osdc;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct page *page = lru_to_page(page_list);
-	struct ceph_vino vino;
-	struct ceph_osd_request *req;
-	u64 off;
-	u64 len;
-	int i;
-	struct page **pages;
-	pgoff_t next_index;
-	int nr_pages = 0;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_file_info *fi = file->private_data;
+	struct ceph_rw_context *rw_ctx;
+	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
 	int got = 0;
 	int ret = 0;
+	int max = fsc->mount_options->rsize >> PAGE_SHIFT;
+
+	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
+		return -EINVAL;
 
+	rw_ctx = ceph_find_rw_context(fi);
 	if (!rw_ctx) {
 		/* caller of readpages does not hold buffer and read caps
 		 * (fadvise, madvise and readahead cases) */
@@ -459,133 +406,33 @@  static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
 			dout("start_read %p, no cache cap\n", inode);
 			ret = 0;
 		}
-		if (ret <= 0) {
-			if (got)
-				ceph_put_cap_refs(ci, got);
-			while (!list_empty(page_list)) {
-				page = lru_to_page(page_list);
-				list_del(&page->lru);
-				put_page(page);
-			}
-			return ret;
-		}
+		if (ret <= 0)
+			goto out;
 	}
 
-	off = (u64) page_offset(page);
+	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
+	     inode, file, rw_ctx, nr_pages, max);
 
-	/* count pages */
-	next_index = page->index;
-	list_for_each_entry_reverse(page, page_list, lru) {
-		if (page->index != next_index)
-			break;
-		nr_pages++;
-		next_index++;
-		if (max && nr_pages == max)
-			break;
-	}
-	len = nr_pages << PAGE_SHIFT;
-	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
-	     off, len);
-	vino = ceph_vino(inode);
-	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
-				    0, 1, CEPH_OSD_OP_READ,
-				    CEPH_OSD_FLAG_READ, NULL,
-				    ci->i_truncate_seq, ci->i_truncate_size,
-				    false);
-	if (IS_ERR(req)) {
-		ret = PTR_ERR(req);
-		goto out;
-	}
+	while (ret >= 0 && !list_empty(page_list)) {
+		struct ceph_fscache_req *req = ceph_fsreq_alloc();
 
-	/* build page vector */
-	nr_pages = calc_pages_for(0, len);
-	pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
-	if (!pages) {
-		ret = -ENOMEM;
-		goto out_put;
-	}
-	for (i = 0; i < nr_pages; ++i) {
-		page = list_entry(page_list->prev, struct page, lru);
-		BUG_ON(PageLocked(page));
-		list_del(&page->lru);
-
- 		dout("start_read %p adding %p idx %lu\n", inode, page,
-		     page->index);
-		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-					  GFP_KERNEL)) {
-			put_page(page);
-			dout("start_read %p add_to_page_cache failed %p\n",
-			     inode, page);
-			nr_pages = i;
-			if (nr_pages > 0) {
-				len = nr_pages << PAGE_SHIFT;
-				osd_req_op_extent_update(req, 0, len);
-				break;
-			}
-			goto out_pages;
+		if (!req) {
+			ret = -ENOMEM;
+			break;
 		}
-		pages[i] = page;
-	}
-	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
-	req->r_callback = finish_read;
-	req->r_inode = inode;
-
-	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
-	ret = ceph_osdc_start_request(osdc, req, false);
-	if (ret < 0)
-		goto out_pages;
-	ceph_osdc_put_request(req);
-
-	/* After adding locked pages to page cache, the inode holds cache cap.
-	 * So we can drop our cap refs. */
-	if (got)
-		ceph_put_cap_refs(ci, got);
-
-	return nr_pages;
+		fscache_init_io_request(&req->fscache_req, cookie, &ceph_readpage_fsreq_ops);
+		req->fscache_req.mapping = inode->i_mapping;
 
-out_pages:
-	for (i = 0; i < nr_pages; ++i) {
-		unlock_page(pages[i]);
+		ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max);
+		ceph_fsreq_put(&req->fscache_req);
 	}
-	ceph_put_page_vector(pages, nr_pages, false);
-out_put:
-	ceph_osdc_put_request(req);
 out:
+	/* After adding locked pages to page cache, the inode holds Fc refs. We can drop ours. */
 	if (got)
 		ceph_put_cap_refs(ci, got);
-	return ret;
-}
 
-
-/*
- * Read multiple pages.  Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-			  struct list_head *page_list, unsigned nr_pages)
-{
-	struct inode *inode = file_inode(file);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_file_info *fi = file->private_data;
-	struct ceph_rw_context *rw_ctx;
-	int rc = 0;
-	int max = 0;
-
-	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
-		return -EINVAL;
-
-	rw_ctx = ceph_find_rw_context(fi);
-	max = fsc->mount_options->rsize >> PAGE_SHIFT;
-	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
-	     inode, file, rw_ctx, nr_pages, max);
-	while (!list_empty(page_list)) {
-		rc = start_read(inode, rw_ctx, page_list, max);
-		if (rc < 0)
-			goto out;
-	}
-out:
-	dout("readpages %p file %p ret %d\n", inode, file, rc);
-	return rc;
+	dout("readpages %p file %p ret %d\n", inode, file, ret);
+	return ret;
 }
 
 struct ceph_writeback_ctl