Message ID | 20200731130421.127022-10-jlayton@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: convert to new FSCache API | expand |
On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote: > > Convert ceph_readpages to use the fscache_read_helper. With this we can > rip out a lot of the old readpage/readpages infrastructure. > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/addr.c | 209 +++++++------------------------------------------ > 1 file changed, 28 insertions(+), 181 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index cee497c108bb..8905fe4a0930 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -377,76 +377,23 @@ static int ceph_readpage(struct file *filp, struct page *page) > return err; > } > > -/* > - * Finish an async read(ahead) op. > - */ > -static void finish_read(struct ceph_osd_request *req) > -{ > - struct inode *inode = req->r_inode; > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > - struct ceph_osd_data *osd_data; > - int rc = req->r_result <= 0 ? req->r_result : 0; > - int bytes = req->r_result >= 0 ? req->r_result : 0; > - int num_pages; > - int i; > - > - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); > - if (rc == -EBLACKLISTED) > - ceph_inode_to_client(inode)->blacklisted = true; > - > - /* unlock all pages, zeroing any data we didn't read */ > - osd_data = osd_req_op_extent_osd_data(req, 0); > - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); > - num_pages = calc_pages_for((u64)osd_data->alignment, > - (u64)osd_data->length); > - for (i = 0; i < num_pages; i++) { > - struct page *page = osd_data->pages[i]; > - > - if (rc < 0 && rc != -ENOENT) > - goto unlock; > - if (bytes < (int)PAGE_SIZE) { > - /* zero (remainder of) page */ > - int s = bytes < 0 ? 
0 : bytes; > - zero_user_segment(page, s, PAGE_SIZE); > - } > - dout("finish_read %p uptodate %p idx %lu\n", inode, page, > - page->index); > - flush_dcache_page(page); > - SetPageUptodate(page); > -unlock: > - unlock_page(page); > - put_page(page); > - bytes -= PAGE_SIZE; > - } > - > - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, > - req->r_end_latency, rc); > - > - kfree(osd_data->pages); > -} > - > -/* > - * start an async read(ahead) operation. return nr_pages we submitted > - * a read for on success, or negative error code. > - */ > -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > - struct list_head *page_list, int max) > +static int ceph_readpages(struct file *file, struct address_space *mapping, > + struct list_head *page_list, unsigned nr_pages) > { > - struct ceph_osd_client *osdc = > - &ceph_inode_to_client(inode)->client->osdc; > + struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > - struct page *page = lru_to_page(page_list); > - struct ceph_vino vino; > - struct ceph_osd_request *req; > - u64 off; > - u64 len; > - int i; > - struct page **pages; > - pgoff_t next_index; > - int nr_pages = 0; > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_file_info *fi = file->private_data; > + struct ceph_rw_context *rw_ctx; > + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); > int got = 0; > int ret = 0; > + int max = fsc->mount_options->rsize >> PAGE_SHIFT; Have you ran tests with different values of rsize? Specifically, rsize < readahead_size == size_of_readpages I'm seeing a lot of problems with NFS when varying rsize are used wrt readahead values. Specifically I'm seeing panics because fscache expects a 1:1 mapping of issue_op() to io_done() calls, and I get panics because multiple read completions are trying to unlock the same pages inside fscache_read_done(). 
My understanding is afs does not have such 'rsize' limitation, so it may not be an area that is well tested. It could be my implementation of the NFS conversion though, as I think what needs to happen is to respect the above 1:1 mapping of issue_op() to io_done() calls, and my initial implementation did not do that. FWIW, specifically this unit test was originally failing for me with a panic. Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable cachefilesd Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1 Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero of=/mnt/dir1/file1.bin bs=65536 count=1 Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 > /proc/sys/vm/drop_caches Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh set /mnt/dir1 65536 Sun 09 Aug 2020 11:03:23 AM EDT: 6. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 > /proc/sys/vm/drop_caches Sun 09 Aug 2020 11:03:23 AM EDT: 9. 
On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null > + > + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) > + return -EINVAL; > > + rw_ctx = ceph_find_rw_context(fi); > if (!rw_ctx) { > /* caller of readpages does not hold buffer and read caps > * (fadvise, madvise and readahead cases) */ > @@ -459,133 +406,33 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > dout("start_read %p, no cache cap\n", inode); > ret = 0; > } > - if (ret <= 0) { > - if (got) > - ceph_put_cap_refs(ci, got); > - while (!list_empty(page_list)) { > - page = lru_to_page(page_list); > - list_del(&page->lru); > - put_page(page); > - } > - return ret; > - } > + if (ret <= 0) > + goto out; > } > > - off = (u64) page_offset(page); > + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > + inode, file, rw_ctx, nr_pages, max); > > - /* count pages */ > - next_index = page->index; > - list_for_each_entry_reverse(page, page_list, lru) { > - if (page->index != next_index) > - break; > - nr_pages++; > - next_index++; > - if (max && nr_pages == max) > - break; > - } > - len = nr_pages << PAGE_SHIFT; > - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, > - off, len); > - vino = ceph_vino(inode); > - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, > - 0, 1, CEPH_OSD_OP_READ, > - CEPH_OSD_FLAG_READ, NULL, > - ci->i_truncate_seq, ci->i_truncate_size, > - false); > - if (IS_ERR(req)) { > - ret = PTR_ERR(req); > - goto out; > - } > + while (ret >= 0 && !list_empty(page_list)) { > + struct ceph_fscache_req *req = ceph_fsreq_alloc(); > > - /* build page vector */ > - nr_pages = calc_pages_for(0, len); > - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); > - if (!pages) { > - ret = -ENOMEM; > - goto out_put; > - } > - for (i = 0; i < nr_pages; ++i) { > - page = list_entry(page_list->prev, struct page, lru); > - BUG_ON(PageLocked(page)); > - list_del(&page->lru); > - > - dout("start_read %p adding %p idx %lu\n", 
inode, page, > - page->index); > - if (add_to_page_cache_lru(page, &inode->i_data, page->index, > - GFP_KERNEL)) { > - put_page(page); > - dout("start_read %p add_to_page_cache failed %p\n", > - inode, page); > - nr_pages = i; > - if (nr_pages > 0) { > - len = nr_pages << PAGE_SHIFT; > - osd_req_op_extent_update(req, 0, len); > - break; > - } > - goto out_pages; > + if (!req) { > + ret = -ENOMEM; > + break; > } > - pages[i] = page; > - } > - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); > - req->r_callback = finish_read; > - req->r_inode = inode; > - > - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); > - ret = ceph_osdc_start_request(osdc, req, false); > - if (ret < 0) > - goto out_pages; > - ceph_osdc_put_request(req); > - > - /* After adding locked pages to page cache, the inode holds cache cap. > - * So we can drop our cap refs. */ > - if (got) > - ceph_put_cap_refs(ci, got); > - > - return nr_pages; > + fscache_init_io_request(&req->fscache_req, cookie, &ceph_readpage_fsreq_ops); > + req->fscache_req.mapping = inode->i_mapping; > > -out_pages: > - for (i = 0; i < nr_pages; ++i) { > - unlock_page(pages[i]); > + ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max); > + ceph_fsreq_put(&req->fscache_req); > } > - ceph_put_page_vector(pages, nr_pages, false); > -out_put: > - ceph_osdc_put_request(req); > out: > + /* After adding locked pages to page cache, the inode holds Fc refs. We can drop ours. */ > if (got) > ceph_put_cap_refs(ci, got); > - return ret; > -} > > - > -/* > - * Read multiple pages. Leave pages we don't read + unlock in page_list; > - * the caller (VM) cleans them up. 
> - */ > -static int ceph_readpages(struct file *file, struct address_space *mapping, > - struct list_head *page_list, unsigned nr_pages) > -{ > - struct inode *inode = file_inode(file); > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > - struct ceph_file_info *fi = file->private_data; > - struct ceph_rw_context *rw_ctx; > - int rc = 0; > - int max = 0; > - > - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) > - return -EINVAL; > - > - rw_ctx = ceph_find_rw_context(fi); > - max = fsc->mount_options->rsize >> PAGE_SHIFT; > - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > - inode, file, rw_ctx, nr_pages, max); > - while (!list_empty(page_list)) { > - rc = start_read(inode, rw_ctx, page_list, max); > - if (rc < 0) > - goto out; > - } > -out: > - dout("readpages %p file %p ret %d\n", inode, file, rc); > - return rc; > + dout("readpages %p file %p ret %d\n", inode, file, ret); > + return ret; > } > > struct ceph_writeback_ctl > -- > 2.26.2 > > > -- > Linux-cachefs mailing list > Linux-cachefs@redhat.com > https://www.redhat.com/mailman/listinfo/linux-cachefs >
On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote: > +static int ceph_readpages(struct file *file, struct address_space *mapping, > + struct list_head *page_list, unsigned nr_pages) > { ... > + int max = fsc->mount_options->rsize >> PAGE_SHIFT; ... > + ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max); Looks like the root of my problems is that the 'max_pages' parameter given to fscache_read_helper_page_list() does not work for purposes of limiting the IO to the 'rsize'. That is, the fscache_io_request.nr_pages exceeds 'max_pages' and becomes readahead_size. So even though max_pages is based on 'rsize', when issue_op() is called, it is for a fscache_io_request that exceeds 'rsize', resulting in multiple NFS reads that go over the wire and multiple completions, each of which ends up calling back into io_done() which blows up because fscache does not expect this. Looks like fscache_shape_request() overrides any 'max_pages' value (actually it is cachefiles_shape_request), so it's unclear why the netfs would pass in a 'max_pages' if it is not honored - seems like a bug maybe or it's not obvious what the purpose is there. I tried a custom 'shape' method and got further, but it blew up on another test, so I'm not sure. It would be good to know if this somehow works for you but my guess is you'll see similar failures when rsize < readahead_size == size_of_readpages.
David Wysochanski <dwysocha@redhat.com> wrote: > Looks like fscache_shape_request() overrides any 'max_pages' value (actually > it is cachefiles_shape_request) , so it's unclear why the netfs would pass > in a 'max_pages' if it is not honored - seems like a bug maybe or it's not > obvious I think the problem is that cachefiles_shape_request() is applying the limit too early. It's using it to cut down the number of pages in the original request (only applicable to readpages), but then the shaping to fit cache granules can exceed that, so it needs to be applied later also. Does the attached patch help? David --- diff --git a/fs/cachefiles/content-map.c b/fs/cachefiles/content-map.c index 2bfba2e41c39..ce05cf1d9a6e 100644 --- a/fs/cachefiles/content-map.c +++ b/fs/cachefiles/content-map.c @@ -134,7 +134,8 @@ void cachefiles_shape_request(struct fscache_object *obj, _enter("{%lx,%lx,%x},%llx,%d", start, end, max_pages, i_size, shape->for_write); - if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE) { + if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE || + max_pages < CACHEFILES_GRAN_PAGES) { shape->to_be_done = FSCACHE_READ_FROM_SERVER; return; } @@ -144,10 +145,6 @@ void cachefiles_shape_request(struct fscache_object *obj, if (shape->i_size > CACHEFILES_SIZE_LIMIT) i_size = CACHEFILES_SIZE_LIMIT; - max_pages = round_down(max_pages, CACHEFILES_GRAN_PAGES); - if (end - start > max_pages) - end = start + max_pages; - granule = start / CACHEFILES_GRAN_PAGES; if (granule / 8 >= object->content_map_size) { cachefiles_expand_content_map(object, i_size); @@ -185,6 +182,10 @@ void cachefiles_shape_request(struct fscache_object *obj, start = round_down(start, CACHEFILES_GRAN_PAGES); end = round_up(end, CACHEFILES_GRAN_PAGES); + /* Trim to the maximum size the netfs supports */ + if (end - start > max_pages) + end = round_down(start + max_pages, CACHEFILES_GRAN_PAGES); + /* But trim to the end of the file and the starting page */ eof = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (eof 
<= shape->proposed_start)
On Sun, 2020-08-09 at 11:09 -0400, David Wysochanski wrote: > On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote: > > Convert ceph_readpages to use the fscache_read_helper. With this we can > > rip out a lot of the old readpage/readpages infrastructure. > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/ceph/addr.c | 209 +++++++------------------------------------------ > > 1 file changed, 28 insertions(+), 181 deletions(-) > > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > > index cee497c108bb..8905fe4a0930 100644 > > --- a/fs/ceph/addr.c > > +++ b/fs/ceph/addr.c > > @@ -377,76 +377,23 @@ static int ceph_readpage(struct file *filp, struct page *page) > > return err; > > } > > > > -/* > > - * Finish an async read(ahead) op. > > - */ > > -static void finish_read(struct ceph_osd_request *req) > > -{ > > - struct inode *inode = req->r_inode; > > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > - struct ceph_osd_data *osd_data; > > - int rc = req->r_result <= 0 ? req->r_result : 0; > > - int bytes = req->r_result >= 0 ? req->r_result : 0; > > - int num_pages; > > - int i; > > - > > - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); > > - if (rc == -EBLACKLISTED) > > - ceph_inode_to_client(inode)->blacklisted = true; > > - > > - /* unlock all pages, zeroing any data we didn't read */ > > - osd_data = osd_req_op_extent_osd_data(req, 0); > > - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); > > - num_pages = calc_pages_for((u64)osd_data->alignment, > > - (u64)osd_data->length); > > - for (i = 0; i < num_pages; i++) { > > - struct page *page = osd_data->pages[i]; > > - > > - if (rc < 0 && rc != -ENOENT) > > - goto unlock; > > - if (bytes < (int)PAGE_SIZE) { > > - /* zero (remainder of) page */ > > - int s = bytes < 0 ? 
0 : bytes; > > - zero_user_segment(page, s, PAGE_SIZE); > > - } > > - dout("finish_read %p uptodate %p idx %lu\n", inode, page, > > - page->index); > > - flush_dcache_page(page); > > - SetPageUptodate(page); > > -unlock: > > - unlock_page(page); > > - put_page(page); > > - bytes -= PAGE_SIZE; > > - } > > - > > - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, > > - req->r_end_latency, rc); > > - > > - kfree(osd_data->pages); > > -} > > - > > -/* > > - * start an async read(ahead) operation. return nr_pages we submitted > > - * a read for on success, or negative error code. > > - */ > > -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > > - struct list_head *page_list, int max) > > +static int ceph_readpages(struct file *file, struct address_space *mapping, > > + struct list_head *page_list, unsigned nr_pages) > > { > > - struct ceph_osd_client *osdc = > > - &ceph_inode_to_client(inode)->client->osdc; > > + struct inode *inode = file_inode(file); > > struct ceph_inode_info *ci = ceph_inode(inode); > > - struct page *page = lru_to_page(page_list); > > - struct ceph_vino vino; > > - struct ceph_osd_request *req; > > - u64 off; > > - u64 len; > > - int i; > > - struct page **pages; > > - pgoff_t next_index; > > - int nr_pages = 0; > > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > + struct ceph_file_info *fi = file->private_data; > > + struct ceph_rw_context *rw_ctx; > > + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); > > int got = 0; > > int ret = 0; > > + int max = fsc->mount_options->rsize >> PAGE_SHIFT; > > Have you ran tests with different values of rsize? > Specifically, rsize < readahead_size == size_of_readpages > > I'm seeing a lot of problems with NFS when varying rsize are used wrt > readahead values. 
Specifically I'm seeing panics because fscache > expects a 1:1 mapping of issue_op() to io_done() calls, and I get > panics because multiple read completions are trying to unlock the > same pages inside fscache_read_done(). > > My understanding is afs does not have such 'rsize' limitation, so it > may not be an area that is well tested. It could be my implementation > of the NFS conversion though, as I thinkwhat needs to happen is the > respect the above 1:1 mapping of issue_op() to io_done() calls, and my > initial implementation did not do that. > > FWIW, specifically this unit test was originally failing for me with a panic. > Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable > cachefilesd > Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o > vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1 > Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero > of=/mnt/dir1/file1.bin bs=65536 count=1 > Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 > > /proc/sys/vm/drop_caches > Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh > set /mnt/dir1 65536 > Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 > > /proc/sys/vm/drop_caches > Sun 09 Aug 2020 11:03:23 AM EDT: 9. On NFS client, dd > if=/mnt/dir1/file1.bin of=/dev/null > > I haven't tested much with varying rsize and wsize (setting them on cephfs is pretty rare), but I'll plan to. What's in nfs-readahead.sh? 
> > > + > > + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) > > + return -EINVAL; > > > > + rw_ctx = ceph_find_rw_context(fi); > > if (!rw_ctx) { > > /* caller of readpages does not hold buffer and read caps > > * (fadvise, madvise and readahead cases) */ > > @@ -459,133 +406,33 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > > dout("start_read %p, no cache cap\n", inode); > > ret = 0; > > } > > - if (ret <= 0) { > > - if (got) > > - ceph_put_cap_refs(ci, got); > > - while (!list_empty(page_list)) { > > - page = lru_to_page(page_list); > > - list_del(&page->lru); > > - put_page(page); > > - } > > - return ret; > > - } > > + if (ret <= 0) > > + goto out; > > } > > > > - off = (u64) page_offset(page); > > + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > > + inode, file, rw_ctx, nr_pages, max); > > > > - /* count pages */ > > - next_index = page->index; > > - list_for_each_entry_reverse(page, page_list, lru) { > > - if (page->index != next_index) > > - break; > > - nr_pages++; > > - next_index++; > > - if (max && nr_pages == max) > > - break; > > - } > > - len = nr_pages << PAGE_SHIFT; > > - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, > > - off, len); > > - vino = ceph_vino(inode); > > - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, > > - 0, 1, CEPH_OSD_OP_READ, > > - CEPH_OSD_FLAG_READ, NULL, > > - ci->i_truncate_seq, ci->i_truncate_size, > > - false); > > - if (IS_ERR(req)) { > > - ret = PTR_ERR(req); > > - goto out; > > - } > > + while (ret >= 0 && !list_empty(page_list)) { > > + struct ceph_fscache_req *req = ceph_fsreq_alloc(); > > > > - /* build page vector */ > > - nr_pages = calc_pages_for(0, len); > > - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); > > - if (!pages) { > > - ret = -ENOMEM; > > - goto out_put; > > - } > > - for (i = 0; i < nr_pages; ++i) { > > - page = list_entry(page_list->prev, struct page, lru); > > - BUG_ON(PageLocked(page)); 
> > - list_del(&page->lru); > > - > > - dout("start_read %p adding %p idx %lu\n", inode, page, > > - page->index); > > - if (add_to_page_cache_lru(page, &inode->i_data, page->index, > > - GFP_KERNEL)) { > > - put_page(page); > > - dout("start_read %p add_to_page_cache failed %p\n", > > - inode, page); > > - nr_pages = i; > > - if (nr_pages > 0) { > > - len = nr_pages << PAGE_SHIFT; > > - osd_req_op_extent_update(req, 0, len); > > - break; > > - } > > - goto out_pages; > > + if (!req) { > > + ret = -ENOMEM; > > + break; > > } > > - pages[i] = page; > > - } > > - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); > > - req->r_callback = finish_read; > > - req->r_inode = inode; > > - > > - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); > > - ret = ceph_osdc_start_request(osdc, req, false); > > - if (ret < 0) > > - goto out_pages; > > - ceph_osdc_put_request(req); > > - > > - /* After adding locked pages to page cache, the inode holds cache cap. > > - * So we can drop our cap refs. */ > > - if (got) > > - ceph_put_cap_refs(ci, got); > > - > > - return nr_pages; > > + fscache_init_io_request(&req->fscache_req, cookie, &ceph_readpage_fsreq_ops); > > + req->fscache_req.mapping = inode->i_mapping; > > > > -out_pages: > > - for (i = 0; i < nr_pages; ++i) { > > - unlock_page(pages[i]); > > + ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max); > > + ceph_fsreq_put(&req->fscache_req); > > } > > - ceph_put_page_vector(pages, nr_pages, false); > > -out_put: > > - ceph_osdc_put_request(req); > > out: > > + /* After adding locked pages to page cache, the inode holds Fc refs. We can drop ours. */ > > if (got) > > ceph_put_cap_refs(ci, got); > > - return ret; > > -} > > > > - > > -/* > > - * Read multiple pages. Leave pages we don't read + unlock in page_list; > > - * the caller (VM) cleans them up. 
> > - */ > > -static int ceph_readpages(struct file *file, struct address_space *mapping, > > - struct list_head *page_list, unsigned nr_pages) > > -{ > > - struct inode *inode = file_inode(file); > > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > - struct ceph_file_info *fi = file->private_data; > > - struct ceph_rw_context *rw_ctx; > > - int rc = 0; > > - int max = 0; > > - > > - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) > > - return -EINVAL; > > - > > - rw_ctx = ceph_find_rw_context(fi); > > - max = fsc->mount_options->rsize >> PAGE_SHIFT; > > - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > > - inode, file, rw_ctx, nr_pages, max); > > - while (!list_empty(page_list)) { > > - rc = start_read(inode, rw_ctx, page_list, max); > > - if (rc < 0) > > - goto out; > > - } > > -out: > > - dout("readpages %p file %p ret %d\n", inode, file, rc); > > - return rc; > > + dout("readpages %p file %p ret %d\n", inode, file, ret); > > + return ret; > > } > > > > struct ceph_writeback_ctl > > -- > > 2.26.2 > > > > > > -- > > Linux-cachefs mailing list > > Linux-cachefs@redhat.com > > https://www.redhat.com/mailman/listinfo/linux-cachefs > >
On Mon, Aug 10, 2020 at 7:09 AM Jeff Layton <jlayton@kernel.org> wrote: > > On Sun, 2020-08-09 at 11:09 -0400, David Wysochanski wrote: > > On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@kernel.org> wrote: > > > Convert ceph_readpages to use the fscache_read_helper. With this we can > > > rip out a lot of the old readpage/readpages infrastructure. > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > > --- > > > fs/ceph/addr.c | 209 +++++++------------------------------------------ > > > 1 file changed, 28 insertions(+), 181 deletions(-) > > > > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > > > index cee497c108bb..8905fe4a0930 100644 > > > --- a/fs/ceph/addr.c > > > +++ b/fs/ceph/addr.c > > > @@ -377,76 +377,23 @@ static int ceph_readpage(struct file *filp, struct page *page) > > > return err; > > > } > > > > > > -/* > > > - * Finish an async read(ahead) op. > > > - */ > > > -static void finish_read(struct ceph_osd_request *req) > > > -{ > > > - struct inode *inode = req->r_inode; > > > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > - struct ceph_osd_data *osd_data; > > > - int rc = req->r_result <= 0 ? req->r_result : 0; > > > - int bytes = req->r_result >= 0 ? req->r_result : 0; > > > - int num_pages; > > > - int i; > > > - > > > - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); > > > - if (rc == -EBLACKLISTED) > > > - ceph_inode_to_client(inode)->blacklisted = true; > > > - > > > - /* unlock all pages, zeroing any data we didn't read */ > > > - osd_data = osd_req_op_extent_osd_data(req, 0); > > > - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); > > > - num_pages = calc_pages_for((u64)osd_data->alignment, > > > - (u64)osd_data->length); > > > - for (i = 0; i < num_pages; i++) { > > > - struct page *page = osd_data->pages[i]; > > > - > > > - if (rc < 0 && rc != -ENOENT) > > > - goto unlock; > > > - if (bytes < (int)PAGE_SIZE) { > > > - /* zero (remainder of) page */ > > > - int s = bytes < 0 ? 
0 : bytes; > > > - zero_user_segment(page, s, PAGE_SIZE); > > > - } > > > - dout("finish_read %p uptodate %p idx %lu\n", inode, page, > > > - page->index); > > > - flush_dcache_page(page); > > > - SetPageUptodate(page); > > > -unlock: > > > - unlock_page(page); > > > - put_page(page); > > > - bytes -= PAGE_SIZE; > > > - } > > > - > > > - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, > > > - req->r_end_latency, rc); > > > - > > > - kfree(osd_data->pages); > > > -} > > > - > > > -/* > > > - * start an async read(ahead) operation. return nr_pages we submitted > > > - * a read for on success, or negative error code. > > > - */ > > > -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > > > - struct list_head *page_list, int max) > > > +static int ceph_readpages(struct file *file, struct address_space *mapping, > > > + struct list_head *page_list, unsigned nr_pages) > > > { > > > - struct ceph_osd_client *osdc = > > > - &ceph_inode_to_client(inode)->client->osdc; > > > + struct inode *inode = file_inode(file); > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > - struct page *page = lru_to_page(page_list); > > > - struct ceph_vino vino; > > > - struct ceph_osd_request *req; > > > - u64 off; > > > - u64 len; > > > - int i; > > > - struct page **pages; > > > - pgoff_t next_index; > > > - int nr_pages = 0; > > > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > + struct ceph_file_info *fi = file->private_data; > > > + struct ceph_rw_context *rw_ctx; > > > + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); > > > int got = 0; > > > int ret = 0; > > > + int max = fsc->mount_options->rsize >> PAGE_SHIFT; > > > > Have you ran tests with different values of rsize? > > Specifically, rsize < readahead_size == size_of_readpages > > > > I'm seeing a lot of problems with NFS when varying rsize are used wrt > > readahead values. 
Specifically I'm seeing panics because fscache > > expects a 1:1 mapping of issue_op() to io_done() calls, and I get > > panics because multiple read completions are trying to unlock the > > same pages inside fscache_read_done(). > > > > My understanding is afs does not have such 'rsize' limitation, so it > > may not be an area that is well tested. It could be my implementation > > of the NFS conversion though, as I thinkwhat needs to happen is the > > respect the above 1:1 mapping of issue_op() to io_done() calls, and my > > initial implementation did not do that. > > > > FWIW, specifically this unit test was originally failing for me with a panic. > > Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable > > cachefilesd > > Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o > > vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1 > > Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero > > of=/mnt/dir1/file1.bin bs=65536 count=1 > > Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 > > > /proc/sys/vm/drop_caches > > Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh > > set /mnt/dir1 65536 > > Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 > > > /proc/sys/vm/drop_caches > > Sun 09 Aug 2020 11:03:23 AM EDT: 9. On NFS client, dd > > if=/mnt/dir1/file1.bin of=/dev/null > > > > > > I haven't tested much with varying rsize and wsize (setting them on > cephfs is pretty rare), but I'll plan to. What's in nfs-readahead.sh? > > See attached.
On Mon, Aug 10, 2020 at 6:09 AM David Howells <dhowells@redhat.com> wrote: > > David Wysochanski <dwysocha@redhat.com> wrote: > > > Looks like fscache_shape_request() overrides any 'max_pages' value (actually > > it is cachefiles_shape_request) , so it's unclear why the netfs would pass > > in a 'max_pages' if it is not honored - seems like a bug maybe or it's not > > obvious > > I think the problem is that cachefiles_shape_request() is applying the limit > too early. It's using it to cut down the number of pages in the original > request (only applicable to readpages), but then the shaping to fit cache > granules can exceed that, so it needs to be applied later also. > > Does the attached patch help? > > David > --- > diff --git a/fs/cachefiles/content-map.c b/fs/cachefiles/content-map.c > index 2bfba2e41c39..ce05cf1d9a6e 100644 > --- a/fs/cachefiles/content-map.c > +++ b/fs/cachefiles/content-map.c > @@ -134,7 +134,8 @@ void cachefiles_shape_request(struct fscache_object *obj, > _enter("{%lx,%lx,%x},%llx,%d", > start, end, max_pages, i_size, shape->for_write); > > - if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE) { > + if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE || > + max_pages < CACHEFILES_GRAN_PAGES) { > shape->to_be_done = FSCACHE_READ_FROM_SERVER; > return; > } > @@ -144,10 +145,6 @@ void cachefiles_shape_request(struct fscache_object *obj, > if (shape->i_size > CACHEFILES_SIZE_LIMIT) > i_size = CACHEFILES_SIZE_LIMIT; > > - max_pages = round_down(max_pages, CACHEFILES_GRAN_PAGES); > - if (end - start > max_pages) > - end = start + max_pages; > - > granule = start / CACHEFILES_GRAN_PAGES; > if (granule / 8 >= object->content_map_size) { > cachefiles_expand_content_map(object, i_size); > @@ -185,6 +182,10 @@ void cachefiles_shape_request(struct fscache_object *obj, > start = round_down(start, CACHEFILES_GRAN_PAGES); > end = round_up(end, CACHEFILES_GRAN_PAGES); > > + /* Trim to the maximum size the netfs supports */ > + if (end - start > max_pages) > + end 
= round_down(start + max_pages, CACHEFILES_GRAN_PAGES); > + > /* But trim to the end of the file and the starting page */ > eof = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; > if (eof <= shape->proposed_start) > I tried this and got the same panic - I think i_size is the culprit (it is larger than max_pages). I'll send you a larger trace offline with cachefiles/fscache debugging enabled if that helps, but below is some custom tracing that may be enough because it shows before / after shaping values. Here's an outline of the test (smaller rsize and readahead for simplicity): # ./t1_rsize_lt_read.sh 4.1 Setting NFS vers=4.1 Mon 10 Aug 2020 09:34:18 AM EDT: 1. On NFS client, install and enable cachefilesd Mon 10 Aug 2020 09:34:18 AM EDT: 2. On NFS client, mount -o vers=4.1,fsc,rsize=8192 127.0.0.1:/export/dir1 /mnt/dir1 Mon 10 Aug 2020 09:34:18 AM EDT: 3. On NFS client, dd if=/dev/zero of=/mnt/dir1/file1.bin bs=16384 count=1 Mon 10 Aug 2020 09:34:18 AM EDT: 4. On NFS client, echo 3 > /proc/sys/vm/drop_caches Mon 10 Aug 2020 09:34:19 AM EDT: 5. On NFS client, ./nfs-readahead.sh set /mnt/dir1 16384 Mon 10 Aug 2020 09:34:19 AM EDT: 6. On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null Mon 10 Aug 2020 09:34:19 AM EDT: 7. On NFS client, echo 3 > /proc/sys/vm/drop_caches Mon 10 Aug 2020 09:34:19 AM EDT: 8. 
On NFS client, dd if=/mnt/dir1/file1.bin of=/dev/null Console with custom nfs tracing [ 62.955355] t1_rsize_lt_rea (4840): drop_caches: 3 [ 63.028786] fs/nfs/fscache.c:480 before read_helper_page_list pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 pages ffffb4b4c0fafca8 max_pages 2 [ 63.028804] fs/fscache/read_helper.c:347 pid 4882 fscache_read_helper before shape req ffff8902f50b5800 req->nr_pages 0 shape.actual_nr_pages 48 shape.proposed_nr_pages 4 [ 63.037231] fs/fscache/read_helper.c:353 pid 4882 fscache_read_helper after shape req ffff8902f50b5800 req->nr_pages 0 shape.actual_nr_pages 4 shape.proposed_nr_pages 4 [ 63.043421] fs/fscache/read_helper.c:531 pid 4882 fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 1 shape.actual_nr_pages 4 shape.proposed_nr_pages 4 [ 63.049498] fs/fscache/read_helper.c:531 pid 4882 fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 2 shape.actual_nr_pages 4 shape.proposed_nr_pages 4 [ 63.063708] fs/fscache/read_helper.c:531 pid 4882 fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 3 shape.actual_nr_pages 4 shape.proposed_nr_pages 4 [ 63.070114] fs/fscache/read_helper.c:531 pid 4882 fscache_read_helper before while req ffff8902f50b5800 req->nr_pages 4 shape.actual_nr_pages 4 shape.proposed_nr_pages 4 [ 63.076438] fs/nfs/fscache.c:369 enter nfs_issue_op pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 start 0 last 3 [ 63.082964] fs/nfs/fscache.c:379 before readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f08741a00 [ 63.087591] fs/nfs/fscache.c:382 after readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f08741a00 cache.error 0 [ 63.093058] fs/nfs/fscache.c:379 before readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f08288680 [ 63.098927] fs/nfs/fscache.c:382 after readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page 
fffff42f08288680 cache.error 0 [ 63.104507] fs/nfs/fscache.c:379 before readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f082816c0 [ 63.110922] fs/nfs/fscache.c:382 after readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f082816c0 cache.error 0 [ 63.111973] fs/nfs/fscache.c:523 pid 233 before io_done inode ffff8902b4a2a828 bytes 8192 &req->cache ffff8902f50b5800 cache.pos 0 cache.len 16384 [ 63.115407] fs/nfs/fscache.c:379 before readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f067e8f40 [ 63.126337] fs/nfs/fscache.c:382 after readpage_async_filler pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 page fffff42f067e8f40 cache.error 0 [ 63.131411] fs/nfs/fscache.c:388 exit nfs_issue_op pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 [ 63.131955] fs/nfs/fscache.c:523 pid 233 before io_done inode ffff8902b4a2a828 bytes 8192 &req->cache ffff8902f50b5800 cache.pos 0 cache.len 16384 [ 63.137012] fs/nfs/fscache.c:484 after read_helper_page_list pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 cache.pos 0 cache.len 16384 cache.nr_pages 4 pages ffffb4b4c0fafca8 ret 0 [ 63.140922] page:fffff42f08741a00 refcount:2 mapcount:0 mapping:00000000727f3adc index:0x0 [ 63.141091] mapping->aops:nfs_file_aops [nfs] dentry name:"file1.bin" [ 63.146475] fs/nfs/fscache.c:490 outside while(!list_empty(pages)) read_helper_page_list pid 4882 inode ffff8902b4a2a828 cache ffff8902f50b5800 cache.pos 0 cache.len 16384 cache.nr_pages 4 [ 63.146740] fs/fscache/read_helper.c:347 pid 4882 fscache_read_helper before shape req ffff8902f50b5800 req->nr_pages 0 shape.actual_nr_pages 3227832042 shape.proposed_nr_pages 1 [ 63.153662] flags: 0x17ffffc0000006(referenced|uptodate) [ 63.153699] raw: 0017ffffc0000006 dead000000000100 dead000000000122 ffff8902b4a2a9a0 [ 63.168174] fs/fscache/read_helper.c:353 pid 4882 fscache_read_helper after shape req ffff8902f50b5800 
req->nr_pages 0 shape.actual_nr_pages 5 shape.proposed_nr_pages 1 [ 63.193131] raw: 0000000000000000 0000000000000000 00000001ffffffff ffff8902ecfe8000 [ 63.203785] page dumped because: VM_BUG_ON_PAGE(!PageLocked(page)) [ 63.206372] page->mem_cgroup:ffff8902ecfe8000 [ 63.208333] ------------[ cut here ]------------ [ 63.211081] kernel BUG at mm/filemap.c:1290! [ 63.213152] invalid opcode: 0000 [#1] SMP PTI
On Mon, Aug 10, 2020 at 9:50 AM David Wysochanski <dwysocha@redhat.com> wrote: > > On Mon, Aug 10, 2020 at 6:09 AM David Howells <dhowells@redhat.com> wrote: > > > > David Wysochanski <dwysocha@redhat.com> wrote: > > > > > Looks like fscache_shape_request() overrides any 'max_pages' value (actually > > > it is cachefiles_shape_request) , so it's unclear why the netfs would pass > > > in a 'max_pages' if it is not honored - seems like a bug maybe or it's not > > > obvious > > > > I think the problem is that cachefiles_shape_request() is applying the limit > > too early. It's using it to cut down the number of pages in the original > > request (only applicable to readpages), but then the shaping to fit cache > > granules can exceed that, so it needs to be applied later also. > > > > Does the attached patch help? > > > > David > > --- > > diff --git a/fs/cachefiles/content-map.c b/fs/cachefiles/content-map.c > > index 2bfba2e41c39..ce05cf1d9a6e 100644 > > --- a/fs/cachefiles/content-map.c > > +++ b/fs/cachefiles/content-map.c > > @@ -134,7 +134,8 @@ void cachefiles_shape_request(struct fscache_object *obj, > > _enter("{%lx,%lx,%x},%llx,%d", > > start, end, max_pages, i_size, shape->for_write); > > > > - if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE) { > > + if (start >= CACHEFILES_SIZE_LIMIT / PAGE_SIZE || > > + max_pages < CACHEFILES_GRAN_PAGES) { > > shape->to_be_done = FSCACHE_READ_FROM_SERVER; > > return; > > } > > @@ -144,10 +145,6 @@ void cachefiles_shape_request(struct fscache_object *obj, > > if (shape->i_size > CACHEFILES_SIZE_LIMIT) > > i_size = CACHEFILES_SIZE_LIMIT; > > > > - max_pages = round_down(max_pages, CACHEFILES_GRAN_PAGES); > > - if (end - start > max_pages) > > - end = start + max_pages; > > - > > granule = start / CACHEFILES_GRAN_PAGES; > > if (granule / 8 >= object->content_map_size) { > > cachefiles_expand_content_map(object, i_size); > > @@ -185,6 +182,10 @@ void cachefiles_shape_request(struct fscache_object *obj, > > start = 
round_down(start, CACHEFILES_GRAN_PAGES); > > end = round_up(end, CACHEFILES_GRAN_PAGES); > > > > + /* Trim to the maximum size the netfs supports */ > > + if (end - start > max_pages) > > + end = round_down(start + max_pages, CACHEFILES_GRAN_PAGES); > > + > > /* But trim to the end of the file and the starting page */ > > eof = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; > > if (eof <= shape->proposed_start) > > > > I tried this and got the same panic - I think i_size is the culprit > (it is larger than max_pages). I'll send you a larger trace offline > with cachefiles/fscache debugging enabled if that helps, but below is > some custom tracing that may be enough because it shows before / after > shaping values. > FWIW, after testing and tracing the aforementioned patch, it turns out i_size is not the culprit after all. I added the small patch below on top of the patch to cachefiles_shape_request() and there are no more panics. Though this may not address the full underlying issue, it at least gets past this point, and max_pages seems to work better. --- diff --git a/fs/fscache/read_helper.c b/fs/fscache/read_helper.c index a464c3e3188a..fa67339e7304 100644 --- a/fs/fscache/read_helper.c +++ b/fs/fscache/read_helper.c @@ -318,8 +318,8 @@ static int fscache_read_helper(struct fscache_io_request *req, switch (type) { case FSCACHE_READ_PAGE_LIST: shape.proposed_start = lru_to_page(pages)->index; - shape.proposed_nr_pages = - lru_to_last_page(pages)->index - shape.proposed_start + 1; + shape.proposed_nr_pages = min_t(unsigned int, max_pages, + lru_to_last_page(pages)->index - shape.proposed_start + 1); break; case FSCACHE_READ_LOCKED_PAGE:
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cee497c108bb..8905fe4a0930 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -377,76 +377,23 @@ static int ceph_readpage(struct file *filp, struct page *page) return err; } -/* - * Finish an async read(ahead) op. - */ -static void finish_read(struct ceph_osd_request *req) -{ - struct inode *inode = req->r_inode; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_data *osd_data; - int rc = req->r_result <= 0 ? req->r_result : 0; - int bytes = req->r_result >= 0 ? req->r_result : 0; - int num_pages; - int i; - - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLACKLISTED) - ceph_inode_to_client(inode)->blacklisted = true; - - /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - for (i = 0; i < num_pages; i++) { - struct page *page = osd_data->pages[i]; - - if (rc < 0 && rc != -ENOENT) - goto unlock; - if (bytes < (int)PAGE_SIZE) { - /* zero (remainder of) page */ - int s = bytes < 0 ? 0 : bytes; - zero_user_segment(page, s, PAGE_SIZE); - } - dout("finish_read %p uptodate %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); -unlock: - unlock_page(page); - put_page(page); - bytes -= PAGE_SIZE; - } - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - kfree(osd_data->pages); -} - -/* - * start an async read(ahead) operation. return nr_pages we submitted - * a read for on success, or negative error code. 
- */ -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, - struct list_head *page_list, int max) +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) { - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = lru_to_page(page_list); - struct ceph_vino vino; - struct ceph_osd_request *req; - u64 off; - u64 len; - int i; - struct page **pages; - pgoff_t next_index; - int nr_pages = 0; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_file_info *fi = file->private_data; + struct ceph_rw_context *rw_ctx; + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); int got = 0; int ret = 0; + int max = fsc->mount_options->rsize >> PAGE_SHIFT; + + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + rw_ctx = ceph_find_rw_context(fi); if (!rw_ctx) { /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ @@ -459,133 +406,33 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, dout("start_read %p, no cache cap\n", inode); ret = 0; } - if (ret <= 0) { - if (got) - ceph_put_cap_refs(ci, got); - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - put_page(page); - } - return ret; - } + if (ret <= 0) + goto out; } - off = (u64) page_offset(page); + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", + inode, file, rw_ctx, nr_pages, max); - /* count pages */ - next_index = page->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index != next_index) - break; - nr_pages++; - next_index++; - if (max && nr_pages == max) - break; - } - len = nr_pages << PAGE_SHIFT; - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, - off, len); - vino = 
ceph_vino(inode); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 0, 1, CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out; - } + while (ret >= 0 && !list_empty(page_list)) { + struct ceph_fscache_req *req = ceph_fsreq_alloc(); - /* build page vector */ - nr_pages = calc_pages_for(0, len); - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_put; - } - for (i = 0; i < nr_pages; ++i) { - page = list_entry(page_list->prev, struct page, lru); - BUG_ON(PageLocked(page)); - list_del(&page->lru); - - dout("start_read %p adding %p idx %lu\n", inode, page, - page->index); - if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_KERNEL)) { - put_page(page); - dout("start_read %p add_to_page_cache failed %p\n", - inode, page); - nr_pages = i; - if (nr_pages > 0) { - len = nr_pages << PAGE_SHIFT; - osd_req_op_extent_update(req, 0, len); - break; - } - goto out_pages; + if (!req) { + ret = -ENOMEM; + break; } - pages[i] = page; - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); - req->r_callback = finish_read; - req->r_inode = inode; - - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto out_pages; - ceph_osdc_put_request(req); - - /* After adding locked pages to page cache, the inode holds cache cap. - * So we can drop our cap refs. 
*/ - if (got) - ceph_put_cap_refs(ci, got); - - return nr_pages; + fscache_init_io_request(&req->fscache_req, cookie, &ceph_readpage_fsreq_ops); + req->fscache_req.mapping = inode->i_mapping; -out_pages: - for (i = 0; i < nr_pages; ++i) { - unlock_page(pages[i]); + ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max); + ceph_fsreq_put(&req->fscache_req); } - ceph_put_page_vector(pages, nr_pages, false); -out_put: - ceph_osdc_put_request(req); out: + /* After adding locked pages to page cache, the inode holds Fc refs. We can drop ours. */ if (got) ceph_put_cap_refs(ci, got); - return ret; -} - -/* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. - */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) -{ - struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_file_info *fi = file->private_data; - struct ceph_rw_context *rw_ctx; - int rc = 0; - int max = 0; - - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; - - rw_ctx = ceph_find_rw_context(fi); - max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", - inode, file, rw_ctx, nr_pages, max); - while (!list_empty(page_list)) { - rc = start_read(inode, rw_ctx, page_list, max); - if (rc < 0) - goto out; - } -out: - dout("readpages %p file %p ret %d\n", inode, file, rc); - return rc; + dout("readpages %p file %p ret %d\n", inode, file, ret); + return ret; } struct ceph_writeback_ctl
Convert ceph_readpages to use the fscache_read_helper. With this we can rip out a lot of the old readpage/readpages infrastructure. Signed-off-by: Jeff Layton <jlayton@kernel.org> --- fs/ceph/addr.c | 209 +++++++------------------------------------------ 1 file changed, 28 insertions(+), 181 deletions(-)