From patchwork Mon Aug 27 13:45:21 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Luis Henriques X-Patchwork-Id: 10577103 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 8DB6217DE for ; Mon, 27 Aug 2018 13:44:33 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 7BCF728CFA for ; Mon, 27 Aug 2018 13:44:33 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 6FD8E29421; Mon, 27 Aug 2018 13:44:33 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 1772D28CFA for ; Mon, 27 Aug 2018 13:44:33 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727460AbeH0RbN (ORCPT ); Mon, 27 Aug 2018 13:31:13 -0400 Received: from mx2.suse.de ([195.135.220.15]:41592 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726934AbeH0RbN (ORCPT ); Mon, 27 Aug 2018 13:31:13 -0400 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay1.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id EBE28AFD9; Mon, 27 Aug 2018 13:44:29 +0000 (UTC) From: Luis Henriques To: "Yan, Zheng" , Sage Weil , Ilya Dryomov , Gregory Farnum Cc: ceph-devel@vger.kernel.org, Luis Henriques Subject: [RFC PATCH v2 1/3] ceph: add non-blocking parameter to ceph_try_get_caps() Date: Mon, 27 Aug 2018 14:45:21 +0100 Message-Id: <20180827134523.6758-2-lhenriques@suse.com> In-Reply-To: 
<20180827134523.6758-1-lhenriques@suse.com> References: <20180827134523.6758-1-lhenriques@suse.com> Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP ceph_try_get_caps currently calls try_get_cap_refs with the nonblock parameter always set to 'true'. This change adds a new parameter that allows its value to be set. This will be useful for a follow-up patch that will need to get two sets of capabilities for two different inodes without risking a deadlock. Signed-off-by: Luis Henriques --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 7 ++++--- fs/ceph/super.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 292b3d72d725..b0392bfbd1ae 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 990258cbd836..18f1e1f2acba 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2685,17 +2685,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, + bool nonblock, int *got) { int ret, err = 0; BUG_ON(need & ~CEPH_CAP_FILE_RD); - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - ret 
= try_get_cap_refs(ci, need, want, 0, true, got, &err); + ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); if (ret) { if (err == -EAGAIN) { ret = 0; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a7077a0c989f..bd8c76b167ca 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1006,7 +1006,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); extern int ceph_try_get_caps(struct ceph_inode_info *ci, - int need, int want, int *got); + int need, int want, bool nonblock, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); From patchwork Mon Aug 27 13:45:22 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Luis Henriques X-Patchwork-Id: 10577105 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id D50F917DE for ; Mon, 27 Aug 2018 13:44:35 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id C280028CFA for ; Mon, 27 Aug 2018 13:44:35 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id B6BC129572; Mon, 27 Aug 2018 13:44:35 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 377D028CFA for ; Mon, 27 Aug 2018 13:44:35 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727471AbeH0RbP (ORCPT ); Mon, 27 Aug 2018 
13:31:15 -0400 Received: from mx2.suse.de ([195.135.220.15]:41604 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726934AbeH0RbP (ORCPT ); Mon, 27 Aug 2018 13:31:15 -0400 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id B00D2AFDA; Mon, 27 Aug 2018 13:44:31 +0000 (UTC) From: Luis Henriques To: "Yan, Zheng" , Sage Weil , Ilya Dryomov , Gregory Farnum Cc: ceph-devel@vger.kernel.org, Luis Henriques Subject: [RFC PATCH v2 2/3] ceph: support the RADOS copy-from operation Date: Mon, 27 Aug 2018 14:45:22 +0100 Message-Id: <20180827134523.6758-3-lhenriques@suse.com> In-Reply-To: <20180827134523.6758-1-lhenriques@suse.com> References: <20180827134523.6758-1-lhenriques@suse.com> Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Add support for performing remote object copies using the 'copy-from' operation. 
Signed-off-by: Luis Henriques --- include/linux/ceph/osd_client.h | 17 ++++++++ include/linux/ceph/rados.h | 19 +++++++++ net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d6ee04b4c41..898d03bf80db 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -136,6 +136,13 @@ struct ceph_osd_req_op { u64 expected_object_size; u64 expected_write_size; } alloc_hint; + struct { + u64 snapid; + u64 src_version; + u8 flags; + u32 src_fadvise_flags; + struct ceph_osd_data osd_data; + } copy_from; }; }; @@ -511,6 +518,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec *mtime, struct page **pages, int nr_pages); +extern int ceph_osdc_copy_from(struct ceph_osd_client *osdc, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u64 dst_snapid, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u8 dst_fadvise_flags); + /* watch/notify */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f1988387c5ad..d47540986eff 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -410,6 +410,14 @@ enum { enum { CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in + the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed + in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only + once by this client */ }; #define EOLDSNAPC ERESTART /* 
ORDERSNAP flag set; writer has old snapc*/ @@ -497,6 +505,17 @@ struct ceph_osd_op { __le64 expected_object_size; __le64 expected_write_size; } __attribute__ ((packed)) alloc_hint; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; + /* + * __le32 flags: CEPH_OSD_OP_FLAG_FADVISE_: mean the + * fadvise flags for dest object src_fadvise_flags mean + * the fadvise flags for src object + */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; }; __le32 payload_len; } __attribute__ ((packed)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a00c74f1154e..ce55d6080042 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -955,6 +955,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; + case CEPH_OSD_OP_COPY_FROM: + dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); + dst->copy_from.src_version = + cpu_to_le64(src->copy_from.src_version); + dst->copy_from.flags = src->copy_from.flags; + dst->copy_from.src_fadvise_flags = + cpu_to_le32(src->copy_from.src_fadvise_flags); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1908,6 +1916,10 @@ static void setup_request_data(struct ceph_osd_request *req, ceph_osdc_msg_data_add(req->r_reply, &op->notify.response_data); break; + case CEPH_OSD_OP_COPY_FROM: + ceph_osdc_msg_data_add(msg, + &op->copy_from.osd_data); + break; } data_len += op->indata_len; @@ -5168,6 +5180,66 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_copy_from(struct ceph_osd_client *osdc, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u64 dst_snapid, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u8 dst_fadvise_flags) +{ + struct ceph_osd_request *req = NULL; + struct ceph_options *opts = 
osdc->client->options; + struct ceph_osd_req_op *op; + struct page **pages; + void *p, *end; + int ret; + + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) + return PTR_ERR(pages); + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) { + ret = -ENOMEM; + goto out; + } + req->r_flags = CEPH_OSD_FLAG_WRITE; + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, 0); + op->copy_from.snapid = src_snapid; + op->copy_from.src_version = src_version; + op->copy_from.flags = dst_fadvise_flags; + op->copy_from.src_fadvise_flags = src_fadvise_flags; + + ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); + ceph_oid_copy(&req->r_t.base_oid, dst_oid); + + p = page_address(pages[0]); + end = p + PAGE_SIZE; + ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); + encode_oloc(&p, end, src_oloc); + op->indata_len = PAGE_SIZE - (end - p); + + ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, + op->indata_len, 0, + false, true); + req->r_snapid = dst_snapid; + req->r_data_offset = 0; /* XXX dst_off? 
*/ + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) + goto out; + ceph_osdc_start_request(osdc, req, false); + ret = wait_request_timeout(req, opts->mount_timeout); +out: + ceph_release_page_vector(pages, 1); + if (req) + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_copy_from); + int __init ceph_osdc_setup(void) { size_t size = sizeof(struct ceph_osd_request) + From patchwork Mon Aug 27 13:45:23 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Luis Henriques X-Patchwork-Id: 10577107 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id B1E5614BD for ; Mon, 27 Aug 2018 13:44:37 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 9D42128CFA for ; Mon, 27 Aug 2018 13:44:37 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 91A0129572; Mon, 27 Aug 2018 13:44:37 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id E947628CFA for ; Mon, 27 Aug 2018 13:44:36 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727485AbeH0RbR (ORCPT ); Mon, 27 Aug 2018 13:31:17 -0400 Received: from mx2.suse.de ([195.135.220.15]:41618 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1727463AbeH0RbR (ORCPT ); Mon, 27 Aug 2018 13:31:17 -0400 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay1.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id 
D0418AFCC; Mon, 27 Aug 2018 13:44:32 +0000 (UTC) From: Luis Henriques To: "Yan, Zheng" , Sage Weil , Ilya Dryomov , Gregory Farnum Cc: ceph-devel@vger.kernel.org, Luis Henriques Subject: [RFC PATCH v2 3/3] ceph: support copy_file_range file operation Date: Mon, 27 Aug 2018 14:45:23 +0100 Message-Id: <20180827134523.6758-4-lhenriques@suse.com> In-Reply-To: <20180827134523.6758-1-lhenriques@suse.com> References: <20180827134523.6758-1-lhenriques@suse.com> Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP This commit implements support for the copy_file_range syscall in cephfs. It is implemented using the RADOS 'copy-from' operation, which allows doing a remote object copy without the need to download/upload data from/to the OSDs. Some manual copy may however be required if the source/destination file offsets aren't object aligned or if the copy length is smaller than the object size. Signed-off-by: Luis Henriques --- fs/ceph/file.c | 225 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ad0bed99b1d5..8939ec224144 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -1820,6 +1821,229 @@ static long ceph_fallocate(struct file *file, int mode, return ret; } +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(src_inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; + struct ceph_object_locator src_oloc, dst_oloc; + loff_t endoff 
= 0; + loff_t size; + ssize_t ret = -EIO; + int src_got = 0; + int dst_got = 0; + bool retrying = false; + + if (src_inode == dst_inode) + return -EINVAL; + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source file */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) + goto out; + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting lenght + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. + */ + if (src_off + len > size) { + ret = -EOPNOTSUPP; + goto out; + } + if (!len) { + ret = 0; + goto out; + } + size = i_size_read(dst_inode); + endoff = dst_off + len; + ret = inode_newsize_ok(dst_inode, endoff); + if (ret) + goto out; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) { + ret = -EDQUOT; + goto out; + } + +retry_caps: + ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + endoff, &dst_got, NULL); + if (ret < 0) + goto out; + /* + * We also need to get FILE_RD capabilities for source file as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. + * + * However, since we're already holding the FILE_WR capability for the + * source file, we would risk a deadlock by using ceph_get_caps. Thus, + * we'll do some retry dance instead to try to get both capabilities. + * If everything fails, we just return -EOPNOTSUPP and fallback to the + * VFS default copy_file_range implementation. 
+ */ + ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, &src_got); + if (ret <= 0) { + if (retrying) { + ret = -EOPNOTSUPP; + goto out_dst_caps; + } + /* Start by dropping dsc_ci caps and getting src_ci caps */ + ceph_put_cap_refs(dst_ci, dst_got); + ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, + (src_off + len), &src_got, NULL); + if (ret < 0) { + ret = -EOPNOTSUPP; + goto out; + } + /*... drop them too, and retry */ + ceph_put_cap_refs(src_ci, src_got); + retrying = true; + goto retry_caps; + } + + /* Drop dst file cached pages */ + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + endoff >> PAGE_SHIFT); + if (ret < 0) { + printk("Failed to invalidate inode pages (%ld)\n", ret); + ret = 0; /* XXX */ + } + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + /* + * TODO: should file_start_write/file_end_write be used for the whole + * loop? Or any other locking? 
+ */ + while (len > 0) { + struct ceph_object_id src_oid, dst_oid; + u64 objnum, objoff; + u32 objlen; + size_t copy_len = min_t(size_t, src_ci->i_layout.object_size, len); + int err = 0; + + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + copy_len, &objnum, &objoff, + &objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, objnum); + + /* Do manual copy if: + * - source file offset isn't object aligned, or + * - copy length is smaller than object size + */ + if (objoff || (copy_len < src_ci->i_layout.object_size)) { + /* Do not copy beyond this object */ + if (copy_len > objlen) + copy_len = objlen; + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, copy_len, flags); + if (err < 0) { + ret = err; + goto out_caps; + } + len -= copy_len; + ret += copy_len; + continue; + } + + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + copy_len, &objnum, &objoff, + &objlen); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, objnum); + /* Again... do a manual copy if: + * - destination file offset isn't object aligned, or + * - copy length is smaller than object size + * (although the object size should be the same for different + * files in the same filesystem...) + */ + if (objoff || (copy_len < dst_ci->i_layout.object_size)) { + if (copy_len > objlen) + copy_len = objlen; + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, copy_len, flags); + if (err < 0) { + ret = err; + goto out_caps; + } + len -= copy_len; + ceph_oid_destroy(&src_oid); + ret += copy_len; + continue; + } + /* Finally... do an object remote copy */ + err = ceph_osdc_copy_from(osdc, src_ci->i_vino.snap, + 0, /* XXX src_ci->i_version ? 
*/ + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED, + dst_ci->i_vino.snap, &dst_oid, + &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + if (err) { + printk("copy_from returned an error: %d\n", err); /* XXX */ + ret = err; + goto out_caps; + } + len -= copy_len; + src_off += copy_len; + dst_off += copy_len; + ret += copy_len; + ceph_oid_destroy(&src_oid); + ceph_oid_destroy(&dst_oid); + } + /* Let the MDS know about destination object size change */ + if (endoff > size) { + int dirty; + int caps_flags = CHECK_CAPS_AUTHONLY; + + if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_NODELAY; + if (ceph_inode_set_size(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_AUTHONLY; + if (caps_flags) + ceph_check_caps(dst_ci, caps_flags, NULL); + spin_lock(&dst_ci->i_ceph_lock); + dst_ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps( + dst_ci, + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER, + &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + } +out_caps: + ceph_put_cap_refs(src_ci, src_got); +out_dst_caps: + ceph_put_cap_refs(dst_ci, dst_got); +out: + ceph_free_cap_flush(prealloc_cf); + + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -1835,5 +2059,6 @@ const struct file_operations ceph_file_fops = { .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, };