Message ID | 20200115205912.38688-11-jlayton@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: asynchronous file create support | expand |
On 1/16/20 4:59 AM, Jeff Layton wrote: > With the Octopus release, the MDS will hand out directory create caps. > > If we have Fxc caps on the directory, and complete directory information > or a known negative dentry, then we can return without waiting on the > reply, allowing the open() call to return very quickly to userland. > > We use the normal ceph_fill_inode() routine to fill in the inode, so we > have to gin up some reply inode information with what we'd expect the > newly-created inode to have. The client assumes that it has a full set > of caps on the new inode, and that the MDS will revoke them when there > is conflicting access. > > This functionality is gated on the enable_async_dirops module option, > along with async unlinks, and on the server supporting the necessary > CephFS feature bit. > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- > include/linux/ceph/ceph_fs.h | 3 + > 2 files changed, 190 insertions(+), 9 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index b44ccbc85fe4..2742417fa5ec 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) > spin_unlock(&cdst->i_ceph_lock); > } > > +/* > + * Try to set up an async create. We need caps, a file layout, and inode number, > + * and either a lease on the dentry or complete dir info. If any of those > + * criteria are not satisfied, then return false and the caller can go > + * synchronous. > + */ > +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, > + struct ceph_file_layout *lo, > + unsigned long *pino) > +{ > + struct ceph_inode_info *ci = ceph_inode(dir); > + bool ret = false; > + unsigned long ino; > + > + spin_lock(&ci->i_ceph_lock); > + /* No auth cap means no chance for Dc caps */ > + if (!ci->i_auth_cap) > + goto no_async; > + > + /* Any delegated inos? 
*/ > + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) > + goto no_async; > + > + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) > + goto no_async; > + > + /* Use LOOKUP_RCU since we're under i_ceph_lock */ > + if (!__ceph_dir_is_complete(ci) && > + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) > + goto no_async; > + > + if (!(__ceph_caps_issued(ci, NULL) & > + (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE))) > + goto no_async; > + (ceph_caps_issued(ci, NULL) & CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) == (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE) > + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); > + if (!ino) > + goto no_async; > + > + *pino = ino; > + ceph_take_cap_refs(ci, CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE, false); > + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); > + rcu_assign_pointer(lo->pool_ns, > + ceph_try_get_string(ci->i_cached_layout.pool_ns)); > + ret = true; > +no_async: > + spin_unlock(&ci->i_ceph_lock); > + return ret; > +} > + > +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, > + struct ceph_mds_request *req) > +{ > + mapping_set_error(req->r_parent->i_mapping, req->r_err); > + > + if (req->r_target_inode) { > + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); > + u64 ino = ceph_vino(req->r_target_inode).ino; > + > + if (req->r_deleg_ino != ino) > + pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%lx target=0x%llx\n", > + __func__, req->r_err, req->r_deleg_ino, ino); > + mapping_set_error(req->r_target_inode->i_mapping, req->r_err); > + > + spin_lock(&ci->i_ceph_lock); > + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { > + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; > + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); > + } > + spin_unlock(&ci->i_ceph_lock); > + } else { > + pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__, > + req->r_deleg_ino); > + } > + ceph_put_cap_refs(ceph_inode(req->r_parent), > + CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE); > +} > + > +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, > + struct file *file, umode_t mode, > + struct ceph_mds_request *req, > + struct ceph_acl_sec_ctx *as_ctx, > + struct ceph_file_layout *lo) > +{ > + int ret; > + char xattr_buf[4]; > + struct ceph_mds_reply_inode in = { }; > + struct ceph_mds_reply_info_in iinfo = { .in = &in }; > + struct ceph_inode_info *ci = ceph_inode(dir); > + struct inode *inode; > + struct timespec64 now; > + struct ceph_vino vino = { .ino = req->r_deleg_ino, > + .snap = CEPH_NOSNAP }; > + > + ktime_get_real_ts64(&now); > + > + inode = ceph_get_inode(dentry->d_sb, vino); > + if (IS_ERR(inode)) > + return PTR_ERR(inode); > + > + iinfo.inline_version = CEPH_INLINE_NONE; > + iinfo.change_attr = 1; > + ceph_encode_timespec64(&iinfo.btime, &now); > + > + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); > + iinfo.xattr_data = xattr_buf; > + memset(iinfo.xattr_data, 0, iinfo.xattr_len); > + > + in.ino = cpu_to_le64(vino.ino); > + in.snapid = cpu_to_le64(CEPH_NOSNAP); > + in.version = cpu_to_le64(1); // ??? 
> + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); > + in.cap.cap_id = cpu_to_le64(1); > + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); > + in.cap.flags = CEPH_CAP_FLAG_AUTH; > + in.ctime = in.mtime = in.atime = iinfo.btime; > + in.mode = cpu_to_le32((u32)mode); > + in.truncate_seq = cpu_to_le32(1); > + in.truncate_size = cpu_to_le64(-1ULL); > + in.xattr_version = cpu_to_le64(1); > + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); > + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? > + dir->i_gid : current_fsgid())); > + in.nlink = cpu_to_le32(1); > + in.max_size = cpu_to_le64(lo->stripe_unit); > + > + ceph_file_layout_to_legacy(lo, &in.layout); > + > + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, > + req->r_fmode, NULL); > + if (ret) { > + dout("%s failed to fill inode: %d\n", __func__, ret); > + if (inode->i_state & I_NEW) > + discard_new_inode(inode); > + } else { > + struct dentry *dn; > + > + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, > + vino.ino, dir->i_ino, dentry->d_name.name); > + ceph_dir_clear_ordered(dir); > + ceph_init_inode_acls(inode, as_ctx); > + if (inode->i_state & I_NEW) { > + /* > + * If it's not I_NEW, then someone created this before > + * we got here. Assume the server is aware of it at > + * that point and don't worry about setting > + * CEPH_I_ASYNC_CREATE. > + */ > + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; > + unlock_new_inode(inode); > + } > + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { > + if (!d_unhashed(dentry)) > + d_drop(dentry); > + dn = d_splice_alias(inode, dentry); > + WARN_ON_ONCE(dn && dn != dentry); > + } > + file->f_mode |= FMODE_CREATED; > + ret = finish_open(file, dentry, ceph_open); > + } > + return ret; > +} > + > /* > * Do a lookup + open with a single request. If we get a non-existent > * file or symlink, return 1 so the VFS can retry. 
> @@ -460,6 +623,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > struct ceph_mds_request *req; > struct dentry *dn; > struct ceph_acl_sec_ctx as_ctx = {}; > + bool try_async = enable_async_dirops; > int mask; > int err; > > @@ -492,28 +656,41 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > } > req->r_dentry = dget(dentry); > req->r_num_caps = 2; > + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > + if (ceph_security_xattr_wanted(dir)) > + mask |= CEPH_CAP_XATTR_SHARED; > + req->r_args.open.mask = cpu_to_le32(mask); > + req->r_parent = dir; > + > if (flags & O_CREAT) { > + struct ceph_file_layout lo; > + > req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; > req->r_dentry_unless = CEPH_CAP_FILE_EXCL; > if (as_ctx.pagelist) { > req->r_pagelist = as_ctx.pagelist; > as_ctx.pagelist = NULL; > } > + if (try_async && try_prep_async_create(dir, dentry, &lo, > + &req->r_deleg_ino)) { > + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); > + req->r_callback = ceph_async_create_cb; > + err = ceph_mdsc_submit_request(mdsc, dir, req); > + if (!err) > + err = ceph_finish_async_create(dir, dentry, > + file, mode, req, > + &as_ctx, &lo); > + goto out_req; > + } > } > > - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > - if (ceph_security_xattr_wanted(dir)) > - mask |= CEPH_CAP_XATTR_SHARED; > - req->r_args.open.mask = cpu_to_le32(mask); > - > - req->r_parent = dir; > set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > err = ceph_mdsc_do_request(mdsc, > (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, > req); > err = ceph_handle_snapdir(req, dentry, err); > if (err) > - goto out_req; > + goto out_fmode; > > if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) > err = ceph_handle_notrace_create(dir, dentry); > @@ -527,7 +704,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > dn = NULL; > } > if (err) > - goto out_req; > + goto out_fmode; > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { > /* make vfs retry on splice, ENOENT, or symlink */ > dout("atomic_open finish_no_open on dn %p\n", dn); > @@ -543,9 +720,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > } > err = finish_open(file, dentry, ceph_open); > } > -out_req: > +out_fmode: > if (!req->r_err && req->r_target_inode) > ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); > +out_req: > ceph_mdsc_put_request(req); > out_ctx: > ceph_release_acl_sec_ctx(&as_ctx); > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > index 91d09cf37649..e035c5194005 100644 > --- a/include/linux/ceph/ceph_fs.h > +++ b/include/linux/ceph/ceph_fs.h > @@ -659,6 +659,9 @@ int ceph_flags_to_mode(int flags); > #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ > CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ > CEPH_CAP_PIN) > +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ > + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ > + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) > > #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ > CEPH_LOCK_IXATTR) >
On Thu, 2020-01-16 at 23:09 +0800, Yan, Zheng wrote: > On 1/16/20 4:59 AM, Jeff Layton wrote: > > With the Octopus release, the MDS will hand out directory create caps. > > > > If we have Fxc caps on the directory, and complete directory information > > or a known negative dentry, then we can return without waiting on the > > reply, allowing the open() call to return very quickly to userland. > > > > We use the normal ceph_fill_inode() routine to fill in the inode, so we > > have to gin up some reply inode information with what we'd expect the > > newly-created inode to have. The client assumes that it has a full set > > of caps on the new inode, and that the MDS will revoke them when there > > is conflicting access. > > > > This functionality is gated on the enable_async_dirops module option, > > along with async unlinks, and on the server supporting the necessary > > CephFS feature bit. > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- > > include/linux/ceph/ceph_fs.h | 3 + > > 2 files changed, 190 insertions(+), 9 deletions(-) > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > index b44ccbc85fe4..2742417fa5ec 100644 > > --- a/fs/ceph/file.c > > +++ b/fs/ceph/file.c > > @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) > > spin_unlock(&cdst->i_ceph_lock); > > } > > > > +/* > > + * Try to set up an async create. We need caps, a file layout, and inode number, > > + * and either a lease on the dentry or complete dir info. If any of those > > + * criteria are not satisfied, then return false and the caller can go > > + * synchronous. 
> > + */ > > +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, > > + struct ceph_file_layout *lo, > > + unsigned long *pino) > > +{ > > + struct ceph_inode_info *ci = ceph_inode(dir); > > + bool ret = false; > > + unsigned long ino; > > + > > + spin_lock(&ci->i_ceph_lock); > > + /* No auth cap means no chance for Dc caps */ > > + if (!ci->i_auth_cap) > > + goto no_async; > > + > > + /* Any delegated inos? */ > > + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) > > + goto no_async; > > + > > + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) > > + goto no_async; > > + > > + /* Use LOOKUP_RCU since we're under i_ceph_lock */ > > + if (!__ceph_dir_is_complete(ci) && > > + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) > > + goto no_async; > > + > > + if (!(__ceph_caps_issued(ci, NULL) & > > + (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE))) > > + goto no_async; > > + > > (ceph_caps_issued(ci, NULL) & > CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) == > (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE) > Good catch! Fixed in my tree. Retesting now and will fold that one in. > > + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); > > + if (!ino) > > + goto no_async; > > + > > + *pino = ino; > > + ceph_take_cap_refs(ci, CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE, false); > > + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); > > + rcu_assign_pointer(lo->pool_ns, > > + ceph_try_get_string(ci->i_cached_layout.pool_ns)); > > + ret = true; > > +no_async: > > + spin_unlock(&ci->i_ceph_lock); > > + return ret; > > +} > > + > > +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, > > + struct ceph_mds_request *req) > > +{ > > + mapping_set_error(req->r_parent->i_mapping, req->r_err); > > + > > + if (req->r_target_inode) { > > + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); > > + u64 ino = ceph_vino(req->r_target_inode).ino; > > + > > + if (req->r_deleg_ino != ino) > > + pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%lx target=0x%llx\n", > > + __func__, req->r_err, req->r_deleg_ino, ino); > > + mapping_set_error(req->r_target_inode->i_mapping, req->r_err); > > + > > + spin_lock(&ci->i_ceph_lock); > > + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { > > + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; > > + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); > > + } > > + spin_unlock(&ci->i_ceph_lock); > > + } else { > > + pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__, > > + req->r_deleg_ino); > > + } > > + ceph_put_cap_refs(ceph_inode(req->r_parent), > > + CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE); > > +} > > + > > +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, > > + struct file *file, umode_t mode, > > + struct ceph_mds_request *req, > > + struct ceph_acl_sec_ctx *as_ctx, > > + struct ceph_file_layout *lo) > > +{ > > + int ret; > > + char xattr_buf[4]; > > + struct ceph_mds_reply_inode in = { }; > > + struct ceph_mds_reply_info_in iinfo = { .in = &in }; > > + struct ceph_inode_info *ci = ceph_inode(dir); > > + struct inode *inode; > > + struct timespec64 now; > > + struct ceph_vino vino = { .ino = req->r_deleg_ino, > > + .snap = CEPH_NOSNAP }; > > + > > + ktime_get_real_ts64(&now); > > + > > + inode = ceph_get_inode(dentry->d_sb, vino); > > + if (IS_ERR(inode)) > > + return PTR_ERR(inode); > > + > > + iinfo.inline_version = CEPH_INLINE_NONE; > > + iinfo.change_attr = 1; > > + ceph_encode_timespec64(&iinfo.btime, &now); > > + > > + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); > > + iinfo.xattr_data = xattr_buf; > > + memset(iinfo.xattr_data, 0, iinfo.xattr_len); > > + > > + in.ino = cpu_to_le64(vino.ino); > > + in.snapid = cpu_to_le64(CEPH_NOSNAP); > > + in.version = cpu_to_le64(1); // ??? 
> > + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); > > + in.cap.cap_id = cpu_to_le64(1); > > + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); > > + in.cap.flags = CEPH_CAP_FLAG_AUTH; > > + in.ctime = in.mtime = in.atime = iinfo.btime; > > + in.mode = cpu_to_le32((u32)mode); > > + in.truncate_seq = cpu_to_le32(1); > > + in.truncate_size = cpu_to_le64(-1ULL); > > + in.xattr_version = cpu_to_le64(1); > > + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); > > + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? > > + dir->i_gid : current_fsgid())); > > + in.nlink = cpu_to_le32(1); > > + in.max_size = cpu_to_le64(lo->stripe_unit); > > + > > + ceph_file_layout_to_legacy(lo, &in.layout); > > + > > + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, > > + req->r_fmode, NULL); > > + if (ret) { > > + dout("%s failed to fill inode: %d\n", __func__, ret); > > + if (inode->i_state & I_NEW) > > + discard_new_inode(inode); > > + } else { > > + struct dentry *dn; > > + > > + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, > > + vino.ino, dir->i_ino, dentry->d_name.name); > > + ceph_dir_clear_ordered(dir); > > + ceph_init_inode_acls(inode, as_ctx); > > + if (inode->i_state & I_NEW) { > > + /* > > + * If it's not I_NEW, then someone created this before > > + * we got here. Assume the server is aware of it at > > + * that point and don't worry about setting > > + * CEPH_I_ASYNC_CREATE. > > + */ > > + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; > > + unlock_new_inode(inode); > > + } > > + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { > > + if (!d_unhashed(dentry)) > > + d_drop(dentry); > > + dn = d_splice_alias(inode, dentry); > > + WARN_ON_ONCE(dn && dn != dentry); > > + } > > + file->f_mode |= FMODE_CREATED; > > + ret = finish_open(file, dentry, ceph_open); > > + } > > + return ret; > > +} > > + > > /* > > * Do a lookup + open with a single request. 
If we get a non-existent > > * file or symlink, return 1 so the VFS can retry. > > @@ -460,6 +623,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > struct ceph_mds_request *req; > > struct dentry *dn; > > struct ceph_acl_sec_ctx as_ctx = {}; > > + bool try_async = enable_async_dirops; > > int mask; > > int err; > > > > @@ -492,28 +656,41 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > } > > req->r_dentry = dget(dentry); > > req->r_num_caps = 2; > > + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > > + if (ceph_security_xattr_wanted(dir)) > > + mask |= CEPH_CAP_XATTR_SHARED; > > + req->r_args.open.mask = cpu_to_le32(mask); > > + req->r_parent = dir; > > + > > if (flags & O_CREAT) { > > + struct ceph_file_layout lo; > > + > > req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; > > req->r_dentry_unless = CEPH_CAP_FILE_EXCL; > > if (as_ctx.pagelist) { > > req->r_pagelist = as_ctx.pagelist; > > as_ctx.pagelist = NULL; > > } > > + if (try_async && try_prep_async_create(dir, dentry, &lo, > > + &req->r_deleg_ino)) { > > + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); > > + req->r_callback = ceph_async_create_cb; > > + err = ceph_mdsc_submit_request(mdsc, dir, req); > > + if (!err) > > + err = ceph_finish_async_create(dir, dentry, > > + file, mode, req, > > + &as_ctx, &lo); > > + goto out_req; > > + } > > } > > > > - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > > - if (ceph_security_xattr_wanted(dir)) > > - mask |= CEPH_CAP_XATTR_SHARED; > > - req->r_args.open.mask = cpu_to_le32(mask); > > - > > - req->r_parent = dir; > > set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > > err = ceph_mdsc_do_request(mdsc, > > (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, > > req); > > err = ceph_handle_snapdir(req, dentry, err); > > if (err) > > - goto out_req; > > + goto out_fmode; > > > > if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) > > err = ceph_handle_notrace_create(dir, dentry); > > @@ -527,7 +704,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > dn = NULL; > > } > > if (err) > > - goto out_req; > > + goto out_fmode; > > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { > > /* make vfs retry on splice, ENOENT, or symlink */ > > dout("atomic_open finish_no_open on dn %p\n", dn); > > @@ -543,9 +720,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > } > > err = finish_open(file, dentry, ceph_open); > > } > > -out_req: > > +out_fmode: > > if (!req->r_err && req->r_target_inode) > > ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); > > +out_req: > > ceph_mdsc_put_request(req); > > out_ctx: > > ceph_release_acl_sec_ctx(&as_ctx); > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > > index 91d09cf37649..e035c5194005 100644 > > --- a/include/linux/ceph/ceph_fs.h > > +++ b/include/linux/ceph/ceph_fs.h > > @@ -659,6 +659,9 @@ int ceph_flags_to_mode(int flags); > > #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ > > CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ > > CEPH_CAP_PIN) > > +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ > > + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ > > + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) > > > > #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ > > CEPH_LOCK_IXATTR) > >
On 1/16/20 4:59 AM, Jeff Layton wrote: > With the Octopus release, the MDS will hand out directory create caps. > > If we have Fxc caps on the directory, and complete directory information > or a known negative dentry, then we can return without waiting on the > reply, allowing the open() call to return very quickly to userland. > > We use the normal ceph_fill_inode() routine to fill in the inode, so we > have to gin up some reply inode information with what we'd expect the > newly-created inode to have. The client assumes that it has a full set > of caps on the new inode, and that the MDS will revoke them when there > is conflicting access. > > This functionality is gated on the enable_async_dirops module option, > along with async unlinks, and on the server supporting the necessary > CephFS feature bit. > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- > include/linux/ceph/ceph_fs.h | 3 + > 2 files changed, 190 insertions(+), 9 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index b44ccbc85fe4..2742417fa5ec 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) > spin_unlock(&cdst->i_ceph_lock); > } > > +/* > + * Try to set up an async create. We need caps, a file layout, and inode number, > + * and either a lease on the dentry or complete dir info. If any of those > + * criteria are not satisfied, then return false and the caller can go > + * synchronous. > + */ > +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, > + struct ceph_file_layout *lo, > + unsigned long *pino) > +{ > + struct ceph_inode_info *ci = ceph_inode(dir); > + bool ret = false; > + unsigned long ino; > + > + spin_lock(&ci->i_ceph_lock); > + /* No auth cap means no chance for Dc caps */ > + if (!ci->i_auth_cap) > + goto no_async; > + > + /* Any delegated inos? 
*/ > + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) > + goto no_async; > + > + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) > + goto no_async; > + > + /* Use LOOKUP_RCU since we're under i_ceph_lock */ > + if (!__ceph_dir_is_complete(ci) && > + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) > + goto no_async; dentry_lease_is_valid() checks dentry lease. When directory inode has Fsx caps, mds does not issue lease for individual dentry. Check here should be something like dir_lease_is_valid() > + > + if (!(__ceph_caps_issued(ci, NULL) & > + (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE))) > + goto no_async; > + > + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); > + if (!ino) > + goto no_async; > + > + *pino = ino; > + ceph_take_cap_refs(ci, CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE, false); > + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); > + rcu_assign_pointer(lo->pool_ns, > + ceph_try_get_string(ci->i_cached_layout.pool_ns)); > + ret = true; > +no_async: > + spin_unlock(&ci->i_ceph_lock); > + return ret; > +} > + > +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, > + struct ceph_mds_request *req) > +{ > + mapping_set_error(req->r_parent->i_mapping, req->r_err); > + > + if (req->r_target_inode) { > + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); > + u64 ino = ceph_vino(req->r_target_inode).ino; > + > + if (req->r_deleg_ino != ino) > + pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%lx target=0x%llx\n", > + __func__, req->r_err, req->r_deleg_ino, ino); > + mapping_set_error(req->r_target_inode->i_mapping, req->r_err); > + > + spin_lock(&ci->i_ceph_lock); > + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { > + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; > + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); > + } > + spin_unlock(&ci->i_ceph_lock); > + } else { > + pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__, > + req->r_deleg_ino); > + } > + ceph_put_cap_refs(ceph_inode(req->r_parent), > + CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE); > +} > + > +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, > + struct file *file, umode_t mode, > + struct ceph_mds_request *req, > + struct ceph_acl_sec_ctx *as_ctx, > + struct ceph_file_layout *lo) > +{ > + int ret; > + char xattr_buf[4]; > + struct ceph_mds_reply_inode in = { }; > + struct ceph_mds_reply_info_in iinfo = { .in = &in }; > + struct ceph_inode_info *ci = ceph_inode(dir); > + struct inode *inode; > + struct timespec64 now; > + struct ceph_vino vino = { .ino = req->r_deleg_ino, > + .snap = CEPH_NOSNAP }; > + > + ktime_get_real_ts64(&now); > + > + inode = ceph_get_inode(dentry->d_sb, vino); > + if (IS_ERR(inode)) > + return PTR_ERR(inode); > + > + iinfo.inline_version = CEPH_INLINE_NONE; > + iinfo.change_attr = 1; > + ceph_encode_timespec64(&iinfo.btime, &now); > + > + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); > + iinfo.xattr_data = xattr_buf; > + memset(iinfo.xattr_data, 0, iinfo.xattr_len); > + > + in.ino = cpu_to_le64(vino.ino); > + in.snapid = cpu_to_le64(CEPH_NOSNAP); > + in.version = cpu_to_le64(1); // ??? 
> + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); > + in.cap.cap_id = cpu_to_le64(1); > + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); > + in.cap.flags = CEPH_CAP_FLAG_AUTH; > + in.ctime = in.mtime = in.atime = iinfo.btime; > + in.mode = cpu_to_le32((u32)mode); > + in.truncate_seq = cpu_to_le32(1); > + in.truncate_size = cpu_to_le64(-1ULL); > + in.xattr_version = cpu_to_le64(1); > + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); > + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? > + dir->i_gid : current_fsgid())); > + in.nlink = cpu_to_le32(1); > + in.max_size = cpu_to_le64(lo->stripe_unit); > + > + ceph_file_layout_to_legacy(lo, &in.layout); > + > + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, > + req->r_fmode, NULL); > + if (ret) { > + dout("%s failed to fill inode: %d\n", __func__, ret); > + if (inode->i_state & I_NEW) > + discard_new_inode(inode); > + } else { > + struct dentry *dn; > + > + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, > + vino.ino, dir->i_ino, dentry->d_name.name); > + ceph_dir_clear_ordered(dir); > + ceph_init_inode_acls(inode, as_ctx); > + if (inode->i_state & I_NEW) { > + /* > + * If it's not I_NEW, then someone created this before > + * we got here. Assume the server is aware of it at > + * that point and don't worry about setting > + * CEPH_I_ASYNC_CREATE. > + */ > + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; > + unlock_new_inode(inode); > + } > + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { > + if (!d_unhashed(dentry)) > + d_drop(dentry); > + dn = d_splice_alias(inode, dentry); > + WARN_ON_ONCE(dn && dn != dentry); > + } > + file->f_mode |= FMODE_CREATED; > + ret = finish_open(file, dentry, ceph_open); > + } > + return ret; > +} > + > /* > * Do a lookup + open with a single request. If we get a non-existent > * file or symlink, return 1 so the VFS can retry. 
> @@ -460,6 +623,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > struct ceph_mds_request *req; > struct dentry *dn; > struct ceph_acl_sec_ctx as_ctx = {}; > + bool try_async = enable_async_dirops; > int mask; > int err; > > @@ -492,28 +656,41 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > } > req->r_dentry = dget(dentry); > req->r_num_caps = 2; > + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > + if (ceph_security_xattr_wanted(dir)) > + mask |= CEPH_CAP_XATTR_SHARED; > + req->r_args.open.mask = cpu_to_le32(mask); > + req->r_parent = dir; > + > if (flags & O_CREAT) { > + struct ceph_file_layout lo; > + > req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; > req->r_dentry_unless = CEPH_CAP_FILE_EXCL; > if (as_ctx.pagelist) { > req->r_pagelist = as_ctx.pagelist; > as_ctx.pagelist = NULL; > } > + if (try_async && try_prep_async_create(dir, dentry, &lo, > + &req->r_deleg_ino)) { > + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); > + req->r_callback = ceph_async_create_cb; > + err = ceph_mdsc_submit_request(mdsc, dir, req); > + if (!err) > + err = ceph_finish_async_create(dir, dentry, > + file, mode, req, > + &as_ctx, &lo); > + goto out_req; > + } > } > > - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > - if (ceph_security_xattr_wanted(dir)) > - mask |= CEPH_CAP_XATTR_SHARED; > - req->r_args.open.mask = cpu_to_le32(mask); > - > - req->r_parent = dir; > set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > err = ceph_mdsc_do_request(mdsc, > (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, > req); > err = ceph_handle_snapdir(req, dentry, err); > if (err) > - goto out_req; > + goto out_fmode; > > if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) > err = ceph_handle_notrace_create(dir, dentry); > @@ -527,7 +704,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > dn = NULL; > } > if (err) > - goto out_req; > + goto out_fmode; > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { > /* make vfs retry on splice, ENOENT, or symlink */ > dout("atomic_open finish_no_open on dn %p\n", dn); > @@ -543,9 +720,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > } > err = finish_open(file, dentry, ceph_open); > } > -out_req: > +out_fmode: > if (!req->r_err && req->r_target_inode) > ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); > +out_req: > ceph_mdsc_put_request(req); > out_ctx: > ceph_release_acl_sec_ctx(&as_ctx); > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > index 91d09cf37649..e035c5194005 100644 > --- a/include/linux/ceph/ceph_fs.h > +++ b/include/linux/ceph/ceph_fs.h > @@ -659,6 +659,9 @@ int ceph_flags_to_mode(int flags); > #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ > CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ > CEPH_CAP_PIN) > +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ > + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ > + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) > > #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ > CEPH_LOCK_IXATTR) >
On Fri, 2020-01-17 at 21:28 +0800, Yan, Zheng wrote: > On 1/16/20 4:59 AM, Jeff Layton wrote: > > With the Octopus release, the MDS will hand out directory create caps. > > > > If we have Fxc caps on the directory, and complete directory information > > or a known negative dentry, then we can return without waiting on the > > reply, allowing the open() call to return very quickly to userland. > > > > We use the normal ceph_fill_inode() routine to fill in the inode, so we > > have to gin up some reply inode information with what we'd expect the > > newly-created inode to have. The client assumes that it has a full set > > of caps on the new inode, and that the MDS will revoke them when there > > is conflicting access. > > > > This functionality is gated on the enable_async_dirops module option, > > along with async unlinks, and on the server supporting the necessary > > CephFS feature bit. > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- > > include/linux/ceph/ceph_fs.h | 3 + > > 2 files changed, 190 insertions(+), 9 deletions(-) > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > index b44ccbc85fe4..2742417fa5ec 100644 > > --- a/fs/ceph/file.c > > +++ b/fs/ceph/file.c > > @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) > > spin_unlock(&cdst->i_ceph_lock); > > } > > > > +/* > > + * Try to set up an async create. We need caps, a file layout, and inode number, > > + * and either a lease on the dentry or complete dir info. If any of those > > + * criteria are not satisfied, then return false and the caller can go > > + * synchronous. 
> > + */ > > +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, > > + struct ceph_file_layout *lo, > > + unsigned long *pino) > > +{ > > + struct ceph_inode_info *ci = ceph_inode(dir); > > + bool ret = false; > > + unsigned long ino; > > + > > + spin_lock(&ci->i_ceph_lock); > > + /* No auth cap means no chance for Dc caps */ > > + if (!ci->i_auth_cap) > > + goto no_async; > > + > > + /* Any delegated inos? */ > > + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) > > + goto no_async; > > + > > + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) > > + goto no_async; > > + > > + /* Use LOOKUP_RCU since we're under i_ceph_lock */ > > + if (!__ceph_dir_is_complete(ci) && > > + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) > > + goto no_async; > > dentry_lease_is_valid() checks dentry lease. When directory inode has > Fsx caps, mds does not issue lease for individual dentry. Check here > should be something like dir_lease_is_valid() Ok, I think I get it. The catch here is that we're calling this from atomic_open, so we may be dealing with a dentry that is brand new and has never had a lookup. I think we have to handle those two cases differently. 
This is what I'm thinking: --- fs/ceph/file.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7b14dba92266..a3eb38fac68a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -459,6 +459,7 @@ static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, unsigned long *pino) { struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); bool ret = false; unsigned long ino; @@ -474,16 +475,19 @@ static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) goto no_async; - /* Use LOOKUP_RCU since we're under i_ceph_lock */ - if (!__ceph_dir_is_complete(ci) && - !dentry_lease_is_valid(dentry, LOOKUP_RCU)) - goto no_async; - if ((__ceph_caps_issued(ci, NULL) & (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) != (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) goto no_async; + if (d_in_lookup(dentry)) { + if (!__ceph_dir_is_complete(ci)) + goto no_async; + } else if (atomic_read(&ci->i_shared_gen) != + READ_ONCE(di->lease_shared_gen)) { + goto no_async; + } + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); if (!ino) goto no_async;
On 1/18/20 1:40 AM, Jeff Layton wrote: > On Fri, 2020-01-17 at 21:28 +0800, Yan, Zheng wrote: >> On 1/16/20 4:59 AM, Jeff Layton wrote: >>> With the Octopus release, the MDS will hand out directory create caps. >>> >>> If we have Fxc caps on the directory, and complete directory information >>> or a known negative dentry, then we can return without waiting on the >>> reply, allowing the open() call to return very quickly to userland. >>> >>> We use the normal ceph_fill_inode() routine to fill in the inode, so we >>> have to gin up some reply inode information with what we'd expect the >>> newly-created inode to have. The client assumes that it has a full set >>> of caps on the new inode, and that the MDS will revoke them when there >>> is conflicting access. >>> >>> This functionality is gated on the enable_async_dirops module option, >>> along with async unlinks, and on the server supporting the necessary >>> CephFS feature bit. >>> >>> Signed-off-by: Jeff Layton <jlayton@kernel.org> >>> --- >>> fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- >>> include/linux/ceph/ceph_fs.h | 3 + >>> 2 files changed, 190 insertions(+), 9 deletions(-) >>> >>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >>> index b44ccbc85fe4..2742417fa5ec 100644 >>> --- a/fs/ceph/file.c >>> +++ b/fs/ceph/file.c >>> @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) >>> spin_unlock(&cdst->i_ceph_lock); >>> } >>> >>> +/* >>> + * Try to set up an async create. We need caps, a file layout, and inode number, >>> + * and either a lease on the dentry or complete dir info. If any of those >>> + * criteria are not satisfied, then return false and the caller can go >>> + * synchronous. 
>>> + */ >>> +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, >>> + struct ceph_file_layout *lo, >>> + unsigned long *pino) >>> +{ >>> + struct ceph_inode_info *ci = ceph_inode(dir); >>> + bool ret = false; >>> + unsigned long ino; >>> + >>> + spin_lock(&ci->i_ceph_lock); >>> + /* No auth cap means no chance for Dc caps */ >>> + if (!ci->i_auth_cap) >>> + goto no_async; >>> + >>> + /* Any delegated inos? */ >>> + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) >>> + goto no_async; >>> + >>> + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) >>> + goto no_async; >>> + >>> + /* Use LOOKUP_RCU since we're under i_ceph_lock */ >>> + if (!__ceph_dir_is_complete(ci) && >>> + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) >>> + goto no_async; >> >> dentry_lease_is_valid() checks dentry lease. When directory inode has >> Fsx caps, mds does not issue lease for individual dentry. Check here >> should be something like dir_lease_is_valid() > > Ok, I think I get it. The catch here is that we're calling this from > atomic_open, so we may be dealing with a dentry that is brand new and > has never had a lookup. I think we have to handle those two cases > differently. 
> > This is what I'm thinking: > > --- > fs/ceph/file.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 7b14dba92266..a3eb38fac68a 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -459,6 +459,7 @@ static bool try_prep_async_create(struct inode *dir, > struct dentry *dentry, > unsigned long *pino) > { > struct ceph_inode_info *ci = ceph_inode(dir); > + struct ceph_dentry_info *di = ceph_dentry(dentry); > bool ret = false; > unsigned long ino; > > @@ -474,16 +475,19 @@ static bool try_prep_async_create(struct inode > *dir, struct dentry *dentry, > if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) > goto no_async; > > - /* Use LOOKUP_RCU since we're under i_ceph_lock */ > - if (!__ceph_dir_is_complete(ci) && > - !dentry_lease_is_valid(dentry, LOOKUP_RCU)) > - goto no_async; > - > if ((__ceph_caps_issued(ci, NULL) & > (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) != > (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)) > goto no_async; > > + if (d_in_lookup(dentry)) { > + if (!__ceph_dir_is_complete(ci)) > + goto no_async; > + } else if (atomic_read(&ci->i_shared_gen) != > + READ_ONCE(di->lease_shared_gen)) { > + goto no_async; > Makes sense > ino = ceph_get_deleg_ino(ci->i_auth_cap->session); > if (!ino) > goto no_async; >
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b44ccbc85fe4..2742417fa5ec 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src) spin_unlock(&cdst->i_ceph_lock); } +/* + * Try to set up an async create. We need caps, a file layout, and inode number, + * and either a lease on the dentry or complete dir info. If any of those + * criteria are not satisfied, then return false and the caller can go + * synchronous. + */ +static bool try_prep_async_create(struct inode *dir, struct dentry *dentry, + struct ceph_file_layout *lo, + unsigned long *pino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + bool ret = false; + unsigned long ino; + + spin_lock(&ci->i_ceph_lock); + /* No auth cap means no chance for Dc caps */ + if (!ci->i_auth_cap) + goto no_async; + + /* Any delegated inos? */ + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) + goto no_async; + + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) + goto no_async; + + /* Use LOOKUP_RCU since we're under i_ceph_lock */ + if (!__ceph_dir_is_complete(ci) && + !dentry_lease_is_valid(dentry, LOOKUP_RCU)) + goto no_async; + + if (!(__ceph_caps_issued(ci, NULL) & + (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE))) + goto no_async; + + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); + if (!ino) + goto no_async; + + *pino = ino; + ceph_take_cap_refs(ci, CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE, false); + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); + rcu_assign_pointer(lo->pool_ns, + ceph_try_get_string(ci->i_cached_layout.pool_ns)); + ret = true; +no_async: + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + mapping_set_error(req->r_parent->i_mapping, req->r_err); + + if (req->r_target_inode) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + u64 ino = ceph_vino(req->r_target_inode).ino; + + if (req->r_deleg_ino != 
ino) + pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%lx target=0x%llx\n", + __func__, req->r_err, req->r_deleg_ino, ino); + mapping_set_error(req->r_target_inode->i_mapping, req->r_err); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + spin_unlock(&ci->i_ceph_lock); + } else { + pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__, + req->r_deleg_ino); + } + ceph_put_cap_refs(ceph_inode(req->r_parent), + CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE); +} + +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, + struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx, + struct ceph_file_layout *lo) +{ + int ret; + char xattr_buf[4]; + struct ceph_mds_reply_inode in = { }; + struct ceph_mds_reply_info_in iinfo = { .in = &in }; + struct ceph_inode_info *ci = ceph_inode(dir); + struct inode *inode; + struct timespec64 now; + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + + ktime_get_real_ts64(&now); + + inode = ceph_get_inode(dentry->d_sb, vino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo.inline_version = CEPH_INLINE_NONE; + iinfo.change_attr = 1; + ceph_encode_timespec64(&iinfo.btime, &now); + + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + + in.ino = cpu_to_le64(vino.ino); + in.snapid = cpu_to_le64(CEPH_NOSNAP); + in.version = cpu_to_le64(1); // ??? 
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); + in.cap.cap_id = cpu_to_le64(1); + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); + in.cap.flags = CEPH_CAP_FLAG_AUTH; + in.ctime = in.mtime = in.atime = iinfo.btime; + in.mode = cpu_to_le32((u32)mode); + in.truncate_seq = cpu_to_le32(1); + in.truncate_size = cpu_to_le64(-1ULL); + in.xattr_version = cpu_to_le64(1); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? + dir->i_gid : current_fsgid())); + in.nlink = cpu_to_le32(1); + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); + + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, + req->r_fmode, NULL); + if (ret) { + dout("%s failed to fill inode: %d\n", __func__, ret); + if (inode->i_state & I_NEW) + discard_new_inode(inode); + } else { + struct dentry *dn; + + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, + vino.ino, dir->i_ino, dentry->d_name.name); + ceph_dir_clear_ordered(dir); + ceph_init_inode_acls(inode, as_ctx); + if (inode->i_state & I_NEW) { + /* + * If it's not I_NEW, then someone created this before + * we got here. Assume the server is aware of it at + * that point and don't worry about setting + * CEPH_I_ASYNC_CREATE. + */ + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; + unlock_new_inode(inode); + } + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { + if (!d_unhashed(dentry)) + d_drop(dentry); + dn = d_splice_alias(inode, dentry); + WARN_ON_ONCE(dn && dn != dentry); + } + file->f_mode |= FMODE_CREATED; + ret = finish_open(file, dentry, ceph_open); + } + return ret; +} + /* * Do a lookup + open with a single request. If we get a non-existent * file or symlink, return 1 so the VFS can retry. 
@@ -460,6 +623,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_request *req; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; + bool try_async = enable_async_dirops; int mask; int err; @@ -492,28 +656,41 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_parent = dir; + if (flags & O_CREAT) { + struct ceph_file_layout lo; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; as_ctx.pagelist = NULL; } + if (try_async && try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino)) { + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_callback = ceph_async_create_cb; + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) + err = ceph_finish_async_create(dir, dentry, + file, mode, req, + &as_ctx, &lo); + goto out_req; + } } - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; - if (ceph_security_xattr_wanted(dir)) - mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.open.mask = cpu_to_le32(mask); - - req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, req); err = ceph_handle_snapdir(req, dentry, err); if (err) - goto out_req; + goto out_fmode; if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -527,7 +704,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, dn = NULL; } if (err) - goto out_req; + goto out_fmode; if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); @@ -543,9 +720,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } err = finish_open(file, dentry, ceph_open); } -out_req: +out_fmode: if (!req->r_err && req->r_target_inode) ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); +out_req: ceph_mdsc_put_request(req); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 91d09cf37649..e035c5194005 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -659,6 +659,9 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ CEPH_CAP_PIN) +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ CEPH_LOCK_IXATTR)
With the Octopus release, the MDS will hand out directory create caps. If we have Fxc caps on the directory, and complete directory information or a known negative dentry, then we can return without waiting on the reply, allowing the open() call to return very quickly to userland. We use the normal ceph_fill_inode() routine to fill in the inode, so we have to gin up some reply inode information with what we'd expect the newly-created inode to have. The client assumes that it has a full set of caps on the new inode, and that the MDS will revoke them when there is conflicting access. This functionality is gated on the enable_async_dirops module option, along with async unlinks, and on the server supporting the necessary CephFS feature bit. Signed-off-by: Jeff Layton <jlayton@kernel.org> --- fs/ceph/file.c | 196 +++++++++++++++++++++++++++++++++-- include/linux/ceph/ceph_fs.h | 3 + 2 files changed, 190 insertions(+), 9 deletions(-)