diff mbox series

[1/2] ceph: issue getattr/lookup reqs to MDSes in an aggregative pattern

Message ID 20181028135742.24668-1-xxhdx1985126@gmail.com (mailing list archive)
State New, archived
Headers show
Series [1/2] ceph: issue getattr/lookup reqs to MDSes in an aggregative pattern | expand

Commit Message

Xuehan Xu Oct. 28, 2018, 1:57 p.m. UTC
From: Xuehan Xu <xuxuehan@360.cn>

Instead of issuing a new getattr/lookup req to the MDSes for each getattr/lookup
op, issue a new one only if there is no inflight req that requires the same caps
as the current getattr/lookup op.

Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
---
 fs/ceph/dir.c        | 99 ++++++++++++++++++++++++++++++--------------
 fs/ceph/inode.c      | 48 ++++++++++++++++-----
 fs/ceph/mds_client.c | 23 +++++++++-
 fs/ceph/mds_client.h |  5 ++-
 fs/ceph/super.c      | 68 ++++++++++++++++++++++++++++++
 fs/ceph/super.h      | 13 ++++++
 6 files changed, 211 insertions(+), 45 deletions(-)
diff mbox series

Patch

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 036ac0f3a393..fa4911bd5576 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -731,7 +731,7 @@  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct ceph_mds_request *req;
+	struct ceph_mds_request *req = NULL;
 	int op;
 	int mask;
 	int err;
@@ -765,6 +765,10 @@  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		spin_unlock(&ci->i_ceph_lock);
 	}
 
+	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+	if (ceph_security_xattr_wanted(dir))
+		mask |= CEPH_CAP_XATTR_SHARED;
+
 	op = ceph_snap(dir) == CEPH_SNAPDIR ?
 		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
 	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -772,12 +776,9 @@  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_CAST(req);
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
-
-	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
-	if (ceph_security_xattr_wanted(dir))
-		mask |= CEPH_CAP_XATTR_SHARED;
+	
 	req->r_args.getattr.mask = cpu_to_le32(mask);
-
+	
 	req->r_parent = dir;
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -1176,6 +1177,7 @@  static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 				}
 			}
 		}
+		dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
 	}
 	spin_unlock(&dentry->d_lock);
 
@@ -1184,7 +1186,7 @@  static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 					 CEPH_MDS_LEASE_RENEW, seq);
 		ceph_put_mds_session(session);
 	}
-	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
 	return valid;
 }
 
@@ -1252,46 +1254,79 @@  static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (!valid) {
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(dir->i_sb)->mdsc;
-		struct ceph_mds_request *req;
+		struct ceph_mds_request *req = NULL;
+		struct ceph_inode_info* cdir = ceph_inode(dir);
 		int op, err;
 		u32 mask;
 
 		if (flags & LOOKUP_RCU)
 			return -ECHILD;
+		mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+		if (ceph_security_xattr_wanted(dir))
+			mask |= CEPH_CAP_XATTR_SHARED;
 
 		op = ceph_snap(dir) == CEPH_SNAPDIR ?
 			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
-		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
-		if (!IS_ERR(req)) {
-			req->r_dentry = dget(dentry);
-			req->r_num_caps = 2;
-			req->r_parent = dir;
+		if (op == CEPH_MDS_OP_LOOKUP) {
+			mutex_lock(&cdir->lookups_inflight_lock);
+			dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n", 
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
+			req = __search_inode_getattr_or_lookup(&cdir->lookups_inflight, mask, true);
+		}
+		if (req && op == CEPH_MDS_OP_LOOKUP) {
+			dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
+			ceph_mdsc_get_request(req);
+			mutex_unlock(&cdir->lookups_inflight_lock);
+			err = ceph_mdsc_wait_for_request(req);
+			dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
+		} else {
 
-			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
-			if (ceph_security_xattr_wanted(dir))
-				mask |= CEPH_CAP_XATTR_SHARED;
-			req->r_args.getattr.mask = cpu_to_le32(mask);
+			req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+			if (op == CEPH_MDS_OP_LOOKUP) {
+				if (!IS_ERR(req)) {
+					req->r_dentry = dget(dentry);
+					req->r_num_caps = 2;
+					req->r_parent = dir;
+					req->r_args.getattr.mask = cpu_to_le32(mask);
+					__register_inode_getattr_or_lookup(cdir, req, true);
+					dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
+							dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
+				}
+				mutex_unlock(&cdir->lookups_inflight_lock);
+			}
+			if (IS_ERR(req))
+				goto out;
 
 			err = ceph_mdsc_do_request(mdsc, NULL, req);
-			switch (err) {
-			case 0:
-				if (d_really_is_positive(dentry) &&
-				    d_inode(dentry) == req->r_target_inode)
-					valid = 1;
-				break;
-			case -ENOENT:
-				if (d_really_is_negative(dentry))
-					valid = 1;
-				/* Fallthrough */
-			default:
-				break;
+			if (op == CEPH_MDS_OP_LOOKUP) {
+				mutex_lock(&cdir->lookups_inflight_lock);
+				__unregister_inode_getattr_or_lookup(cdir, req, true);
+				dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", 
+						dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
+				mutex_unlock(&cdir->lookups_inflight_lock);
 			}
-			ceph_mdsc_put_request(req);
-			dout("d_revalidate %p lookup result=%d\n",
-			     dentry, err);
 		}
+		switch (err) {
+		case 0:
+			if (d_really_is_positive(dentry) &&
+			    d_inode(dentry) == req->r_target_inode)
+				valid = 1;
+			break;
+		case -ENOENT:
+			if (d_really_is_negative(dentry))
+				valid = 1;
+			/* Fallthrough */
+		default:
+			break;
+		}
+		ceph_mdsc_put_request(req);
+		dout("d_revalidate %p lookup result=%d\n",
+   				dentry, err);
 	}
 
+out:
 	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
 	if (valid) {
 		ceph_dentry_lru_touch(dentry);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a866be999216..c51e2f186139 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -430,6 +430,8 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	dout("alloc_inode %p\n", &ci->vfs_inode);
 
 	spin_lock_init(&ci->i_ceph_lock);
+	mutex_init(&ci->getattrs_inflight_lock);
+	mutex_init(&ci->lookups_inflight_lock);
 
 	ci->i_version = 0;
 	ci->i_inline_version = 0;
@@ -461,6 +463,8 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_xattrs.index_version = 0;
 
 	ci->i_caps = RB_ROOT;
+	ci->getattrs_inflight = RB_ROOT;
+	ci->lookups_inflight = RB_ROOT;
 	ci->i_auth_cap = NULL;
 	ci->i_dirty_caps = 0;
 	ci->i_flushing_caps = 0;
@@ -1047,9 +1051,10 @@  static void update_dentry_lease(struct dentry *dentry,
 	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
 	 * we expect a negative dentry.
 	 */
+	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
 	if (!tgt_vino && d_really_is_positive(dentry))
 		return;
-
+	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
 	if (tgt_vino && (d_really_is_negative(dentry) ||
 			!ceph_ino_compare(d_inode(dentry), tgt_vino)))
 		return;
@@ -2194,6 +2199,7 @@  int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	struct ceph_mds_request *req;
 	int mode;
 	int err;
+	struct ceph_inode_info* cinode = ceph_inode(inode);
 
 	if (ceph_snap(inode) == CEPH_SNAPDIR) {
 		dout("do_getattr inode %p SNAPDIR\n", inode);
@@ -2205,16 +2211,36 @@  int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
 		return 0;
 
-	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	req->r_inode = inode;
-	ihold(inode);
-	req->r_num_caps = 1;
-	req->r_args.getattr.mask = cpu_to_le32(mask);
-	req->r_locked_page = locked_page;
-	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	mutex_lock(&cinode->getattrs_inflight_lock);
+	dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
+	req = __search_inode_getattr_or_lookup(&cinode->getattrs_inflight, mask, false);
+	if (req) {
+		dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
+		ceph_mdsc_get_request(req);
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+		err = ceph_mdsc_wait_for_request(req);
+		dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
+	} else {
+		mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+		if (!IS_ERR(req)) {
+			req->r_inode = inode;
+			ihold(inode);
+			req->r_num_caps = 1;
+			req->r_args.getattr.mask = cpu_to_le32(mask);
+			req->r_locked_page = locked_page;
+			__register_inode_getattr_or_lookup(cinode, req, false);
+			dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
+		}
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		mutex_lock(&cinode->getattrs_inflight_lock);
+		__unregister_inode_getattr_or_lookup(cinode, req, false);
+		dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+	}
 	if (locked_page && err == 0) {
 		u64 inline_version = req->r_reply_info.targeti.inline_version;
 		if (inline_version == 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dc8bc664a871..4412ee13164e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1792,7 +1792,10 @@  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	req->r_fmode = -1;
 	kref_init(&req->r_kref);
 	RB_CLEAR_NODE(&req->r_node);
+	RB_CLEAR_NODE(&req->getattr_node);
+	RB_CLEAR_NODE(&req->lookup_node);
 	INIT_LIST_HEAD(&req->r_wait);
+	init_completion(&req->batch_op_completion);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -2386,6 +2389,23 @@  void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 	mutex_unlock(&mdsc->mutex);
 }
 
+/* Wait for an aggregated request; return its batch_op_err, or the wait error. */
+int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
+{
+	long timeleft = wait_for_completion_killable_timeout(
+			&req->batch_op_completion,
+			ceph_timeout_jiffies(req->r_timeout));
+
+	if (!timeleft)
+		return -EIO;  /* timed out */
+	if (timeleft < 0)
+		return timeleft;  /* killed */
+	/*
+	 * Completed: batch_op_err was stored before complete_all().
+	 */
+	return req->batch_op_err;
+}
+
 /*
  * Synchrously perform an mds request.  Take care of all of the
  * session setup, forwarding, retry details.
@@ -2458,7 +2478,8 @@  int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	} else {
 		err = req->r_err;
 	}
-
+	req->batch_op_err = err;
+	complete_all(&req->batch_op_completion);
 out:
 	mutex_unlock(&mdsc->mutex);
 	dout("do_request %p done, result %d\n", req, err);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2ec3b5b35067..830c97e1bcf0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -199,6 +199,7 @@  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
 struct ceph_mds_request {
 	u64 r_tid;                   /* transaction id */
 	struct rb_node r_node;
+	struct rb_node getattr_node, lookup_node;
 	struct ceph_mds_client *r_mdsc;
 
 	int r_op;                    /* mds op code */
@@ -250,7 +251,7 @@  struct ceph_mds_request {
 	struct ceph_msg  *r_reply;
 	struct ceph_mds_reply_info_parsed r_reply_info;
 	struct page *r_locked_page;
-	int r_err;
+	int r_err, batch_op_err;
 
 	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
@@ -273,6 +274,7 @@  struct ceph_mds_request {
 
 	struct kref       r_kref;
 	struct list_head  r_wait;
+	struct completion batch_op_completion;
 	struct completion r_completion;
 	struct completion r_safe_completion;
 	ceph_mds_request_callback_t r_callback;
@@ -411,6 +413,7 @@  extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 				     struct ceph_mds_request *req);
+extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 				struct inode *dir,
 				struct ceph_mds_request *req);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 95a3b3ac9b6e..021fb7c1072c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1158,6 +1158,74 @@  static void __exit exit_ceph(void)
 	destroy_caches();
 }
 
+/* Erase @req from one of @ci's in-flight trees: the lookup tree when */
+/* @is_lookup is true, the getattr tree otherwise. */
+void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+					  struct ceph_mds_request* req,
+					  bool is_lookup)
+{
+	rb_erase(is_lookup ? &req->lookup_node : &req->getattr_node,
+		 is_lookup ? &ci->lookups_inflight : &ci->getattrs_inflight);
+}
+
+/*
+ * Insert @req into @ci's in-flight getattr tree (or lookup tree when
+ * @is_lookup), keyed by req->r_args.getattr.mask.  Callers in this patch
+ * hold the matching *_inflight_lock; a duplicate mask hits BUG().
+ */
+void __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+					struct ceph_mds_request* req,
+					bool is_lookup)
+{
+	struct rb_root *root = is_lookup ? &ci->lookups_inflight :
+					   &ci->getattrs_inflight;
+	struct rb_node *node = is_lookup ? &req->lookup_node :
+					   &req->getattr_node;
+	struct rb_node **p = &root->rb_node, *parent = NULL;
+	struct ceph_mds_request *tmp;
+
+	while (*p) {
+		parent = *p;
+		if (is_lookup)
+			tmp = rb_entry(parent, struct ceph_mds_request, lookup_node);
+		else
+			tmp = rb_entry(parent, struct ceph_mds_request, getattr_node);
+		if (req->r_args.getattr.mask < tmp->r_args.getattr.mask)
+			p = &(*p)->rb_left;
+		else if (req->r_args.getattr.mask > tmp->r_args.getattr.mask)
+			p = &(*p)->rb_right;
+		else
+			BUG();  /* same mask already registered */
+	}
+
+	/* link and rebalance in the same tree that was walked above */
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+}
+
+/*
+ * Return the in-flight request in @root whose getattr mask equals @mask, or
+ * NULL.  Nodes are keyed by the stored little-endian mask, so convert @mask.
+ */
+struct ceph_mds_request* __search_inode_getattr_or_lookup(struct rb_root* root,
+				      int mask,
+				      bool is_lookup)
+{
+	struct rb_node *node = root->rb_node;  /* top of the tree */
+	__le32 key = cpu_to_le32(mask);
+
+	while (node) {
+		struct ceph_mds_request *tmp = is_lookup ?
+			rb_entry(node, struct ceph_mds_request, lookup_node) :
+			rb_entry(node, struct ceph_mds_request, getattr_node);
+
+		if (tmp->r_args.getattr.mask == key)
+			return tmp;  /* Found it */
+		node = tmp->r_args.getattr.mask > key ?
+			node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
 module_init(init_ceph);
 module_exit(exit_ceph);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a7077a0c989f..d39234049e88 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -292,6 +292,8 @@  struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
 
 	spinlock_t i_ceph_lock;
+	struct mutex getattrs_inflight_lock, lookups_inflight_lock;
+	struct rb_root getattrs_inflight, lookups_inflight;
 
 	u64 i_version;
 	u64 i_inline_version;
@@ -859,6 +861,17 @@  extern int ceph_fill_file_size(struct inode *inode, int issued,
 extern void ceph_fill_file_time(struct inode *inode, int issued,
 				u64 time_warp_seq, struct timespec *ctime,
 				struct timespec *mtime, struct timespec *atime);
+extern void __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+					       struct ceph_mds_request* req,
+					       bool is_lookup);
+
+extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+						 struct ceph_mds_request* req,
+						 bool is_lookup);
+
+extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct rb_root* root,
+					     int mask,
+					     bool is_lookup);
 extern int ceph_fill_trace(struct super_block *sb,
 			   struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,