[v2] ceph: check availability of mds cluster on mount after wait timeout
diff mbox series

Message ID 20191127083508.12102-1-xiubli@redhat.com
State New
Headers show
Series
  • [v2] ceph: check availability of mds cluster on mount after wait timeout
Related show

Commit Message

Xiubo Li Nov. 27, 2019, 8:35 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

If all the MDS daemons are down for some reason when the first mount
is attempted, the mount will fail with an IO error after the mount
request times out.

Or, if the cluster suddenly becomes laggy and the mount request is
fired off just before the kclient gets the new mdsmap, the mount will
also fail with an IO error.

This adds a useful hint message by checking the cluster state before
failing the mount operation.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 4 ++--
 fs/ceph/super.c      | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

Comments

Xiubo Li Dec. 10, 2019, 5:14 a.m. UTC | #1
I checked the new mount API; this patch is still needed to do the check.

The following is a simple V3 patch. It returns -ESTALE to userland if
the cluster is laggy or no MDS is up; mount.ceph can then check for that
error and print a hint such as "cluster is laggy or no MDS is up".
Does that make sense?

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7d3ec051f179..1065190e00df 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2577,7 +2577,6 @@ static void __do_request(struct ceph_mds_client *mdsc,
                       CEPH_MOUNT_OPT_MOUNTWAIT) &&
!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
                         err = -ENOENT;
-                       pr_info("probably no mds server is up\n");
                         goto finish;
                 }
         }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c9a7c68eea3..da3aee796c17 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1068,6 +1068,11 @@ static int ceph_get_tree(struct fs_context *fc)
         return 0;

  out_splat:
+       if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+               pr_info("No mds server is up or the cluster is laggy\n");
+               err = -ESTALE;
+       }
+
         ceph_mdsc_close_sessions(fsc->mdsc);
         deactivate_locked_super(sb);
         goto out_final;




BRs

On 2019/11/27 16:35, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> If all the MDS daemons are down for some reasons and for the first
> time to do the mount, it will fail with IO error after the mount
> request timed out.
>
> Or if the cluster becomes laggy suddenly, and just before the kclient
> getting the new mdsmap and the mount request is fired off, it also
> will fail with IO error.
>
> This will add some useful hint message by checking the cluster state
> before the fail the mount operation.
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>   fs/ceph/mds_client.c | 4 ++--
>   fs/ceph/super.c      | 4 ++++
>   2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 109ec7e2ee7b..163b470f3000 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2556,7 +2556,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>   		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
>   		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
>   			err = -ENOENT;
> -			pr_info("probably no mds server is up\n");
> +			pr_info("No mds server is up or the cluster is laggy\n");
>   			goto finish;
>   		}
>   	}
> @@ -2706,7 +2706,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
>   		if (timeleft > 0)
>   			err = 0;
>   		else if (!timeleft)
> -			err = -EIO;  /* timed out */
> +			err = -ETIMEDOUT;  /* timed out */
>   		else
>   			err = timeleft;  /* killed */
>   	}
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index af2754b80b7c..39810677e601 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -1137,6 +1137,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
>   	return res;
>   
>   out_splat:
> +	if (PTR_ERR(res) == -ETIMEDOUT &&
> +	    !ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap))
> +		pr_info("No mds server is up or the cluster is laggy\n");
> +
>   	ceph_mdsc_close_sessions(fsc->mdsc);
>   	deactivate_locked_super(sb);
>   	goto out_final;

Patch
diff mbox series

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 109ec7e2ee7b..163b470f3000 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2556,7 +2556,7 @@  static void __do_request(struct ceph_mds_client *mdsc,
 		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
 		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
 			err = -ENOENT;
-			pr_info("probably no mds server is up\n");
+			pr_info("No mds server is up or the cluster is laggy\n");
 			goto finish;
 		}
 	}
@@ -2706,7 +2706,7 @@  static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
 		if (timeleft > 0)
 			err = 0;
 		else if (!timeleft)
-			err = -EIO;  /* timed out */
+			err = -ETIMEDOUT;  /* timed out */
 		else
 			err = timeleft;  /* killed */
 	}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index af2754b80b7c..39810677e601 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1137,6 +1137,10 @@  static struct dentry *ceph_mount(struct file_system_type *fs_type,
 	return res;
 
 out_splat:
+	if (PTR_ERR(res) == -ETIMEDOUT &&
+	    !ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap))
+		pr_info("No mds server is up or the cluster is laggy\n");
+
 	ceph_mdsc_close_sessions(fsc->mdsc);
 	deactivate_locked_super(sb);
 	goto out_final;