Message ID | 1568083391-920-1-git-send-email-simon29rock@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: add mount opt, always_auth | expand |
On Mon, 2019-09-09 at 22:43 -0400, simon gao wrote: > In larger clusters (hundreds of millions of files). We have to pin the > directory on a fixed mds now. Some op of client use USE_ANY_MDS mode > to access mds, which may result in requests being sent to noauth mds > and then forwarded to authmds. > the opt is used to reduce forward ops by sending req to auth mds. > --- > fs/ceph/mds_client.c | 7 ++++++- > fs/ceph/super.c | 7 +++++++ > fs/ceph/super.h | 1 + > 3 files changed, 14 insertions(+), 1 deletion(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 920e9f0..aca4490 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -878,6 +878,7 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) > static int __choose_mds(struct ceph_mds_client *mdsc, > struct ceph_mds_request *req) > { > + struct ceph_mount_options *ma = mdsc->fsc->mount_options; > struct inode *inode; > struct ceph_inode_info *ci; > struct ceph_cap *cap; > @@ -900,7 +901,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > > if (mode == USE_RANDOM_MDS) > goto random; > - > + // force to send the req to auth mds > + if (ma->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH && mode != USE_AUTH_MDS){ > + dout("change mode %d => USE_AUTH_MDS", mode); > + mode = USE_AUTH_MDS; > + } > inode = NULL; > if (req->r_inode) { > if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index ab4868c..1e81ebc 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -169,6 +169,7 @@ enum { > Opt_noquotadf, > Opt_copyfrom, > Opt_nocopyfrom, > + Opt_always_auth, > }; > > static match_table_t fsopt_tokens = { > @@ -210,6 +211,7 @@ enum { > {Opt_noquotadf, "noquotadf"}, > {Opt_copyfrom, "copyfrom"}, > {Opt_nocopyfrom, "nocopyfrom"}, > + {Opt_always_auth, "always_auth"}, > {-1, NULL} > }; > > @@ -381,6 +383,9 @@ static int parse_fsopt_token(char *c, void *private) > case Opt_noacl: > fsopt->sb_flags &= ~SB_POSIXACL; > break; > + case Opt_always_auth: > + fsopt->flags |= CEPH_MOUNT_OPT_ALWAYS_AUTH; > + break; > default: > BUG_ON(token); > } > @@ -563,6 +568,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) > seq_puts(m, ",nopoolperm"); > if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) > seq_puts(m, ",noquotadf"); > + if (fsopt->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH) > + seq_puts(m, ",always_auth"); > > #ifdef CONFIG_CEPH_FS_POSIX_ACL > if (fsopt->sb_flags & SB_POSIXACL) > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 6b9f1ee..65f6423 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -41,6 +41,7 @@ > #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ > #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ > #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ > +#define CEPH_MOUNT_OPT_ALWAYS_AUTH (1<<15) /* send op to auth mds, not to replicative mds */ > > #define CEPH_MOUNT_OPT_DEFAULT \ > (CEPH_MOUNT_OPT_DCACHE | \ I've no particular objection here, but I'd prefer Greg's ack before we merge it, since he raised earlier concerns. If we are going to take it, then this will need to be rebased on top of the mount API conversion that's currently in ceph-client/testing branch.
thanks Jeff Layton <jlayton@kernel.org> 于2019年9月10日周二 下午6:11写道: > > On Mon, 2019-09-09 at 22:43 -0400, simon gao wrote: > > In larger clusters (hundreds of millions of files). We have to pin the > > directory on a fixed mds now. Some op of client use USE_ANY_MDS mode > > to access mds, which may result in requests being sent to noauth mds > > and then forwarded to authmds. > > the opt is used to reduce forward ops by sending req to auth mds. > > --- > > fs/ceph/mds_client.c | 7 ++++++- > > fs/ceph/super.c | 7 +++++++ > > fs/ceph/super.h | 1 + > > 3 files changed, 14 insertions(+), 1 deletion(-) > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index 920e9f0..aca4490 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -878,6 +878,7 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) > > static int __choose_mds(struct ceph_mds_client *mdsc, > > struct ceph_mds_request *req) > > { > > + struct ceph_mount_options *ma = mdsc->fsc->mount_options; > > struct inode *inode; > > struct ceph_inode_info *ci; > > struct ceph_cap *cap; > > @@ -900,7 +901,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > > > > if (mode == USE_RANDOM_MDS) > > goto random; > > - > > + // force to send the req to auth mds > > + if (ma->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH && mode != USE_AUTH_MDS){ > > + dout("change mode %d => USE_AUTH_MDS", mode); > > + mode = USE_AUTH_MDS; > > + } > > inode = NULL; > > if (req->r_inode) { > > if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > > index ab4868c..1e81ebc 100644 > > --- a/fs/ceph/super.c > > +++ b/fs/ceph/super.c > > @@ -169,6 +169,7 @@ enum { > > Opt_noquotadf, > > Opt_copyfrom, > > Opt_nocopyfrom, > > + Opt_always_auth, > > }; > > > > static match_table_t fsopt_tokens = { > > @@ -210,6 +211,7 @@ enum { > > {Opt_noquotadf, "noquotadf"}, > > {Opt_copyfrom, "copyfrom"}, > > {Opt_nocopyfrom, "nocopyfrom"}, > > + {Opt_always_auth, "always_auth"}, > > {-1, NULL} > > }; > > > > @@ -381,6 +383,9 @@ static int parse_fsopt_token(char *c, void *private) > > case Opt_noacl: > > fsopt->sb_flags &= ~SB_POSIXACL; > > break; > > + case Opt_always_auth: > > + fsopt->flags |= CEPH_MOUNT_OPT_ALWAYS_AUTH; > > + break; > > default: > > BUG_ON(token); > > } > > @@ -563,6 +568,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) > > seq_puts(m, ",nopoolperm"); > > if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) > > seq_puts(m, ",noquotadf"); > > + if (fsopt->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH) > > + seq_puts(m, ",always_auth"); > > > > #ifdef CONFIG_CEPH_FS_POSIX_ACL > > if (fsopt->sb_flags & SB_POSIXACL) > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > index 6b9f1ee..65f6423 100644 > > --- a/fs/ceph/super.h > > +++ b/fs/ceph/super.h > > @@ -41,6 +41,7 @@ > > #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ > > #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ > > #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ > > +#define CEPH_MOUNT_OPT_ALWAYS_AUTH (1<<15) /* send op to auth mds, not to replicative mds */ > > > > #define CEPH_MOUNT_OPT_DEFAULT \ > > (CEPH_MOUNT_OPT_DCACHE | \ > > I've no particular objection here, but I'd prefer Greg's ack before we > merge it, since he raised earlier concerns. > > If we are going to take it, then this will need to be rebased on top of > the mount API conversion that's currently in ceph-client/testing branch. > -- > Jeff Layton <jlayton@kernel.org> >
On Tue, Sep 10, 2019 at 3:11 AM Jeff Layton <jlayton@kernel.org> wrote: > I've no particular objection here, but I'd prefer Greg's ack before we > merge it, since he raised earlier concerns. You have my acked-by in light of Zheng's comments elsewhere and the evidence that this actually works in some scenarios. Might be nice to at least get far enough to generate tickets based on your questions in the other thread, though: On Wed, Sep 11, 2019 at 9:26 AM Jeff Layton <jlayton@kernel.org> wrote: > In an ideal world, what should happen in this case? Should we be > changing MDS policy to forward the request in this situation? > > This mount option seems like it's exposing something that is really an > internal implementation detail to the admin. That might be justified, > but I'm unclear on why we don't expect more saner behavior from the MDS > on this? I think partly it's that early designs underestimated the cost of replication and overestimated its utility, but I also thought forwards were supposed to happen more often than replication so I'm curious why it's apparently not doing that. -Greg
On Wed, 2019-09-11 at 11:30 -0700, Gregory Farnum wrote: > On Tue, Sep 10, 2019 at 3:11 AM Jeff Layton <jlayton@kernel.org> wrote: > > I've no particular objection here, but I'd prefer Greg's ack before we > > merge it, since he raised earlier concerns. > > You have my acked-by in light of Zheng's comments elsewhere and the > evidence that this actually works in some scenarios. > > Might be nice to at least get far enough to generate tickets based on > your questions in the other thread, though: > I'm not sold yet. Why is this something the client should have to worry about at all? Can we do something on the MDS to better handle this situation? This really feels like we're exposing an implementation detail via mount option. At a bare minimum, if we take this, I'd like to see some documentation. When should a user decide to turn this on or off? There are no guidelines to the use of this thing so far. > On Wed, Sep 11, 2019 at 9:26 AM Jeff Layton <jlayton@kernel.org> wrote: > > In an ideal world, what should happen in this case? Should we be > > changing MDS policy to forward the request in this situation? > > > > This mount option seems like it's exposing something that is really an > > internal implementation detail to the admin. That might be justified, > > but I'm unclear on why we don't expect more saner behavior from the MDS > > on this? > > I think partly it's that early designs underestimated the cost of > replication and overestimated its utility, but I also thought forwards > were supposed to happen more often than replication so I'm curious why > it's apparently not doing that. > -Greg
On Thu, Sep 12, 2019 at 6:21 AM Jeff Layton <jlayton@kernel.org> wrote: > > On Wed, 2019-09-11 at 11:30 -0700, Gregory Farnum wrote: > > On Tue, Sep 10, 2019 at 3:11 AM Jeff Layton <jlayton@kernel.org> wrote: > > > I've no particular objection here, but I'd prefer Greg's ack before we > > > merge it, since he raised earlier concerns. > > > > You have my acked-by in light of Zheng's comments elsewhere and the > > evidence that this actually works in some scenarios. > > > > Might be nice to at least get far enough to generate tickets based on > > your questions in the other thread, though: > > > > I'm not sold yet. > > Why is this something the client should have to worry about at all? Can > we do something on the MDS to better handle this situation? This really > feels like we're exposing an implementation detail via mount option. > I think we can. make mds return empty DirStat::dist in request reply > At a bare minimum, if we take this, I'd like to see some documentation. > When should a user decide to turn this on or off? There are no > guidelines to the use of this thing so far. > > > > On Wed, Sep 11, 2019 at 9:26 AM Jeff Layton <jlayton@kernel.org> wrote: > > > In an ideal world, what should happen in this case? Should we be > > > changing MDS policy to forward the request in this situation? > > > > > > This mount option seems like it's exposing something that is really an > > > internal implementation detail to the admin. That might be justified, > > > but I'm unclear on why we don't expect more saner behavior from the MDS > > > on this? > > > > I think partly it's that early designs underestimated the cost of > > replication and overestimated its utility, but I also thought forwards > > were supposed to happen more often than replication so I'm curious why > > it's apparently not doing that. > > -Greg > > -- > Jeff Layton <jlayton@kernel.org> >
On Fri, 2019-09-13 at 10:08 +0800, Yan, Zheng wrote: > On Thu, Sep 12, 2019 at 6:21 AM Jeff Layton <jlayton@kernel.org> wrote: > > On Wed, 2019-09-11 at 11:30 -0700, Gregory Farnum wrote: > > > On Tue, Sep 10, 2019 at 3:11 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > I've no particular objection here, but I'd prefer Greg's ack before we > > > > merge it, since he raised earlier concerns. > > > > > > You have my acked-by in light of Zheng's comments elsewhere and the > > > evidence that this actually works in some scenarios. > > > > > > Might be nice to at least get far enough to generate tickets based on > > > your questions in the other thread, though: > > > > > > > I'm not sold yet. > > > > Why is this something the client should have to worry about at all? Can > > we do something on the MDS to better handle this situation? This really > > feels like we're exposing an implementation detail via mount option. > > > > I think we can. make mds return empty DirStat::dist in request reply > I guess that'd make the client think that it wasn't replicated? Under what conditions would you have it return that in the reply? Should we be looking to have the MDS favor forwarding over replication more (as Greg seems to be suggesting)? Note too that I'm not opposed to adding some sort of mitigation for this problem if needed to help with code that's in the field, but I'd prefer to address the root cause if we can so the workaround may not be needed in the future. Mount options are harder to deprecate since they'll be in docs forever, and they are necessarily per-vfsmount. If you do need this, would the switch be more appropriate as a kernel module parameter instead? > > At a bare minimum, if we take this, I'd like to see some documentation. > > When should a user decide to turn this on or off? There are no > > guidelines to the use of this thing so far. > > > > > > > On Wed, Sep 11, 2019 at 9:26 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > In an ideal world, what should happen in this case? Should we be > > > > changing MDS policy to forward the request in this situation? > > > > > > > > This mount option seems like it's exposing something that is really an > > > > internal implementation detail to the admin. That might be justified, > > > > but I'm unclear on why we don't expect more saner behavior from the MDS > > > > on this? > > > > > > I think partly it's that early designs underestimated the cost of > > > replication and overestimated its utility, but I also thought forwards > > > were supposed to happen more often than replication so I'm curious why > > > it's apparently not doing that. > > > -Greg > > > > -- > > Jeff Layton <jlayton@kernel.org> > >
Hi Jeff Yes. If there are too many files in a cluster (100 million levels) far more than cache of a single mds, fixed inodes to special mds, and prevents other mds from loading these inodes, which can achieve better results. This problem is particularly prominent on mds0. This problem is similar to flushcache. Good ratio of ssh and hdd can get better results. Removing the duplicate inodes is equivalent to increasing the total amount of cache. I think. thanks gaoyu > > On Fri, 2019-09-13 at 10:08 +0800, Yan, Zheng wrote: > > On Thu, Sep 12, 2019 at 6:21 AM Jeff Layton <jlayton@kernel.org> wrote: > > > On Wed, 2019-09-11 at 11:30 -0700, Gregory Farnum wrote: > > > > On Tue, Sep 10, 2019 at 3:11 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > I've no particular objection here, but I'd prefer Greg's ack before we > > > > > merge it, since he raised earlier concerns. > > > > > > > > You have my acked-by in light of Zheng's comments elsewhere and the > > > > evidence that this actually works in some scenarios. > > > > > > > > Might be nice to at least get far enough to generate tickets based on > > > > your questions in the other thread, though: > > > > > > > > > > I'm not sold yet. > > > > > > Why is this something the client should have to worry about at all? Can > > > we do something on the MDS to better handle this situation? This really > > > feels like we're exposing an implementation detail via mount option. > > > > > > > I think we can. make mds return empty DirStat::dist in request reply > > > > I guess that'd make the client think that it wasn't replicated? > > Under what conditions would you have it return that in the reply? Should > we be looking to have the MDS favor forwarding over replication more (as > Greg seems to be suggesting)? > > Note too that I'm not opposed to adding some sort of mitigation for this > problem if needed to help with code that's in the field, but I'd prefer > to address the root cause if we can so the workaround may not be needed > in the future. > > Mount options are harder to deprecate since they'll be in docs forever, > and they are necessarily per-vfsmount. If you do need this, would the > switch be more appropriate as a kernel module parameter instead? > > > > At a bare minimum, if we take this, I'd like to see some documentation. > > > When should a user decide to turn this on or off? There are no > > > guidelines to the use of this thing so far. > > > > > > > > > > On Wed, Sep 11, 2019 at 9:26 AM Jeff Layton <jlayton@kernel.org> wrote: > > > > > In an ideal world, what should happen in this case? Should we be > > > > > changing MDS policy to forward the request in this situation? > > > > > > > > > > This mount option seems like it's exposing something that is really an > > > > > internal implementation detail to the admin. That might be justified, > > > > > but I'm unclear on why we don't expect more saner behavior from the MDS > > > > > on this? > > > > > > > > I think partly it's that early designs underestimated the cost of > > > > replication and overestimated its utility, but I also thought forwards > > > > were supposed to happen more often than replication so I'm curious why > > > > it's apparently not doing that. > > > > -Greg > > > > > > -- > > > Jeff Layton <jlayton@kernel.org> > > > > > -- > Jeff Layton <jlayton@kernel.org> >
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f0..aca4490 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -878,6 +878,7 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct ceph_mount_options *ma = mdsc->fsc->mount_options; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; @@ -900,7 +901,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (mode == USE_RANDOM_MDS) goto random; - + // force to send the req to auth mds + if (ma->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH && mode != USE_AUTH_MDS){ + dout("change mode %d => USE_AUTH_MDS", mode); + mode = USE_AUTH_MDS; + } inode = NULL; if (req->r_inode) { if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ab4868c..1e81ebc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -169,6 +169,7 @@ enum { Opt_noquotadf, Opt_copyfrom, Opt_nocopyfrom, + Opt_always_auth, }; static match_table_t fsopt_tokens = { @@ -210,6 +211,7 @@ enum { {Opt_noquotadf, "noquotadf"}, {Opt_copyfrom, "copyfrom"}, {Opt_nocopyfrom, "nocopyfrom"}, + {Opt_always_auth, "always_auth"}, {-1, NULL} }; @@ -381,6 +383,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noacl: fsopt->sb_flags &= ~SB_POSIXACL; break; + case Opt_always_auth: + fsopt->flags |= CEPH_MOUNT_OPT_ALWAYS_AUTH; + break; default: BUG_ON(token); } @@ -563,6 +568,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nopoolperm"); if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) seq_puts(m, ",noquotadf"); + if (fsopt->flags & CEPH_MOUNT_OPT_ALWAYS_AUTH) + seq_puts(m, ",always_auth"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & SB_POSIXACL) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee..65f6423 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -41,6 +41,7 @@ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ +#define CEPH_MOUNT_OPT_ALWAYS_AUTH (1<<15) /* send op to auth mds, not to replicative mds */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \