diff mbox series

[5/5] libceph: read_policy option

Message ID 20200529151952.15184-6-idryomov@gmail.com (mailing list archive)
State New, archived
Headers show
Series libceph: support for replica reads | expand

Commit Message

Ilya Dryomov May 29, 2020, 3:19 p.m. UTC
Expose balanced and localized reads through read_policy=balance
and read_policy=localize.  The default is to read from primary.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |  2 ++
 net/ceph/ceph_common.c       | 26 ++++++++++++++++++++++++++
 net/ceph/osd_client.c        |  5 ++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

Comments

Jeff Layton May 29, 2020, 5:19 p.m. UTC | #1
On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote:
> Expose balanced and localized reads through read_policy=balance
> and read_policy=localize.  The default is to read from primary.
> 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>  include/linux/ceph/libceph.h |  2 ++
>  net/ceph/ceph_common.c       | 26 ++++++++++++++++++++++++++
>  net/ceph/osd_client.c        |  5 ++++-
>  3 files changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 4733959f1ec7..0a9f807ceda6 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -52,6 +52,8 @@ struct ceph_options {
>  	unsigned long osd_idle_ttl;		/* jiffies */
>  	unsigned long osd_keepalive_timeout;	/* jiffies */
>  	unsigned long osd_request_timeout;	/* jiffies */
> +	unsigned int osd_req_flags;  /* CEPH_OSD_FLAG_*, applied to
> +					each OSD request */
>  
>  	/*
>  	 * any type that can't be simply compared or doesn't need
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 6d495685ee03..1a834cb0d04d 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -265,6 +265,7 @@ enum {
>  	Opt_key,
>  	Opt_ip,
>  	Opt_crush_location,
> +	Opt_read_policy,
>  	/* string args above */
>  	Opt_share,
>  	Opt_crc,
> @@ -274,6 +275,17 @@ enum {
>  	Opt_abort_on_full,
>  };
>  
> +enum {
> +	Opt_read_policy_balance,
> +	Opt_read_policy_localize,
> +};
> +
> +static const struct constant_table ceph_param_read_policy[] = {
> +	{"balance",	Opt_read_policy_balance},
> +	{"localize",	Opt_read_policy_localize},
> +	{}
> +};
> +
>  static const struct fs_parameter_spec ceph_parameters[] = {
>  	fsparam_flag	("abort_on_full",		Opt_abort_on_full),
>  	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
> @@ -290,6 +302,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
>  	fsparam_u32	("osdkeepalive",		Opt_osdkeepalivetimeout),
>  	__fsparam	(fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
>  			 fs_param_deprecated, NULL),
> +	fsparam_enum	("read_policy",			Opt_read_policy,
> +			 ceph_param_read_policy),
>  	fsparam_string	("secret",			Opt_secret),
>  	fsparam_flag_no ("share",			Opt_share),
>  	fsparam_flag_no ("tcp_nodelay",			Opt_tcp_nodelay),
> @@ -470,6 +484,18 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
>  			return err;
>  		}
>  		break;
> +	case Opt_read_policy:
> +		switch (result.uint_32) {
> +		case Opt_read_policy_balance:
> +			opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS;
> +			break;
> +		case Opt_read_policy_localize:
> +			opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
> +			break;
> +		default:
> +			BUG();
> +		}
> +		break;

Suppose I specify "-o read_policy=balance,read_policy=localize".

Principle of least surprise says "last one wins", but you'll end up with
both flags set here, and I think the final result would still be
"balance". I think it'd probably be best to rework this so that the last
option specified is what you get.

I also think you want a way to explicitly set it back to default
behavior (read_policy=primary ?), as it's not uncommon for people to
specify mount options in fstab but then append to them on the command
line. e.g.:

    # mount /mnt/cephfs -o read_policy=primary

...when fstab already has read_policy=balance.

 
>  	case Opt_osdtimeout:
>  		warn_plog(&log, "Ignoring osdtimeout");
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 15c3afa8089b..da7046db9fbe 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2425,11 +2425,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
>  
>  static void account_request(struct ceph_osd_request *req)
>  {
> +	struct ceph_osd_client *osdc = req->r_osdc;
> +
>  	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
>  	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
>  
>  	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
> -	atomic_inc(&req->r_osdc->num_requests);
> +	req->r_flags |= osdc->client->options->osd_req_flags;
> +	atomic_inc(&osdc->num_requests);
>  
>  	req->r_start_stamp = jiffies;
>  	req->r_start_latency = ktime_get();
Ilya Dryomov May 29, 2020, 6:42 p.m. UTC | #2
On Fri, May 29, 2020 at 7:19 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote:
> > Expose balanced and localized reads through read_policy=balance
> > and read_policy=localize.  The default is to read from primary.
> >
> > Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> > ---
> >  include/linux/ceph/libceph.h |  2 ++
> >  net/ceph/ceph_common.c       | 26 ++++++++++++++++++++++++++
> >  net/ceph/osd_client.c        |  5 ++++-
> >  3 files changed, 32 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> > index 4733959f1ec7..0a9f807ceda6 100644
> > --- a/include/linux/ceph/libceph.h
> > +++ b/include/linux/ceph/libceph.h
> > @@ -52,6 +52,8 @@ struct ceph_options {
> >       unsigned long osd_idle_ttl;             /* jiffies */
> >       unsigned long osd_keepalive_timeout;    /* jiffies */
> >       unsigned long osd_request_timeout;      /* jiffies */
> > +     unsigned int osd_req_flags;  /* CEPH_OSD_FLAG_*, applied to
> > +                                     each OSD request */
> >
> >       /*
> >        * any type that can't be simply compared or doesn't need
> > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> > index 6d495685ee03..1a834cb0d04d 100644
> > --- a/net/ceph/ceph_common.c
> > +++ b/net/ceph/ceph_common.c
> > @@ -265,6 +265,7 @@ enum {
> >       Opt_key,
> >       Opt_ip,
> >       Opt_crush_location,
> > +     Opt_read_policy,
> >       /* string args above */
> >       Opt_share,
> >       Opt_crc,
> > @@ -274,6 +275,17 @@ enum {
> >       Opt_abort_on_full,
> >  };
> >
> > +enum {
> > +     Opt_read_policy_balance,
> > +     Opt_read_policy_localize,
> > +};
> > +
> > +static const struct constant_table ceph_param_read_policy[] = {
> > +     {"balance",     Opt_read_policy_balance},
> > +     {"localize",    Opt_read_policy_localize},
> > +     {}
> > +};
> > +
> >  static const struct fs_parameter_spec ceph_parameters[] = {
> >       fsparam_flag    ("abort_on_full",               Opt_abort_on_full),
> >       fsparam_flag_no ("cephx_require_signatures",    Opt_cephx_require_signatures),
> > @@ -290,6 +302,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
> >       fsparam_u32     ("osdkeepalive",                Opt_osdkeepalivetimeout),
> >       __fsparam       (fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
> >                        fs_param_deprecated, NULL),
> > +     fsparam_enum    ("read_policy",                 Opt_read_policy,
> > +                      ceph_param_read_policy),
> >       fsparam_string  ("secret",                      Opt_secret),
> >       fsparam_flag_no ("share",                       Opt_share),
> >       fsparam_flag_no ("tcp_nodelay",                 Opt_tcp_nodelay),
> > @@ -470,6 +484,18 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
> >                       return err;
> >               }
> >               break;
> > +     case Opt_read_policy:
> > +             switch (result.uint_32) {
> > +             case Opt_read_policy_balance:
> > +                     opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS;
> > +                     break;
> > +             case Opt_read_policy_localize:
> > +                     opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
> > +                     break;
> > +             default:
> > +                     BUG();
> > +             }
> > +             break;
>
> Suppose I specify "-o read_policy=balance,read_policy=localize".
>
> Principle of least surprise says "last one wins", but you'll end up with
> both flags set here, and I think the final result would still be
> "balance". I think it'd probably be best to rework this so that the last
> option specified is what you get.
>
> I also think you want a way to explicitly set it back to default
> behavior (read_policy=primary ?), as it's not uncommon for people to
> specify mount options in fstab but then append to them on the command
> line. e.g.:
>
>     # mount /mnt/cephfs -o read_policy=primary
>
> ...when fstab already has read_policy=balance.

Yes, currently balance wins.

Adding read_policy=primary and implementing the override behaviour
for read_policy sounds reasonable.

Thanks,

                Ilya
diff mbox series

Patch

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 4733959f1ec7..0a9f807ceda6 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -52,6 +52,8 @@  struct ceph_options {
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
 	unsigned long osd_request_timeout;	/* jiffies */
+	unsigned int osd_req_flags;  /* CEPH_OSD_FLAG_*, applied to
+					each OSD request */
 
 	/*
 	 * any type that can't be simply compared or doesn't need
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 6d495685ee03..1a834cb0d04d 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -265,6 +265,7 @@  enum {
 	Opt_key,
 	Opt_ip,
 	Opt_crush_location,
+	Opt_read_policy,
 	/* string args above */
 	Opt_share,
 	Opt_crc,
@@ -274,6 +275,17 @@  enum {
 	Opt_abort_on_full,
 };
 
+enum {
+	Opt_read_policy_balance,
+	Opt_read_policy_localize,
+};
+
+static const struct constant_table ceph_param_read_policy[] = {
+	{"balance",	Opt_read_policy_balance},
+	{"localize",	Opt_read_policy_localize},
+	{}
+};
+
 static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag	("abort_on_full",		Opt_abort_on_full),
 	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
@@ -290,6 +302,8 @@  static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_u32	("osdkeepalive",		Opt_osdkeepalivetimeout),
 	__fsparam	(fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
 			 fs_param_deprecated, NULL),
+	fsparam_enum	("read_policy",			Opt_read_policy,
+			 ceph_param_read_policy),
 	fsparam_string	("secret",			Opt_secret),
 	fsparam_flag_no ("share",			Opt_share),
 	fsparam_flag_no ("tcp_nodelay",			Opt_tcp_nodelay),
@@ -470,6 +484,18 @@  int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 			return err;
 		}
 		break;
+	case Opt_read_policy:
+		switch (result.uint_32) {
+		case Opt_read_policy_balance:
+			opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS;
+			break;
+		case Opt_read_policy_localize:
+			opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+			break;
+		default:
+			BUG();
+		}
+		break;
 
 	case Opt_osdtimeout:
 		warn_plog(&log, "Ignoring osdtimeout");
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 15c3afa8089b..da7046db9fbe 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2425,11 +2425,14 @@  static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 
 static void account_request(struct ceph_osd_request *req)
 {
+	struct ceph_osd_client *osdc = req->r_osdc;
+
 	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
 	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
 	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
-	atomic_inc(&req->r_osdc->num_requests);
+	req->r_flags |= osdc->client->options->osd_req_flags;
+	atomic_inc(&osdc->num_requests);
 
 	req->r_start_stamp = jiffies;
 	req->r_start_latency = ktime_get();