
libceph: Complete stuck requests to OSD with EIO

Message ID: 4e080919-2df5-1b75-3c8a-3ae95eb8f08d@synesis.ru

Commit Message

Artur Molchanov Feb. 9, 2017, 1:04 p.m. UTC
From: Artur Molchanov <artur.molchanov@synesis.ru>

Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
If osd_request_timeout equals 0 (the default value) then do nothing with
hung requests (keep the default behavior).

Create RBD map option osd_request_timeout to set timeout in seconds. Set
osd_request_timeout to 0 by default.

Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
---
  include/linux/ceph/libceph.h    |  8 +++--
  include/linux/ceph/osd_client.h |  1 +
  net/ceph/ceph_common.c          | 12 +++++++
  net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++++++++++
  4 files changed, 90 insertions(+), 3 deletions(-)

Comments

Ilya Dryomov Feb. 9, 2017, 2:27 p.m. UTC | #1
On Thu, Feb 9, 2017 at 2:04 PM, Artur Molchanov
<artur.molchanov@synesis.ru> wrote:
> From: Artur Molchanov <artur.molchanov@synesis.ru>
>
> Complete stuck requests to OSD with error EIO after osd_request_timeout
> expired.
> If osd_request_timeout equals to 0 (default value) then do nothing with
> hung requests (keep default behavior).
>
> Create RBD map option osd_request_timeout to set timeout in seconds. Set
> osd_request_timeout to 0 by default.

Hi Artur,

What is the use case?  What do you expect this timeout to be in your
environment?

I actually have a slightly more compact version of this somewhere,
which I deliberately excluded when updating the OSD client a few kernel
releases ago.  IIRC complete_request() wasn't meant to be called
arbitrarily, but we can probably fix that.

Thanks,

                Ilya
Artur Molchanov Feb. 9, 2017, 3:51 p.m. UTC | #2
Hi Ilya,

Currently we're working on moving our product to a kubernetes / ceph stack. We're
actively tuning the ceph configuration, and the ceph cluster often needs to be
redeployed. As a result, we have to warm reboot all machines with mounted RBDs.
This patch allows us to remap and remount RBDs without a time-consuming reboot.
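
With the patch applied, the timeout can be passed at map time like any other
libceph option, e.g. (the pool/image name and the 60-second value are just
illustrative):

    rbd map -o osd_request_timeout=60 mypool/myimage

Requests stuck for longer than 60 seconds are then completed with EIO instead
of blocking forever.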

On 02/09/2017 05:27 PM, Ilya Dryomov wrote:
> What is the use case?  What do you expect this timeout to be in your
> environment?
Wido den Hollander Feb. 9, 2017, 4:11 p.m. UTC | #3
> Op 9 februari 2017 om 16:51 schreef Artur Molchanov <artur.molchanov@synesis.ru>:
> 
> 
> Hi Ilya,
> 
> Currently we're working on moving our product to kubernetes / ceph stack. We're 
> actively tuning ceph configuration and ceph cluster often need to be redeployed. 
> We have to warm reboot all machines with mounted RBDs as a result. This patch 
> allows us to remap and remount RBDs without time consuming reboot.
> 

I understand the use case. You don't always want to block; sometimes you want to time out after some time.

Wido

> On 02/09/2017 05:27 PM, Ilya Dryomov wrote:
> > What is the use case?  What do you expect this timeout to be in your
> > environment?
Jeff Layton Feb. 10, 2017, 11:31 a.m. UTC | #4
On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
> From: Artur Molchanov <artur.molchanov@synesis.ru>
> 
> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
> If osd_request_timeout equals to 0 (default value) then do nothing with
> hung requests (keep default behavior).
> 
> Create RBD map option osd_request_timeout to set timeout in seconds. Set
> osd_request_timeout to 0 by default.
> 
Also, what exactly are the requests blocked on when this occurs? Is the
ceph_osd_request_target ending up paused? I wonder if we might be better
off with something that returns a hard error under the circumstances
where you're hanging, rather than depending on timeouts.


> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
> ---
>   include/linux/ceph/libceph.h    |  8 +++--
>   include/linux/ceph/osd_client.h |  1 +
>   net/ceph/ceph_common.c          | 12 +++++++
>   net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 90 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 1816c5e..10d8acb 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -48,6 +48,7 @@ struct ceph_options {
>   	unsigned long mount_timeout;		/* jiffies */
>   	unsigned long osd_idle_ttl;		/* jiffies */
>   	unsigned long osd_keepalive_timeout;	/* jiffies */
> +	unsigned long osd_request_timeout;	/* jiffies */
> 
>   	/*
>   	 * any type that can't be simply compared or doesn't need need
> @@ -65,9 +66,10 @@ struct ceph_options {
>   /*
>    * defaults
>    */
> -#define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
> -#define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
> -#define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
> +#define CEPH_MOUNT_TIMEOUT_DEFAULT		msecs_to_jiffies(60 * 1000)
> +#define CEPH_OSD_KEEPALIVE_DEFAULT		msecs_to_jiffies(5 * 1000)
> +#define CEPH_OSD_IDLE_TTL_DEFAULT		msecs_to_jiffies(60 * 1000)
> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT	msecs_to_jiffies(0 * 1000)
> 
>   #define CEPH_MONC_HUNT_INTERVAL		msecs_to_jiffies(3 * 1000)
>   #define CEPH_MONC_PING_INTERVAL		msecs_to_jiffies(10 * 1000)
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 03a6653..e45cba0 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>   	atomic_t               num_homeless;
>   	struct delayed_work    timeout_work;
>   	struct delayed_work    osds_timeout_work;
> +	struct delayed_work    osd_request_timeout_work;
>   #ifdef CONFIG_DEBUG_FS
>   	struct dentry 	       *debugfs_file;
>   #endif
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 464e885..81876c8 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -230,6 +230,7 @@ enum {
>   	Opt_osdkeepalivetimeout,
>   	Opt_mount_timeout,
>   	Opt_osd_idle_ttl,
> +	Opt_osd_request_timeout,
>   	Opt_last_int,
>   	/* int args above */
>   	Opt_fsid,
> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>   	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>   	{Opt_mount_timeout, "mount_timeout=%d"},
>   	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
> +	{Opt_osd_request_timeout, "osd_request_timeout=%d"},
>   	/* int args above */
>   	{Opt_fsid, "fsid=%s"},
>   	{Opt_name, "name=%s"},
> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>   	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>   	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>   	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
> +	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
> 
>   	/* get mon ip(s) */
>   	/* ip1[:port1][,ip2[:port2]...] */
> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>   			}
>   			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>   			break;
> +		case Opt_osd_request_timeout:
> +			/* 0 is "wait forever" (i.e. infinite timeout) */
> +			if (intval < 0 || intval > INT_MAX / 1000) {
> +				pr_err("osd_request_timeout out of range\n");
> +				err = -EINVAL;
> +				goto out;
> +			}
> +			opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
> +			break;
> 
>   		case Opt_share:
>   			opt->flags &= ~CEPH_OPT_NOSHARE;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 842f049..2f5a024 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>   			      osdc->client->options->osd_keepalive_timeout);
>   }
> 
> +/*
> + * Complete request to OSD with error err if it started before cutoff.
> + */
> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
> +				       unsigned long cutoff,
> +				       int err)
> +{
> +	if (!time_before(req->r_stamp, cutoff))
> +		return;
> +
> +	pr_warn_ratelimited("osd req on osd%d timeout expired\n",
> +			    req->r_osd->o_osd);
> +
> +	complete_request(req, err);
> +}
> +
> +/*
> + * Complete all requests to OSD that has been active for more than timeout.
> + */
> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
> +					unsigned long timeout,
> +					int err)
> +{
> +	struct ceph_osd_request *req, *n;
> +	const unsigned long cutoff = jiffies - timeout;
> +
> +	rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
> +		complete_osd_stuck_request(req, cutoff, -EIO);
> +	}
> +}
> +
> +/*
> + * Timeout callback, called every N seconds. When 1 or more OSD
> + * requests that has been active for more than N seconds,
> + * we complete it with error EIO.
> + */
> +static void handle_osd_request_timeout(struct work_struct *work)
> +{
> +	struct ceph_osd_client *osdc =
> +		container_of(work, struct ceph_osd_client,
> +			     osd_request_timeout_work.work);
> +	struct ceph_osd *osd, *n;
> +	struct ceph_options *opts = osdc->client->options;
> +	const unsigned long timeout = opts->osd_request_timeout;
> +
> +	dout("%s osdc %p\n", __func__, osdc);
> +
> +	if (!timeout)
> +		return;
> +
> +	down_write(&osdc->lock);
> +
> +	rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
> +		complete_osd_stuck_requests(osd, timeout, -EIO);
> +	}
> +
> +	up_write(&osdc->lock);
> +
> +	if (!atomic_read(&osdc->num_homeless))
> +		goto out;
> +
> +	down_write(&osdc->lock);
> +	complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
> +	up_write(&osdc->lock);
> +
> +out:
> +	schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
> +}
> +
>   static void handle_osds_timeout(struct work_struct *work)
>   {
>   	struct ceph_osd_client *osdc =
> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>   	osdc->linger_map_checks = RB_ROOT;
>   	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>   	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
> +	INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
> 
>   	err = -ENOMEM;
>   	osdc->osdmap = ceph_osdmap_alloc();
> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>   			      osdc->client->options->osd_keepalive_timeout);
>   	schedule_delayed_work(&osdc->osds_timeout_work,
>   	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
> +	schedule_delayed_work(&osdc->osd_request_timeout_work,
> +	    round_jiffies_relative(osdc->client->options->osd_request_timeout));
> 
>   	return 0;
> 

Having a job that has to wake up every second or so isn't ideal. Perhaps
you would be better off scheduling the delayed work in the request
submission codepath, and only rearming it when the tree isn't empty after
calling complete_osd_stuck_requests?

Also, I don't see where this job is ever cancelled when the osdc is torn
down. That needs to occur or you'll cause a use-after-free oops...
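
Something along these lines in the teardown path would close that hole (just
a sketch, assuming the cancellation belongs in ceph_osdc_stop() next to the
existing delayed works):

--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ ... @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	cancel_delayed_work_sync(&osdc->timeout_work);
 	cancel_delayed_work_sync(&osdc->osds_timeout_work);
+	cancel_delayed_work_sync(&osdc->osd_request_timeout_work);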
Ilya Dryomov Feb. 10, 2017, 12:25 p.m. UTC | #5
On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>
>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>> If osd_request_timeout equals to 0 (default value) then do nothing with
>> hung requests (keep default behavior).
>>
>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>> osd_request_timeout to 0 by default.
>>
> Also, what exactly are the requests blocked on when this occurs? Is the
> ceph_osd_request_target ending up paused? I wonder if we might be better
> off with something that returns a hard error under the circumstances
> where you're hanging, rather than depending on timeouts.
>
>
>> Signed-off-by: Artur Molchanov <artur.molchanov@synesis.ru>
>> ---
>>   include/linux/ceph/libceph.h    |  8 +++--
>>   include/linux/ceph/osd_client.h |  1 +
>>   net/ceph/ceph_common.c          | 12 +++++++
>>   net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 90 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>> index 1816c5e..10d8acb 100644
>> --- a/include/linux/ceph/libceph.h
>> +++ b/include/linux/ceph/libceph.h
>> @@ -48,6 +48,7 @@ struct ceph_options {
>>       unsigned long mount_timeout;            /* jiffies */
>>       unsigned long osd_idle_ttl;             /* jiffies */
>>       unsigned long osd_keepalive_timeout;    /* jiffies */
>> +     unsigned long osd_request_timeout;      /* jiffies */
>>
>>       /*
>>        * any type that can't be simply compared or doesn't need need
>> @@ -65,9 +66,10 @@ struct ceph_options {
>>   /*
>>    * defaults
>>    */
>> -#define CEPH_MOUNT_TIMEOUT_DEFAULT   msecs_to_jiffies(60 * 1000)
>> -#define CEPH_OSD_KEEPALIVE_DEFAULT   msecs_to_jiffies(5 * 1000)
>> -#define CEPH_OSD_IDLE_TTL_DEFAULT    msecs_to_jiffies(60 * 1000)
>> +#define CEPH_MOUNT_TIMEOUT_DEFAULT           msecs_to_jiffies(60 * 1000)
>> +#define CEPH_OSD_KEEPALIVE_DEFAULT           msecs_to_jiffies(5 * 1000)
>> +#define CEPH_OSD_IDLE_TTL_DEFAULT            msecs_to_jiffies(60 * 1000)
>> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT     msecs_to_jiffies(0 * 1000)
>>
>>   #define CEPH_MONC_HUNT_INTERVAL             msecs_to_jiffies(3 * 1000)
>>   #define CEPH_MONC_PING_INTERVAL             msecs_to_jiffies(10 * 1000)
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index 03a6653..e45cba0 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>>       atomic_t               num_homeless;
>>       struct delayed_work    timeout_work;
>>       struct delayed_work    osds_timeout_work;
>> +     struct delayed_work    osd_request_timeout_work;
>>   #ifdef CONFIG_DEBUG_FS
>>       struct dentry          *debugfs_file;
>>   #endif
>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>> index 464e885..81876c8 100644
>> --- a/net/ceph/ceph_common.c
>> +++ b/net/ceph/ceph_common.c
>> @@ -230,6 +230,7 @@ enum {
>>       Opt_osdkeepalivetimeout,
>>       Opt_mount_timeout,
>>       Opt_osd_idle_ttl,
>> +     Opt_osd_request_timeout,
>>       Opt_last_int,
>>       /* int args above */
>>       Opt_fsid,
>> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>>       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>>       {Opt_mount_timeout, "mount_timeout=%d"},
>>       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
>> +     {Opt_osd_request_timeout, "osd_request_timeout=%d"},
>>       /* int args above */
>>       {Opt_fsid, "fsid=%s"},
>>       {Opt_name, "name=%s"},
>> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>>       opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>>       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>>       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
>> +     opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>>
>>       /* get mon ip(s) */
>>       /* ip1[:port1][,ip2[:port2]...] */
>> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>>                       }
>>                       opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>>                       break;
>> +             case Opt_osd_request_timeout:
>> +                     /* 0 is "wait forever" (i.e. infinite timeout) */
>> +                     if (intval < 0 || intval > INT_MAX / 1000) {
>> +                             pr_err("osd_request_timeout out of range\n");
>> +                             err = -EINVAL;
>> +                             goto out;
>> +                     }
>> +                     opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
>> +                     break;
>>
>>               case Opt_share:
>>                       opt->flags &= ~CEPH_OPT_NOSHARE;
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 842f049..2f5a024 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>>                             osdc->client->options->osd_keepalive_timeout);
>>   }
>>
>> +/*
>> + * Complete request to OSD with error err if it started before cutoff.
>> + */
>> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
>> +                                    unsigned long cutoff,
>> +                                    int err)
>> +{
>> +     if (!time_before(req->r_stamp, cutoff))
>> +             return;
>> +
>> +     pr_warn_ratelimited("osd req on osd%d timeout expired\n",
>> +                         req->r_osd->o_osd);
>> +
>> +     complete_request(req, err);
>> +}
>> +
>> +/*
>> + * Complete all requests to OSD that has been active for more than timeout.
>> + */
>> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
>> +                                     unsigned long timeout,
>> +                                     int err)
>> +{
>> +     struct ceph_osd_request *req, *n;
>> +     const unsigned long cutoff = jiffies - timeout;
>> +
>> +     rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
>> +             complete_osd_stuck_request(req, cutoff, -EIO);
>> +     }
>> +}
>> +
>> +/*
>> + * Timeout callback, called every N seconds. When 1 or more OSD
>> + * requests that has been active for more than N seconds,
>> + * we complete it with error EIO.
>> + */
>> +static void handle_osd_request_timeout(struct work_struct *work)
>> +{
>> +     struct ceph_osd_client *osdc =
>> +             container_of(work, struct ceph_osd_client,
>> +                          osd_request_timeout_work.work);
>> +     struct ceph_osd *osd, *n;
>> +     struct ceph_options *opts = osdc->client->options;
>> +     const unsigned long timeout = opts->osd_request_timeout;
>> +
>> +     dout("%s osdc %p\n", __func__, osdc);
>> +
>> +     if (!timeout)
>> +             return;
>> +
>> +     down_write(&osdc->lock);
>> +
>> +     rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
>> +             complete_osd_stuck_requests(osd, timeout, -EIO);
>> +     }
>> +
>> +     up_write(&osdc->lock);
>> +
>> +     if (!atomic_read(&osdc->num_homeless))
>> +             goto out;
>> +
>> +     down_write(&osdc->lock);
>> +     complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
>> +     up_write(&osdc->lock);
>> +
>> +out:
>> +     schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
>> +}
>> +
>>   static void handle_osds_timeout(struct work_struct *work)
>>   {
>>       struct ceph_osd_client *osdc =
>> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>>       osdc->linger_map_checks = RB_ROOT;
>>       INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>>       INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
>> +     INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>>
>>       err = -ENOMEM;
>>       osdc->osdmap = ceph_osdmap_alloc();
>> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>>                             osdc->client->options->osd_keepalive_timeout);
>>       schedule_delayed_work(&osdc->osds_timeout_work,
>>           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>> +     schedule_delayed_work(&osdc->osd_request_timeout_work,
>> +         round_jiffies_relative(osdc->client->options->osd_request_timeout));
>>
>>       return 0;
>>
>
> Having a job that has to wake up every second or so isn't ideal. Perhaps
> you would be better off scheduling the delayed work in the request
> submission codepath, and only rearm it when the tree isn't empty after
> calling complete_osd_stuck_requests?
>
> Also, I don't see where this job is ever cancelled when the osdc is torn
> down. That needs to occur or you'll cause a use-after-free oops...

Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe()
can't be used because complete_request() calls erase(), etc.  The patch
I had just piggy-backed on handle_timeout(), which is called every five
seconds anyway.
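
For reference, the usual safe pattern is to advance the iterator before
anything that might erase and rebalance the tree (the helper name is made up
here; the caller is assumed to hold osdc->lock, as in the patch):

static void abort_stuck_requests(struct ceph_osd *osd, unsigned long cutoff)
{
	struct rb_node *p = rb_first(&osd->o_requests);

	while (p) {
		struct ceph_osd_request *req =
			rb_entry(p, struct ceph_osd_request, r_node);

		p = rb_next(p);	/* advance before a possible erase */
		if (time_before(req->r_stamp, cutoff))
			complete_request(req, -EIO);
	}
}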

Given that there is interest, I think it's worth adding.

Thanks,

                Ilya
Artur Molchanov Feb. 11, 2017, 8:30 p.m. UTC | #6
Hi Jeff,

On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>
>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>> If osd_request_timeout equals to 0 (default value) then do nothing with
>> hung requests (keep default behavior).
>>
>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>> osd_request_timeout to 0 by default.
>>
> Also, what exactly are the requests blocked on when this occurs? Is the
> ceph_osd_request_target ending up paused? I wonder if we might be better
> off with something that returns a hard error under the circumstances
> where you're hanging, rather than depending on timeouts.

I think it is better to complete requests only after the timeout has expired,
because a request can fail due to temporary network issues (e.g. a router
restarting) or a machine/service restarting.

> Having a job that has to wake up every second or so isn't ideal. Perhaps
> you would be better off scheduling the delayed work in the request
> submission codepath, and only rearm it when the tree isn't empty after
> calling complete_osd_stuck_requests?

Would you please tell me more about rearming the work only if the tree is not
empty after calling complete_osd_stuck_requests? From which code should we call
complete_osd_stuck_requests?

As I understand it, there are two primary cases:
1 - Requests to an OSD failed, but the monitors do not return a new osdmap
(because all monitors are offline or the monitors have not updated the osdmap yet).
In this case requests are retried by cyclically calling ceph_con_workfn -> con_fault
-> ceph_con_workfn. We can check the request timestamp and complete the request
instead of calling con_fault.

2 - The monitors return a new osdmap which does not have any OSD for the RBD.
In this case all requests to the last ready OSD will be linked to the "homeless" OSD
and will not be retried until a new osdmap with an appropriate OSD is received. I
think we need an additional periodic check of such requests' timestamps.

Yes, there is already an existing job, handle_timeout. But the responsibility of
that job is to send keepalive requests to slow OSDs. I'm not sure it is a good
idea to perform additional actions inside that job, so I decided that creating a
dedicated job, handle_osd_request_timeout, is more appropriate.

With the default value of osd_request_timeout (0), this job will run only once.
At the same time, I think users will not pick too small a value for this
parameter; a typical value will probably be about one minute or greater.

> Also, I don't see where this job is ever cancelled when the osdc is torn
> down. That needs to occur or you'll cause a use-after-free oops...

It is my fault, thanks for the correction.
Jeff Layton Feb. 12, 2017, 12:34 p.m. UTC | #7
On Sat, 2017-02-11 at 23:30 +0300, Artur Molchanov wrote:
> Hi Jeff,
> 
> On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
> > On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
> > > From: Artur Molchanov <artur.molchanov@synesis.ru>
> > > 
> > > Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
> > > If osd_request_timeout equals to 0 (default value) then do nothing with
> > > hung requests (keep default behavior).
> > > 
> > > Create RBD map option osd_request_timeout to set timeout in seconds. Set
> > > osd_request_timeout to 0 by default.
> > > 
> > 
> > Also, what exactly are the requests blocked on when this occurs? Is the
> > ceph_osd_request_target ending up paused? I wonder if we might be better
> > off with something that returns a hard error under the circumstances
> > where you're hanging, rather than depending on timeouts.
> 
> I wonder that it is better to complete requests only after timeout expired, just 
> because a request can fail due to temporary network issues (e.g. router 
> restarted) or restarting machine/services.
> 
> > Having a job that has to wake up every second or so isn't ideal. Perhaps
> > you would be better off scheduling the delayed work in the request
> > submission codepath, and only rearm it when the tree isn't empty after
> > calling complete_osd_stuck_requests?
> 
> Would you please tell me more about rearming work only if the tree is not empty 
> after calling complete_osd_stuck_requests? From what code we should call 
> complete_osd_stuck_requests?
> 

Sure. I'm saying you would want to call schedule_delayed_work for the
timeout handler from the request submission path when you link a request
with a timeout into the tree. Maybe in __submit_request?

Then, instead of unconditionally calling schedule_delayed_work at the
end of handle_osd_request_timeout, you'd only call it if there were
requests still sitting in the osdc trees.
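
Roughly like this (a sketch of the idea only; I'm assuming __submit_request()
as the hook point and that num_requests counts all in-flight requests,
homeless ones included):

	/* in __submit_request(), after the request is linked into a tree */
	if (osdc->client->options->osd_request_timeout)
		schedule_delayed_work(&osdc->osd_request_timeout_work,
				      osdc->client->options->osd_request_timeout);

	/* at the end of handle_osd_request_timeout(), rearm only if needed */
	if (atomic_read(&osdc->num_requests))
		schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);

schedule_delayed_work() is a no-op when the work is already pending, so the
call in the submission path stays cheap.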

> As I understood, there are two primary cases:
> 1 - Requests to OSD failed, but monitors do not return new osdmap (because all 
> monitors are offline or monitors did not update osdmap yet).
> In this case requests are retried by cyclic calling ceph_con_workfn -> con_fault 
> -> ceph_con_workfn. We can check request timestamp and does not call con_fault 
> but complete it.
> 
> 2 - Monitors return new osdmap which does not have any OSD for RBD.
> In this case all requests to the last ready OSD will be linked on "homeless" OSD 
> and will not be retried until new osdmap with appropriate OSD received. I think 
> that we need additional periodic checking timestamp such requests.
> 
> Yes, there is already existing job handle_timeout. But the responsibility of 
> this job is to sending keepalive requests to slow OSD. I'm not sure that it is a 
> good idea to perform additional actions inside this job.
> I decided that creating specific job handle_osd_request_timeout is more applicable.
> 
> This job will be run only once with a default value of osd_request_timeout (0).

Ahh, I missed that -- thanks.

> At the same time, I think that user will not use too small value for this 
> parameter. I wonder that typical value will be about 1 minute or greater.
> 
> > Also, I don't see where this job is ever cancelled when the osdc is torn
> > down. That needs to occur or you'll cause a use-after-free oops...
> 
> It is my fault, thanks for the correction.
>
Artur Molchanov Feb. 12, 2017, 3:26 p.m. UTC | #8
On Sun, 2017-02-12 at 15:34, Jeff Layton wrote:
> On Sat, 2017-02-11 at 23:30 +0300, Artur Molchanov wrote:
>> Hi Jeff,
>>
>> On Fri, 2017-02-10 at 14:31, Jeff Layton wrote:
>>> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>>>> From: Artur Molchanov <artur.molchanov@synesis.ru>
>>>>
>>>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>>>> If osd_request_timeout equals to 0 (default value) then do nothing with
>>>> hung requests (keep default behavior).
>>>>
>>>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>>>> osd_request_timeout to 0 by default.
>>>>
>>>
>>> Also, what exactly are the requests blocked on when this occurs? Is the
>>> ceph_osd_request_target ending up paused? I wonder if we might be better
>>> off with something that returns a hard error under the circumstances
>>> where you're hanging, rather than depending on timeouts.
>>
>> I wonder that it is better to complete requests only after timeout expired, just
>> because a request can fail due to temporary network issues (e.g. router
>> restarted) or restarting machine/services.
>>
>>> Having a job that has to wake up every second or so isn't ideal. Perhaps
>>> you would be better off scheduling the delayed work in the request
>>> submission codepath, and only rearm it when the tree isn't empty after
>>> calling complete_osd_stuck_requests?
>>
>> Would you please tell me more about rearming work only if the tree is not empty
>> after calling complete_osd_stuck_requests? From what code we should call
>> complete_osd_stuck_requests?
>>
>
> Sure. I'm saying you would want to call schedule_delayed_work for the
> timeout handler from the request submission path when you link a request
> into the tree that has a timeout. Maybe in __submit_request?
>
> Then, instead of unconditionally calling schedule_delayed_work at the
> end of handle_request_timeout, you'd only call it if there were no
> requests still sitting in the osdc trees.

Thanks for the clarification.
But, unfortunately, requests which are linked to the "homeless" OSD will not be
retried until a new osdmap with a proper OSD is received.

>> As I understood, there are two primary cases:
>> 1 - Requests to OSD failed, but monitors do not return new osdmap (because all
>> monitors are offline or monitors did not update osdmap yet).
>> In this case requests are retried by cyclic calling ceph_con_workfn -> con_fault
>> -> ceph_con_workfn. We can check request timestamp and does not call con_fault
>> but complete it.
>>
>> 2 - Monitors return new osdmap which does not have any OSD for RBD.
>> In this case all requests to the last ready OSD will be linked on "homeless" OSD
>> and will not be retried until new osdmap with appropriate OSD received. I think
>> that we need additional periodic checking timestamp such requests.
>>
>> Yes, there is already existing job handle_timeout. But the responsibility of
>> this job is to sending keepalive requests to slow OSD. I'm not sure that it is a
>> good idea to perform additional actions inside this job.
>> I decided that creating specific job handle_osd_request_timeout is more applicable.
>>
>> This job will be run only once with a default value of osd_request_timeout (0).
>
> Ahh, I missed that -- thanks.
>
>> At the same time, I think that user will not use too small value for this
>> parameter. I wonder that typical value will be about 1 minute or greater.
>>
>>> Also, I don't see where this job is ever cancelled when the osdc is torn
>>> down. That needs to occur or you'll cause a use-after-free oops...
>>
>> It is my fault, thanks for the correction.
>>

Patch

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e..10d8acb 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@  struct ceph_options {
  	unsigned long mount_timeout;		/* jiffies */
  	unsigned long osd_idle_ttl;		/* jiffies */
  	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long osd_request_timeout;	/* jiffies */

  	/*
  	 * any type that can't be simply compared or doesn't need need
@@ -65,9 +66,10 @@  struct ceph_options {
  /*
   * defaults
   */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
-#define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
-#define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_MOUNT_TIMEOUT_DEFAULT		msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT		msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT		msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT	msecs_to_jiffies(0 * 1000)

  #define CEPH_MONC_HUNT_INTERVAL		msecs_to_jiffies(3 * 1000)
  #define CEPH_MONC_PING_INTERVAL		msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653..e45cba0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -279,6 +279,7 @@  struct ceph_osd_client {
  	atomic_t               num_homeless;
  	struct delayed_work    timeout_work;
  	struct delayed_work    osds_timeout_work;
+	struct delayed_work    osd_request_timeout_work;
  #ifdef CONFIG_DEBUG_FS
  	struct dentry 	       *debugfs_file;
  #endif
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e885..81876c8 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@  enum {
  	Opt_osdkeepalivetimeout,
  	Opt_mount_timeout,
  	Opt_osd_idle_ttl,
+	Opt_osd_request_timeout,
  	Opt_last_int,
  	/* int args above */
  	Opt_fsid,
@@ -256,6 +257,7 @@  static match_table_t opt_tokens = {
  	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
  	{Opt_mount_timeout, "mount_timeout=%d"},
  	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	{Opt_osd_request_timeout, "osd_request_timeout=%d"},
  	/* int args above */
  	{Opt_fsid, "fsid=%s"},
  	{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@  ceph_parse_options(char *options, const char *dev_name,
  	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
  	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
  	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;

  	/* get mon ip(s) */
  	/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@  ceph_parse_options(char *options, const char *dev_name,
  			}
  			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
  			break;
+		case Opt_osd_request_timeout:
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("osd_request_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+			break;

  		case Opt_share:
  			opt->flags &= ~CEPH_OPT_NOSHARE;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049..2f5a024 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2554,6 +2554,75 @@  static void handle_timeout(struct work_struct *work)
  			      osdc->client->options->osd_keepalive_timeout);
  }

+/*
+ * Complete request to OSD with error err if it started before cutoff.
+ */
+static void complete_osd_stuck_request(struct ceph_osd_request *req,
+				       unsigned long cutoff,
+				       int err)
+{
+	if (!time_before(req->r_stamp, cutoff))
+		return;
+
+	pr_warn_ratelimited("osd req on osd%d timeout expired\n",
+			    req->r_osd->o_osd);
+
+	complete_request(req, err);
+}
+
+/*
+ * Complete all requests to an OSD that have been active for longer than timeout.
+ */
+static void complete_osd_stuck_requests(struct ceph_osd *osd,
+					unsigned long timeout,
+					int err)
+{
+	struct ceph_osd_request *req, *n;
+	const unsigned long cutoff = jiffies - timeout;
+
+	rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
+		complete_osd_stuck_request(req, cutoff, -EIO);
+	}
+}
+
+/*
+ * Timeout callback, called every N seconds. Any OSD request that
+ * has been active for more than N seconds is completed with error
+ * EIO.
+ */
+static void handle_osd_request_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client,
+			     osd_request_timeout_work.work);
+	struct ceph_osd *osd, *n;
+	struct ceph_options *opts = osdc->client->options;
+	const unsigned long timeout = opts->osd_request_timeout;
+
+	dout("%s osdc %p\n", __func__, osdc);
+
+	if (!timeout)
+		return;
+
+	down_write(&osdc->lock);
+
+	rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
+		complete_osd_stuck_requests(osd, timeout, -EIO);
+	}
+
+	up_write(&osdc->lock);
+
+	if (!atomic_read(&osdc->num_homeless))
+		goto out;
+
+	down_write(&osdc->lock);
+	complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
+	up_write(&osdc->lock);
+
+out:
+	schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
+}
+
  static void handle_osds_timeout(struct work_struct *work)
  {
  	struct ceph_osd_client *osdc =
@@ -4091,6 +4160,7 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
  	osdc->linger_map_checks = RB_ROOT;
  	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
  	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+	INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);

  	err = -ENOMEM;
  	osdc->osdmap = ceph_osdmap_alloc();
@@ -4120,6 +4190,8 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
  			      osdc->client->options->osd_keepalive_timeout);
  	schedule_delayed_work(&osdc->osds_timeout_work,
  	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+	schedule_delayed_work(&osdc->osd_request_timeout_work,
+	    round_jiffies_relative(osdc->client->options->osd_request_timeout));

  	return 0;