
rbd: add timeout function to rbd driver

Message ID 20170727103257.11608-1-xiongwei.jiang@alibaba-inc.com (mailing list archive)
State New, archived

Commit Message

xiongweijiang666@gmail.com July 27, 2017, 10:32 a.m. UTC
From: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com>

When an application is reading from or writing to an rbd device and some
or all OSDs crash, the application hangs and cannot be killed because it
is in D state. Even if the OSDs come back up later, the application may
still be stuck in D state. So we need a timeout mechanism to solve this
problem.

Signed-off-by: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com>
---
 drivers/block/rbd.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

Comments

Ilya Dryomov July 27, 2017, 11:32 a.m. UTC | #1
On Thu, Jul 27, 2017 at 12:32 PM,  <xiongweijiang666@gmail.com> wrote:
> From: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com>
>
> When an application is reading from or writing to an rbd device and some
> or all OSDs crash, the application hangs and cannot be killed because it
> is in D state. Even if the OSDs come back up later, the application may
> still be stuck in D state. So we need a timeout mechanism to solve this
> problem.

Hi Xiongwei,

This shouldn't happen -- when the OSDs come back up, all requests
should be properly resent and completed.  If you have a scenario where
the OSDs come back up into HEALTH_OK and this doesn't happen, that's
likely a bug.

>
> Signed-off-by: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com>
> ---
>  drivers/block/rbd.c | 34 +++++++++++++++++++++++++++++++++-
>  1 file changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index c16f745..33a1c97 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -159,6 +159,13 @@ struct rbd_image_header {
>         u64 *snap_sizes;        /* format 1 only */
>  };
>
> +
> +struct rbd_request_linker {
> +       struct work_struct work;
> +       void *img_request;
> +};
> +
> +
>  /*
>   * An rbd image specification.
>   *
> @@ -4013,6 +4020,7 @@ static void rbd_queue_workfn(struct work_struct *work)
>         struct request *rq = blk_mq_rq_from_pdu(work);
>         struct rbd_device *rbd_dev = rq->q->queuedata;
>         struct rbd_img_request *img_request;
> +       struct rbd_request_linker *linker;
>         struct ceph_snap_context *snapc = NULL;
>         u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
>         u64 length = blk_rq_bytes(rq);
> @@ -4120,6 +4128,7 @@ static void rbd_queue_workfn(struct work_struct *work)
>                 goto err_unlock;
>         }
>         img_request->rq = rq;
> +       linker->img_request = img_request;
>         snapc = NULL; /* img_request consumes a ref */
>
>         if (op_type == OBJ_OP_DISCARD)
> @@ -4358,9 +4367,32 @@ static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
>         return 0;
>  }
>
> +static enum blk_eh_timer_return rbd_request_timeout(struct request *rq,
> +               bool reserved)
> +{
> +       struct rbd_obj_request *obj_request;
> +       struct rbd_obj_request *next_obj_request;
> +       struct rbd_img_request *img_request;
> +       struct rbd_request_linker *linker = blk_mq_rq_to_pdu(rq);
> +
> +       img_request = (struct rbd_img_request *)linker->img_request;
> +       for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
> +               struct ceph_osd_request *osd_req = obj_request->osd_req;
> +
> +               if (!osd_req)
> +                       printk(KERN_INFO "osd_req is null \n");
> +               else
> +                       ceph_osdc_cancel_request(osd_req);
> +       }
> +       return BLK_EH_HANDLED;
> +}
> +
> +
> +
>  static const struct blk_mq_ops rbd_mq_ops = {
>         .queue_rq       = rbd_queue_rq,
>         .init_request   = rbd_init_request,
> +       .timeout       = rbd_request_timeout,

The default blk-mq timeout is 30 seconds.  This means that any "slow"
request (osd_op_complaint_time ceph.conf option) will be timed out,
forcing the mounted filesystem into read-only mode at best.
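
Even if blk-mq timeout handling were wanted here, the handler would
normally be paired with an explicit per-queue timeout rather than the
30-second default.  A hypothetical sketch on top of this patch -- the
value is illustrative and would have to stay well above a typical
osd_op_complaint_time -- placed before blk_mq_alloc_tag_set() in
rbd_init_disk():

	/* hypothetical: raise the blk-mq timeout above the complaint time */
	rbd_dev->tag_set.timeout = 60 * HZ;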

>  };
>
>  static int rbd_init_disk(struct rbd_device *rbd_dev)
> @@ -4392,7 +4424,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
>         rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
>         rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
>         rbd_dev->tag_set.nr_hw_queues = 1;
> -       rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
> +       rbd_dev->tag_set.cmd_size = sizeof(struct rbd_request_linker);
>
>         err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
>         if (err)

We already have such a timeout at the libceph level -- osd_request_timeout
(similar to the rados_osd_op_timeout ceph.conf option).  Note that, just
like rados_osd_op_timeout in userspace, osd_request_timeout is disabled
by default and is NOT recommended for use with librbd/krbd.
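
For anyone who wants a timeout despite that, the existing knob can be
enabled per mapping at map time, e.g. (illustrative value, in seconds):

	rbd map -o osd_request_timeout=60 <pool>/<image>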

Thanks,

                Ilya
kernel test robot July 28, 2017, 1:27 a.m. UTC | #2
Hi Xiongwei,

[auto build test WARNING on ceph-client/for-linus]
[also build test WARNING on v4.13-rc2 next-20170727]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/xiongweijiang666-gmail-com/rbd-add-timeout-function-to-rbd-driver/20170728-085544
base:   https://github.com/ceph/ceph-client.git for-linus
config: i386-randconfig-x071-201730 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   drivers/block/rbd.c: In function 'rbd_queue_workfn':
>> drivers/block/rbd.c:4135:22: warning: 'linker' may be used uninitialized in this function [-Wmaybe-uninitialized]
     linker->img_request = img_request;
     ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~

vim +/linker +4135 drivers/block/rbd.c

  4021	
  4022	static void rbd_queue_workfn(struct work_struct *work)
  4023	{
  4024		struct request *rq = blk_mq_rq_from_pdu(work);
  4025		struct rbd_device *rbd_dev = rq->q->queuedata;
  4026		struct rbd_img_request *img_request;
  4027		struct rbd_request_linker *linker;
  4028		struct ceph_snap_context *snapc = NULL;
  4029		u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
  4030		u64 length = blk_rq_bytes(rq);
  4031		enum obj_operation_type op_type;
  4032		u64 mapping_size;
  4033		bool must_be_locked;
  4034		int result;
  4035	
  4036		switch (req_op(rq)) {
  4037		case REQ_OP_DISCARD:
  4038		case REQ_OP_WRITE_ZEROES:
  4039			op_type = OBJ_OP_DISCARD;
  4040			break;
  4041		case REQ_OP_WRITE:
  4042			op_type = OBJ_OP_WRITE;
  4043			break;
  4044		case REQ_OP_READ:
  4045			op_type = OBJ_OP_READ;
  4046			break;
  4047		default:
  4048			dout("%s: non-fs request type %d\n", __func__, req_op(rq));
  4049			result = -EIO;
  4050			goto err;
  4051		}
  4052	
  4053		/* Ignore/skip any zero-length requests */
  4054	
  4055		if (!length) {
  4056			dout("%s: zero-length request\n", __func__);
  4057			result = 0;
  4058			goto err_rq;
  4059		}
  4060	
  4061		/* Only reads are allowed to a read-only device */
  4062	
  4063		if (op_type != OBJ_OP_READ) {
  4064			if (rbd_dev->mapping.read_only) {
  4065				result = -EROFS;
  4066				goto err_rq;
  4067			}
  4068			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
  4069		}
  4070	
  4071		/*
  4072		 * Quit early if the mapped snapshot no longer exists.  It's
  4073		 * still possible the snapshot will have disappeared by the
  4074		 * time our request arrives at the osd, but there's no sense in
  4075		 * sending it if we already know.
  4076		 */
  4077		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
  4078			dout("request for non-existent snapshot");
  4079			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
  4080			result = -ENXIO;
  4081			goto err_rq;
  4082		}
  4083	
  4084		if (offset && length > U64_MAX - offset + 1) {
  4085			rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
  4086				 length);
  4087			result = -EINVAL;
  4088			goto err_rq;	/* Shouldn't happen */
  4089		}
  4090	
  4091		blk_mq_start_request(rq);
  4092	
  4093		down_read(&rbd_dev->header_rwsem);
  4094		mapping_size = rbd_dev->mapping.size;
  4095		if (op_type != OBJ_OP_READ) {
  4096			snapc = rbd_dev->header.snapc;
  4097			ceph_get_snap_context(snapc);
  4098		}
  4099		up_read(&rbd_dev->header_rwsem);
  4100	
  4101		if (offset + length > mapping_size) {
  4102			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
  4103				 length, mapping_size);
  4104			result = -EIO;
  4105			goto err_rq;
  4106		}
  4107	
  4108		must_be_locked =
  4109		    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
  4110		    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
  4111		if (must_be_locked) {
  4112			down_read(&rbd_dev->lock_rwsem);
  4113			if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
  4114			    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
  4115				if (rbd_dev->opts->exclusive) {
  4116					rbd_warn(rbd_dev, "exclusive lock required");
  4117					result = -EROFS;
  4118					goto err_unlock;
  4119				}
  4120				rbd_wait_state_locked(rbd_dev);
  4121			}
  4122			if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
  4123				result = -EBLACKLISTED;
  4124				goto err_unlock;
  4125			}
  4126		}
  4127	
  4128		img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
  4129						     snapc);
  4130		if (!img_request) {
  4131			result = -ENOMEM;
  4132			goto err_unlock;
  4133		}
  4134		img_request->rq = rq;
> 4135		linker->img_request = img_request;
  4136		snapc = NULL; /* img_request consumes a ref */
  4137	
  4138		if (op_type == OBJ_OP_DISCARD)
  4139			result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
  4140						      NULL);
  4141		else
  4142			result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
  4143						      rq->bio);
  4144		if (result)
  4145			goto err_img_request;
  4146	
  4147		result = rbd_img_request_submit(img_request);
  4148		if (result)
  4149			goto err_img_request;
  4150	
  4151		if (must_be_locked)
  4152			up_read(&rbd_dev->lock_rwsem);
  4153		return;
  4154	
  4155	err_img_request:
  4156		rbd_img_request_put(img_request);
  4157	err_unlock:
  4158		if (must_be_locked)
  4159			up_read(&rbd_dev->lock_rwsem);
  4160	err_rq:
  4161		if (result)
  4162			rbd_warn(rbd_dev, "%s %llx at %llx result %d",
  4163				 obj_op_name(op_type), length, offset, result);
  4164		ceph_put_snap_context(snapc);
  4165	err:
  4166		blk_mq_end_request(rq, errno_to_blk_status(result));
  4167	}
  4168	
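
In this case the warning looks like a true positive rather than a false
one: "linker" is declared at line 4027 but never assigned before the
dereference at line 4135.  A minimal fix sketch, assuming struct
rbd_request_linker stays the request pdu with its work_struct embedded:

	struct rbd_request_linker *linker =
		container_of(work, struct rbd_request_linker, work);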


Patch

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c16f745..33a1c97 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -159,6 +159,13 @@ struct rbd_image_header {
 	u64 *snap_sizes;	/* format 1 only */
 };
 
+
+struct rbd_request_linker {
+	struct work_struct work;
+	void *img_request;
+};
+
+
 /*
  * An rbd image specification.
  *
@@ -4013,6 +4020,7 @@ static void rbd_queue_workfn(struct work_struct *work)
 	struct request *rq = blk_mq_rq_from_pdu(work);
 	struct rbd_device *rbd_dev = rq->q->queuedata;
 	struct rbd_img_request *img_request;
+	struct rbd_request_linker *linker;
 	struct ceph_snap_context *snapc = NULL;
 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
 	u64 length = blk_rq_bytes(rq);
@@ -4120,6 +4128,7 @@ static void rbd_queue_workfn(struct work_struct *work)
 		goto err_unlock;
 	}
 	img_request->rq = rq;
+	linker->img_request = img_request;
 	snapc = NULL; /* img_request consumes a ref */
 
 	if (op_type == OBJ_OP_DISCARD)
@@ -4358,9 +4367,32 @@ static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
 	return 0;
 }
 
+static enum blk_eh_timer_return rbd_request_timeout(struct request *rq,
+		bool reserved)
+{
+	struct rbd_obj_request *obj_request;
+	struct rbd_obj_request *next_obj_request;
+	struct rbd_img_request *img_request;
+	struct rbd_request_linker *linker = blk_mq_rq_to_pdu(rq);
+
+	img_request = (struct rbd_img_request *)linker->img_request;
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
+		struct ceph_osd_request *osd_req = obj_request->osd_req;
+
+		if (!osd_req)
+			printk(KERN_INFO "osd_req is null \n");
+		else
+			ceph_osdc_cancel_request(osd_req);
+	}
+	return BLK_EH_HANDLED;
+}
+
+
+
 static const struct blk_mq_ops rbd_mq_ops = {
 	.queue_rq	= rbd_queue_rq,
 	.init_request	= rbd_init_request,
+	.timeout       = rbd_request_timeout,
 };
 
 static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -4392,7 +4424,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	rbd_dev->tag_set.nr_hw_queues = 1;
-	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_request_linker);
 
 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
 	if (err)