Message ID | 20170727103257.11608-1-xiongwei.jiang@alibaba-inc.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Jul 27, 2017 at 12:32 PM, <xiongweijiang666@gmail.com> wrote: > From: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com> > > when an application is writing or reading on rbd device, if some or all > OSDs crash, the application will hang and can't be killed because it is > in D state. Even though OSDs comes up later, the application may still > keeps in D state. So we need a timeout mechanism to solve this problem. Hi Xiongwei, This shouldn't happen -- when the OSDs come back up, all requests should be properly resent and completed. If you have a scenario where the OSDs come back up into HEALTH_OK and this doesn't happen, that's likely a bug. > > Signed-off-by: Xiongwei Jiang <xiongwei.jiang@alibaba-inc.com> > --- > drivers/block/rbd.c | 34 +++++++++++++++++++++++++++++++++- > 1 file changed, 33 insertions(+), 1 deletion(-) > > diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c > index c16f745..33a1c97 100644 > --- a/drivers/block/rbd.c > +++ b/drivers/block/rbd.c > @@ -159,6 +159,13 @@ struct rbd_image_header { > u64 *snap_sizes; /* format 1 only */ > }; > > + > +struct rbd_request_linker { > + struct work_struct work; > + void *img_request; > +}; > + > + > /* > * An rbd image specification. > * > @@ -4013,6 +4020,7 @@ static void rbd_queue_workfn(struct work_struct *work) > struct request *rq = blk_mq_rq_from_pdu(work); > struct rbd_device *rbd_dev = rq->q->queuedata; > struct rbd_img_request *img_request; > + struct rbd_request_linker *linker; > struct ceph_snap_context *snapc = NULL; > u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; > u64 length = blk_rq_bytes(rq); > @@ -4120,6 +4128,7 @@ static void rbd_queue_workfn(struct work_struct *work) > goto err_unlock; > } > img_request->rq = rq; > + linker->img_request = img_request; > snapc = NULL; /* img_request consumes a ref */ > > if (op_type == OBJ_OP_DISCARD) > @@ -4358,9 +4367,32 @@ static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, > return 0; > } > > +static enum blk_eh_timer_return rbd_request_timeout(struct request *rq, > + bool reserved) > +{ > + struct rbd_obj_request *obj_request; > + struct rbd_obj_request *next_obj_request; > + struct rbd_img_request *img_request; > + struct rbd_request_linker *linker = blk_mq_rq_to_pdu(rq); > + > + img_request = (struct rbd_img_request *)linker->img_request; > + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { > + struct ceph_osd_request *osd_req = obj_request->osd_req; > + > + if (!osd_req) > + printk(KERN_INFO "osd_req is null \n"); > + else > + ceph_osdc_cancel_request(osd_req); > + } > + return BLK_EH_HANDLED; > +} > + > + > + > static const struct blk_mq_ops rbd_mq_ops = { > .queue_rq = rbd_queue_rq, > .init_request = rbd_init_request, > + .timeout = rbd_request_timeout, The default blk-mq timeout is 30 seconds. This means that any "slow" request (osd_op_complaint_time ceph.conf option) will be timed out, forcing the mounted filesystem into read-only mode at best. > }; > > static int rbd_init_disk(struct rbd_device *rbd_dev) > @@ -4392,7 +4424,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) > rbd_dev->tag_set.numa_node = NUMA_NO_NODE; > rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; > rbd_dev->tag_set.nr_hw_queues = 1; > - rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); > + rbd_dev->tag_set.cmd_size = sizeof(struct rbd_request_linker); > > err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); > if (err) We already have such a timeout at libceph level -- osd_request_timeout (similar to rados_osd_op_timeout ceph.conf option). Note that just as rados_osd_op_timeout in userspace, osd_request_timeout is disabled by default and is NOT recommended for use with librbd/krbd. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Xiongwei, [auto build test WARNING on ceph-client/for-linus] [also build test WARNING on v4.13-rc2 next-20170727] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/xiongweijiang666-gmail-com/rbd-add-timeout-function-to-rbd-driver/20170728-085544 base: https://github.com/ceph/ceph-client.git for-linus config: i386-randconfig-x071-201730 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 Note: it may well be a FALSE warning. FWIW you are at least aware of it now. http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings All warnings (new ones prefixed by >>): drivers/block/rbd.c: In function 'rbd_queue_workfn': >> drivers/block/rbd.c:4135:22: warning: 'linker' may be used uninitialized in this function [-Wmaybe-uninitialized] linker->img_request = img_request; ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~ vim +/linker +4135 drivers/block/rbd.c 4021 4022 static void rbd_queue_workfn(struct work_struct *work) 4023 { 4024 struct request *rq = blk_mq_rq_from_pdu(work); 4025 struct rbd_device *rbd_dev = rq->q->queuedata; 4026 struct rbd_img_request *img_request; 4027 struct rbd_request_linker *linker; 4028 struct ceph_snap_context *snapc = NULL; 4029 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4030 u64 length = blk_rq_bytes(rq); 4031 enum obj_operation_type op_type; 4032 u64 mapping_size; 4033 bool must_be_locked; 4034 int result; 4035 4036 switch (req_op(rq)) { 4037 case REQ_OP_DISCARD: 4038 case REQ_OP_WRITE_ZEROES: 4039 op_type = OBJ_OP_DISCARD; 4040 break; 4041 case REQ_OP_WRITE: 4042 op_type = OBJ_OP_WRITE; 4043 break; 4044 case REQ_OP_READ: 4045 op_type = OBJ_OP_READ; 4046 break; 4047 default: 4048 dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 4049 result = -EIO; 4050 goto err; 4051 } 4052 4053 /* Ignore/skip any zero-length requests */ 4054 4055 if (!length) { 4056 dout("%s: zero-length request\n", __func__); 4057 result = 0; 4058 goto err_rq; 4059 } 4060 4061 /* Only reads are allowed to a read-only device */ 4062 4063 if (op_type != OBJ_OP_READ) { 4064 if (rbd_dev->mapping.read_only) { 4065 result = -EROFS; 4066 goto err_rq; 4067 } 4068 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4069 } 4070 4071 /* 4072 * Quit early if the mapped snapshot no longer exists. It's 4073 * still possible the snapshot will have disappeared by the 4074 * time our request arrives at the osd, but there's no sense in 4075 * sending it if we already know. 4076 */ 4077 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4078 dout("request for non-existent snapshot"); 4079 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4080 result = -ENXIO; 4081 goto err_rq; 4082 } 4083 4084 if (offset && length > U64_MAX - offset + 1) { 4085 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4086 length); 4087 result = -EINVAL; 4088 goto err_rq; /* Shouldn't happen */ 4089 } 4090 4091 blk_mq_start_request(rq); 4092 4093 down_read(&rbd_dev->header_rwsem); 4094 mapping_size = rbd_dev->mapping.size; 4095 if (op_type != OBJ_OP_READ) { 4096 snapc = rbd_dev->header.snapc; 4097 ceph_get_snap_context(snapc); 4098 } 4099 up_read(&rbd_dev->header_rwsem); 4100 4101 if (offset + length > mapping_size) { 4102 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 4103 length, mapping_size); 4104 result = -EIO; 4105 goto err_rq; 4106 } 4107 4108 must_be_locked = 4109 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 4110 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); 4111 if (must_be_locked) { 4112 down_read(&rbd_dev->lock_rwsem); 4113 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 4114 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 4115 if (rbd_dev->opts->exclusive) { 4116 rbd_warn(rbd_dev, "exclusive lock required"); 4117 result = -EROFS; 4118 goto err_unlock; 4119 } 4120 rbd_wait_state_locked(rbd_dev); 4121 } 4122 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 4123 result = -EBLACKLISTED; 4124 goto err_unlock; 4125 } 4126 } 4127 4128 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 4129 snapc); 4130 if (!img_request) { 4131 result = -ENOMEM; 4132 goto err_unlock; 4133 } 4134 img_request->rq = rq; > 4135 linker->img_request = img_request; 4136 snapc = NULL; /* img_request consumes a ref */ 4137 4138 if (op_type == OBJ_OP_DISCARD) 4139 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 4140 NULL); 4141 else 4142 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 4143 rq->bio); 4144 if (result) 4145 goto err_img_request; 4146 4147 result = rbd_img_request_submit(img_request); 4148 if (result) 4149 goto err_img_request; 4150 4151 if (must_be_locked) 4152 up_read(&rbd_dev->lock_rwsem); 4153 return; 4154 4155 err_img_request: 4156 rbd_img_request_put(img_request); 4157 err_unlock: 4158 if (must_be_locked) 4159 up_read(&rbd_dev->lock_rwsem); 4160 err_rq: 4161 if (result) 4162 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 4163 obj_op_name(op_type), length, offset, result); 4164 ceph_put_snap_context(snapc); 4165 err: 4166 blk_mq_end_request(rq, errno_to_blk_status(result)); 4167 } 4168 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c16f745..33a1c97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -159,6 +159,13 @@ struct rbd_image_header { u64 *snap_sizes; /* format 1 only */ }; + +struct rbd_request_linker { + struct work_struct work; + void *img_request; +}; + + /* * An rbd image specification. * @@ -4013,6 +4020,7 @@ static void rbd_queue_workfn(struct work_struct *work) struct request *rq = blk_mq_rq_from_pdu(work); struct rbd_device *rbd_dev = rq->q->queuedata; struct rbd_img_request *img_request; + struct rbd_request_linker *linker; struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; u64 length = blk_rq_bytes(rq); @@ -4120,6 +4128,7 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_unlock; } img_request->rq = rq; + linker->img_request = img_request; snapc = NULL; /* img_request consumes a ref */ if (op_type == OBJ_OP_DISCARD) @@ -4358,9 +4367,32 @@ static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; } +static enum blk_eh_timer_return rbd_request_timeout(struct request *rq, + bool reserved) +{ + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + struct rbd_img_request *img_request; + struct rbd_request_linker *linker = blk_mq_rq_to_pdu(rq); + + img_request = (struct rbd_img_request *)linker->img_request; + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { + struct ceph_osd_request *osd_req = obj_request->osd_req; + + if (!osd_req) + printk(KERN_INFO "osd_req is null \n"); + else + ceph_osdc_cancel_request(osd_req); + } + return BLK_EH_HANDLED; +} + + + static const struct blk_mq_ops rbd_mq_ops = { .queue_rq = rbd_queue_rq, .init_request = rbd_init_request, + .timeout = rbd_request_timeout, }; static int rbd_init_disk(struct rbd_device *rbd_dev) @@ -4392,7 +4424,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->tag_set.numa_node = NUMA_NO_NODE; rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; rbd_dev->tag_set.nr_hw_queues = 1; - rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); + rbd_dev->tag_set.cmd_size = sizeof(struct rbd_request_linker); err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); if (err)