
[v4] blk-mq: Fix race conditions in request timeout handling

Message ID 20180412133529.GU793541@devbig577.frc2.facebook.com (mailing list archive)
State New, archived

Commit Message

Tejun Heo April 12, 2018, 1:35 p.m. UTC
Hello, Israel.

On Thu, Apr 12, 2018 at 11:59:11AM +0300, Israel Rukshin wrote:
> On 4/12/2018 12:31 AM, tj@kernel.org wrote:
> >Hey, again.
> >
> >On Wed, Apr 11, 2018 at 10:07:33AM -0700, tj@kernel.org wrote:
> >>Hello, Israel.
> >>
> >>On Wed, Apr 11, 2018 at 07:16:14PM +0300, Israel Rukshin wrote:
> >>>>Just noticed this one, this looks interesting to me as well. Israel,
> >>>>can you run your test with this patch?
> >>>Yes, I just did and it looks good.
> >>Awesome.
> >Just to be sure, you tested the combined patch and saw the XXX debug
> >messages, right?
> 
> I am not sure I understand.
> What is the combined patch?
> What are the debug messages that I need to see?

There are a total of three patches and I posted the combined patch + a
debug message in another post.  Can you please test the following
patch?  It'll print out something like the following (possibly many of
them).

 XXX blk_mq_timeout_reset_cleanup(): executed %d missed completions
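
To map that message to the mechanism: the handshake being tested can be
modeled in a few lines of standalone C.  This is a simplified sketch
that borrows names from the patch below; complete_request() and
timeout_reset_cleanup() here are illustrative stand-ins, not the kernel
functions.  The completion side skips any request whose generation the
timeout path has aborted and records the attempt, and the timeout path
replays it once that is safe again.

  #include <stdbool.h>
  #include <stdio.h>

  struct request {
          unsigned long gstate;           /* generation + state */
          unsigned long aborted_gstate;   /* set by the timeout path */
          bool missed_completion;         /* completion raced timeout */
  };

  /* completion side, as in blk_mq_complete_request() below */
  static void complete_request(struct request *rq)
  {
          if (rq->aborted_gstate != rq->gstate)
                  printf("completed normally\n");
          else
                  rq->missed_completion = true;   /* timeout owns it */
  }

  /* timeout side, as in blk_mq_timeout_reset_cleanup() below */
  static void timeout_reset_cleanup(struct request *rq)
  {
          if (rq->missed_completion) {
                  rq->missed_completion = false;
                  printf("executed 1 missed completion\n");
          }
  }

  int main(void)
  {
          struct request rq = { .gstate = 1, .aborted_gstate = 1 };

          complete_request(&rq);          /* lost the race, recorded only */
          timeout_reset_cleanup(&rq);     /* replayed after the RCU syncs */
          return 0;
  }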

Thanks.

Comments

Israel Rukshin April 15, 2018, 12:28 p.m. UTC | #1
Hi,

On 4/12/2018 4:35 PM, tj@kernel.org wrote:
> Hello, Israel.
>
> On Thu, Apr 12, 2018 at 11:59:11AM +0300, Israel Rukshin wrote:
>> On 4/12/2018 12:31 AM, tj@kernel.org wrote:
>>> Hey, again.
>>>
>>> On Wed, Apr 11, 2018 at 10:07:33AM -0700, tj@kernel.org wrote:
>>>> Hello, Israel.
>>>>
>>>> On Wed, Apr 11, 2018 at 07:16:14PM +0300, Israel Rukshin wrote:
>>>>>> Just noticed this one, this looks interesting to me as well. Israel,
>>>>>> can you run your test with this patch?
>>>>> Yes, I just did and it looks good.
>>>> Awesome.
>>> Just to be sure, you tested the combined patch and saw the XXX debug
>>> messages, right?
>> I am not sure I understand.
>> What is the combined patch?
>> What are the debug messages that I need to see?
> There are a total of three patches and I posted the combined patch + a
> debug message in another post.  Can you please test the following
> patch?  It'll print out something like the following (possibly many of
> them).

I tested this combined patch with your debug print and I got the 
following call trace:

Apr 15 11:42:35 nvme nvme1: I/O 55 QID 10 timeout, reset controller
Apr 15 11:42:35 XXX blk_mq_timeout_reset_cleanup(): executed 1 missed completions
Apr 15 11:42:35 XXX blk_mq_timeout_reset_cleanup(): executed 1 missed completions
Apr 15 11:42:35 XXX blk_mq_timeout_reset_cleanup(): executed 1 missed completions
Apr 15 11:42:35 WARNING: CPU: 1 PID: 648 at block/blk-mq.c:534 __blk_mq_complete_request+0x154/0x160
Apr 15 11:42:35 Modules linked in: nvme_rdma rdma_cm iw_cm ib_cm nvme_fabrics nfsv3 rpcsec_gss_krb5 nfsv4 nfs fscache iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi netconsole mlx4_ib ib_core mlx4_en mlx4_core xfs libcrc32c dm_mirror dm_region_hash dm_log dm_multipath scsi_dh_rdac scsi_dh_emc dm_mod dax scsi_dh_alua x86_pkg_temp_thermal coretemp kvm_intel kvm irqbypass ghash_clmulni_intel pcbc iTCO_wdt aesni_intel nfsd iTCO_vendor_support aes_x86_64 ipmi_si crypto_simd lpc_ich cryptd ipmi_devintf shpchp wmi acpi_pad glue_helper ipmi_msghandler pcspkr sg i2c_i801 mfd_core ioatdma auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod igb i2c_algo_bit ahci i2c_core xhci_pci crc32c_intel xhci_hcd libahci dca configfs nvme nvme_core ipv6 crc_ccitt autofs4 [last unloaded: nvme_fabrics]
Apr 15 11:42:35 CPU: 1 PID: 648 Comm: kworker/1:1H Not tainted 4.16.0+ #8
Apr 15 11:42:35 Hardware name: Supermicro SYS-6018R-WTR/X10DRW-i, BIOS 2.0 12/17/2015
Apr 15 11:42:35 Workqueue: kblockd blk_mq_timeout_work
Apr 15 11:42:35 RIP: 0010:__blk_mq_complete_request+0x154/0x160
Apr 15 11:42:35 RSP: 0018:ffffc9000407bd68 EFLAGS: 00010297
Apr 15 11:42:35 RAX: 0000000000000000 RBX: ffff8808310236c0 RCX: 0000000000000009
Apr 15 11:42:35 RDX: 0000000000000009 RSI: 0000000000000000 RDI: ffff8808310236c0
Apr 15 11:42:35 RBP: ffffe8ffffae12c0 R08: 0000000000000000 R09: 00000000000004ea
Apr 15 11:42:35 R10: 000000000000006c R11: ffffc90003dfda80 R12: 0000000000000000
Apr 15 11:42:35 R13: 0000000000000010 R14: 0000000000000010 R15: 0000000000000000
Apr 15 11:42:35 FS:  0000000000000000(0000) GS:ffff88047fa40000(0000) knlGS:0000000000000000
Apr 15 11:42:35 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Apr 15 11:42:35 CR2: 00007f6ab2db3140 CR3: 0000000001e0a006 CR4: 00000000001606e0
Apr 15 11:42:35 Call Trace:
Apr 15 11:42:35  blk_mq_complete_request+0x4b/0x90
Apr 15 11:42:35  blk_mq_timeout_reset_cleanup+0x27/0x40
Apr 15 11:42:35  bt_iter+0x43/0x50
Apr 15 11:42:35  blk_mq_queue_tag_busy_iter+0xfb/0x230
Apr 15 11:42:35  ? blk_mq_complete_request+0x90/0x90
Apr 15 11:42:35  ? blk_mq_complete_request+0x90/0x90
Apr 15 11:42:35  ? __call_rcu.constprop.72+0x170/0x1c0
Apr 15 11:42:35  blk_mq_timeout_work+0x191/0x1f0
Apr 15 11:42:35  process_one_work+0x140/0x2a0
Apr 15 11:42:35  worker_thread+0x3f/0x3c0
Apr 15 11:42:35  kthread+0xeb/0x120
Apr 15 11:42:35  ? process_one_work+0x2a0/0x2a0
Apr 15 11:42:35  ? kthread_bind+0x10/0x10
Apr 15 11:42:35  ? SyS_exit_group+0xb/0x10
Apr 15 11:42:35  ret_from_fork+0x35/0x40
Apr 15 11:42:35 Code: 00 00 00 00 00 8b 7d 40 e8 0a 20 e7 ff e9 6e ff ff ff 48 8b 35 1e ca ba 00 48 83 c7 10 48 83 c6 64 e8 71 ee e5 ff e9 77 ff ff ff <0f> 0b e9 c3 fe ff ff 0f 1f 44 00 00 41 54 45 31 e4 55 53 48 8b
Apr 15 11:42:35 ---[ end trace fbf397c4b27ea0b8 ]---
Apr 15 11:42:35 XXX blk_mq_timeout_reset_cleanup(): executed 1 missed completions
Apr 15 11:42:35 BUG: unable to handle kernel NULL pointer dereference at 0000000000000018

Regards,
Israel

Patch
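
The ordering the patch below imposes in blk_mq_timeout_work() can be
condensed into the following stubbed C sketch.  This is an illustration
only: sync_rcu() and iterate() are printf stand-ins for the real
synchronize_rcu()/synchronize_srcu() grace periods and the
blk_mq_queue_tag_busy_iter() passes in the diff.

  #include <stdio.h>

  static void sync_rcu(const char *why) { printf("grace period: %s\n", why); }
  static void iterate(const char *cb)   { printf("busy iter:    %s\n", cb); }

  int main(void)
  {
          /* pretend one request expired and its driver returned
             BLK_EH_RESET_TIMER */
          int nr_expired = 1, nr_resets = 1;

          if (nr_expired) {
                  /* wait till everyone sees ->aborted_gstate */
                  sync_rcu("->aborted_gstate visible");
                  iterate("blk_mq_terminate_expired");

                  if (nr_resets) {
                          /* wait till blk_add_timer()'s new deadline
                             is visible */
                          sync_rcu("timer reset visible");
                          /* reopen normal completions */
                          iterate("blk_mq_timeout_reset_return");
                          /* wait till the cleared ->aborted_gstate
                             is visible */
                          sync_rcu("->aborted_gstate cleared");
                          /* replay completions that were missed */
                          iterate("blk_mq_timeout_reset_cleanup");
                  }
          }
          return 0;
  }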

Index: work/block/blk-mq.c
===================================================================
--- work.orig/block/blk-mq.c
+++ work/block/blk-mq.c
@@ -642,6 +642,8 @@  void blk_mq_complete_request(struct requ
 	hctx_lock(hctx, &srcu_idx);
 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
 		__blk_mq_complete_request(rq);
+	else
+		rq->missed_completion = true;
 	hctx_unlock(hctx, srcu_idx);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
@@ -684,6 +686,7 @@  void blk_mq_start_request(struct request
 
 	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 	blk_add_timer(rq);
+	rq->missed_completion = false;
 
 	write_seqcount_end(&rq->gstate_seq);
 	preempt_enable();
@@ -818,7 +821,8 @@  struct blk_mq_timeout_data {
 	unsigned int nr_expired;
 };
 
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct blk_mq_hw_ctx *hctx, struct request *req,
+				int *nr_resets, bool reserved)
 {
 	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -833,13 +837,10 @@  static void blk_mq_rq_timed_out(struct r
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
-		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
+		req->rq_flags |= RQF_MQ_TIMEOUT_RESET;
+		(*nr_resets)++;
+		hctx->need_sync_rcu = true;
 		break;
 	case BLK_EH_NOT_HANDLED:
 		break;
@@ -876,13 +877,35 @@  static void blk_mq_check_expired(struct
 	    time_after_eq(jiffies, deadline)) {
 		blk_mq_rq_update_aborted_gstate(rq, gstate);
 		data->nr_expired++;
-		hctx->nr_expired++;
+		hctx->need_sync_rcu = true;
 	} else if (!data->next_set || time_after(data->next, deadline)) {
 		data->next = deadline;
 		data->next_set = 1;
 	}
 }
 
+static void blk_mq_timeout_sync_rcu(struct request_queue *q, bool clear)
+{
+	struct blk_mq_hw_ctx *hctx;
+	bool has_rcu = false;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx->need_sync_rcu)
+			continue;
+
+		if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+			has_rcu = true;
+		else
+			synchronize_srcu(hctx->srcu);
+
+		if (clear)
+			hctx->need_sync_rcu = false;
+	}
+	if (has_rcu)
+		synchronize_rcu();
+}
+
 static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
@@ -895,7 +918,45 @@  static void blk_mq_terminate_expired(str
 	 */
 	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
 	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
-		blk_mq_rq_timed_out(rq, reserved);
+		blk_mq_rq_timed_out(hctx, rq, priv, reserved);
+}
+
+static void blk_mq_timeout_reset_return(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	/*
+	 * @rq's timer reset has gone through rcu synchronization and is
+	 * visible now.  Allow normal completions again by resetting
+	 * ->aborted_gstate.  Don't clear RQF_MQ_TIMEOUT_RESET here as
+	 * blk_mq_timeout_reset_cleanup() needs it again and there's no
+	 * memory ordering around ->aborted_gstate making it the only field
+	 * safe to update.  Let blk_add_timer() clear it later when the
+	 * request is recycled or times out again.
+	 */
+	if (rq->rq_flags & RQF_MQ_TIMEOUT_RESET)
+		blk_mq_rq_update_aborted_gstate(rq, 0);
+}
+
+static void blk_mq_timeout_reset_cleanup(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	int cnt = 0;
+
+	/*
+	 * @rq is now fully returned to the normal path.  If normal
+	 * completion raced timeout handling, execute the missed completion
+	 * here.  This is safe because 1. ->missed_completion can no longer
+	 * be asserted because nothing is timing out right now and 2. if
+	 * ->missed_completion is set, @rq is safe from recycling because
+	 * nobody could have completed it.
+	 */
+	if ((rq->rq_flags & RQF_MQ_TIMEOUT_RESET) && rq->missed_completion) {
+		blk_mq_complete_request(rq);
+		cnt++;
+	}
+
+	printk("XXX blk_mq_timeout_reset_cleanup(): executed %d missed completions\n",
+	       cnt);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
@@ -930,7 +991,7 @@  static void blk_mq_timeout_work(struct w
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
 	if (data.nr_expired) {
-		bool has_rcu = false;
+		int nr_resets = 0;
 
 		/*
 		 * Wait till everyone sees ->aborted_gstate.  The
@@ -938,22 +999,25 @@  static void blk_mq_timeout_work(struct w
 		 * becomes a problem, we can add per-hw_ctx rcu_head and
 		 * wait in parallel.
 		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
+		blk_mq_timeout_sync_rcu(q, true);
 
 		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired,
+					   &nr_resets);
+
+		/*
+		 * For BLK_EH_RESET_TIMER, release the requests after
+		 * blk_add_timer() from above is visible to avoid timer
+		 * reset racing against recycling.
+		 */
+		if (nr_resets) {
+			blk_mq_timeout_sync_rcu(q, false);
+			blk_mq_queue_tag_busy_iter(q,
+					blk_mq_timeout_reset_return, NULL);
+			blk_mq_timeout_sync_rcu(q, true);
+			blk_mq_queue_tag_busy_iter(q,
+					blk_mq_timeout_reset_cleanup, NULL);
+		}
 	}
 
 	if (data.next_set) {
Index: work/include/linux/blk-mq.h
===================================================================
--- work.orig/include/linux/blk-mq.h
+++ work/include/linux/blk-mq.h
@@ -51,7 +51,7 @@  struct blk_mq_hw_ctx {
 	unsigned int		queue_num;
 
 	atomic_t		nr_active;
-	unsigned int		nr_expired;
+	bool			need_sync_rcu;
 
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
Index: work/block/blk-timeout.c
===================================================================
--- work.orig/block/blk-timeout.c
+++ work/block/blk-timeout.c
@@ -216,7 +216,7 @@  void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
+	req->rq_flags &= ~(RQF_MQ_TIMEOUT_EXPIRED | RQF_MQ_TIMEOUT_RESET);
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -127,8 +127,10 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 /* timeout is expired */
 #define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
+/* timer got reset after the timeout expired */
+#define RQF_MQ_TIMEOUT_RESET	((__force req_flags_t)(1 << 21))
 /* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 22))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -225,6 +227,8 @@  struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
+	bool missed_completion;
+
 	/*
 	 * On blk-mq, the lower bits of ->gstate (generation number and
 	 * state) carry the MQ_RQ_* state value and the upper bits the