diff mbox

null_blk: add 'requeue' fault attribute

Message ID 1317b3c8-0f6e-a229-414a-de81bb69be2f@kernel.dk (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe Feb. 28, 2018, 3:28 p.m. UTC
On 2/28/18 1:51 AM, Omar Sandoval wrote:
> On Tue, Feb 27, 2018 at 03:34:53PM -0700, Jens Axboe wrote:
>> Similarly to the support we have for testing/faking timeouts for
>> null_blk, this adds support for triggering a requeue condition.
>> Considering the issues around restart we've been seeing, this should be
>> a useful addition to the testing arsenal to ensure that we are handling
>> requeue conditions correctly.
>>
>> This works for queue mode 1 (legacy request_fn based path) and 2 (blk-mq
>> path), as there's no good way to do requeue with a bio based driver.
>> This is similar to the timeout path.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>
>> ---
>>
>>  null_blk.c |   55 +++++++++++++++++++++++++++++++++++++++++++------------
>>  1 file changed, 43 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
>> index 287a09611c0f..363536572e19 100644
>> --- a/drivers/block/null_blk.c
>> +++ b/drivers/block/null_blk.c
> 
> [snip]
> 
>> @@ -1422,10 +1440,12 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  
>>  	blk_mq_start_request(bd->rq);
>>  
>> -	if (!should_timeout_request(bd->rq))
>> -		return null_handle_cmd(cmd);
>> +	if (should_requeue_request(bd->rq))
>> +		return BLK_STS_RESOURCE;
> 
> Hm, this goes through the less interesting requeue path, add to the
> dispatch list and __blk_mq_requeue_request(). blk_mq_requeue_request()
> is the one that I wanted to test since that's the one that needs to call
> the scheduler hook.

Until recently, it would have :-)

Both of them are interesting to test, though. Most of the core stall
cases would have been triggered by going through the STS_RESOURCE case.
How about we just make it exercise both? The below patch alternates
between them when we have chosen to requeue.

Comments

Omar Sandoval Feb. 28, 2018, 4:14 p.m. UTC | #1
On Wed, Feb 28, 2018 at 08:28:25AM -0700, Jens Axboe wrote:
> On 2/28/18 1:51 AM, Omar Sandoval wrote:
> > On Tue, Feb 27, 2018 at 03:34:53PM -0700, Jens Axboe wrote:
> >> Similarly to the support we have for testing/faking timeouts for
> >> null_blk, this adds support for triggering a requeue condition.
> >> Considering the issues around restart we've been seeing, this should be
> >> a useful addition to the testing arsenal to ensure that we are handling
> >> requeue conditions correctly.
> >>
> >> This works for queue mode 1 (legacy request_fn based path) and 2 (blk-mq
> >> path), as there's no good way to do requeue with a bio based driver.
> >> This is similar to the timeout path.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >>
> >> ---
> >>
> >>  null_blk.c |   55 +++++++++++++++++++++++++++++++++++++++++++------------
> >>  1 file changed, 43 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> >> index 287a09611c0f..363536572e19 100644
> >> --- a/drivers/block/null_blk.c
> >> +++ b/drivers/block/null_blk.c
> > 
> > [snip]
> > 
> >> @@ -1422,10 +1440,12 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  
> >>  	blk_mq_start_request(bd->rq);
> >>  
> >> -	if (!should_timeout_request(bd->rq))
> >> -		return null_handle_cmd(cmd);
> >> +	if (should_requeue_request(bd->rq))
> >> +		return BLK_STS_RESOURCE;
> > 
> > Hm, this goes through the less interesting requeue path, add to the
> > dispatch list and __blk_mq_requeue_request(). blk_mq_requeue_request()
> > is the one that I wanted to test since that's the one that needs to call
> > the scheduler hook.
> 
> Until recently, it would have :-)
> 
> Both of them are interesting to test, though. Most of the core stall
> cases would have been triggered by going through the STS_RESOURCE case.
> How about we just make it exercise both? The below patch alternates
> between them when we have chosen to requeue.

Works for me. One idle thought, if we set this up to always requeue,
then it won't make any progress. Maybe we should limit the number of
times each request can be requeued so people (me) don't lock up their
test systems? Either way,

Reviewed-by: Omar Sandoval <osandov@fb.com>
Jens Axboe Feb. 28, 2018, 4:15 p.m. UTC | #2
On 2/28/18 9:14 AM, Omar Sandoval wrote:
> On Wed, Feb 28, 2018 at 08:28:25AM -0700, Jens Axboe wrote:
>> On 2/28/18 1:51 AM, Omar Sandoval wrote:
>>> On Tue, Feb 27, 2018 at 03:34:53PM -0700, Jens Axboe wrote:
>>>> Similarly to the support we have for testing/faking timeouts for
>>>> null_blk, this adds support for triggering a requeue condition.
>>>> Considering the issues around restart we've been seeing, this should be
>>>> a useful addition to the testing arsenal to ensure that we are handling
>>>> requeue conditions correctly.
>>>>
>>>> This works for queue mode 1 (legacy request_fn based path) and 2 (blk-mq
>>>> path), as there's no good way to do requeue with a bio based driver.
>>>> This is similar to the timeout path.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>
>>>> ---
>>>>
>>>>  null_blk.c |   55 +++++++++++++++++++++++++++++++++++++++++++------------
>>>>  1 file changed, 43 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
>>>> index 287a09611c0f..363536572e19 100644
>>>> --- a/drivers/block/null_blk.c
>>>> +++ b/drivers/block/null_blk.c
>>>
>>> [snip]
>>>
>>>> @@ -1422,10 +1440,12 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
>>>>  
>>>>  	blk_mq_start_request(bd->rq);
>>>>  
>>>> -	if (!should_timeout_request(bd->rq))
>>>> -		return null_handle_cmd(cmd);
>>>> +	if (should_requeue_request(bd->rq))
>>>> +		return BLK_STS_RESOURCE;
>>>
>>> Hm, this goes through the less interesting requeue path, add to the
>>> dispatch list and __blk_mq_requeue_request(). blk_mq_requeue_request()
>>> is the one that I wanted to test since that's the one that needs to call
>>> the scheduler hook.
>>
>> Until recently, it would have :-)
>>
>> Both of them are interesting to test, though. Most of the core stall
>> cases would have been triggered by going through the STS_RESOURCE case.
>> How about we just make it exercise both? The below patch alternates
>> between them when we have chosen to requeue.
> 
> Works for me. One idle thought, if we set this up to always requeue,
> then it won't make any progress. Maybe we should limit the number of
> times each request can be requeued so people (me) don't lock up their
> test systems? Either way,

Dunno, that gets into the "doctor it hurts when I shoot myself in the
foot" territory. Same can be said for the timeout setting. I think we
should just ignore that.
Omar Sandoval Feb. 28, 2018, 4:18 p.m. UTC | #3
On Wed, Feb 28, 2018 at 09:15:37AM -0700, Jens Axboe wrote:
> On 2/28/18 9:14 AM, Omar Sandoval wrote:
> > On Wed, Feb 28, 2018 at 08:28:25AM -0700, Jens Axboe wrote:
> >> On 2/28/18 1:51 AM, Omar Sandoval wrote:
> >>> On Tue, Feb 27, 2018 at 03:34:53PM -0700, Jens Axboe wrote:
> >>>> Similarly to the support we have for testing/faking timeouts for
> >>>> null_blk, this adds support for triggering a requeue condition.
> >>>> Considering the issues around restart we've been seeing, this should be
> >>>> a useful addition to the testing arsenal to ensure that we are handling
> >>>> requeue conditions correctly.
> >>>>
> >>>> This works for queue mode 1 (legacy request_fn based path) and 2 (blk-mq
> >>>> path), as there's no good way to do requeue with a bio based driver.
> >>>> This is similar to the timeout path.
> >>>>
> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >>>>
> >>>> ---
> >>>>
> >>>>  null_blk.c |   55 +++++++++++++++++++++++++++++++++++++++++++------------
> >>>>  1 file changed, 43 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> >>>> index 287a09611c0f..363536572e19 100644
> >>>> --- a/drivers/block/null_blk.c
> >>>> +++ b/drivers/block/null_blk.c
> >>>
> >>> [snip]
> >>>
> >>>> @@ -1422,10 +1440,12 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>>>  
> >>>>  	blk_mq_start_request(bd->rq);
> >>>>  
> >>>> -	if (!should_timeout_request(bd->rq))
> >>>> -		return null_handle_cmd(cmd);
> >>>> +	if (should_requeue_request(bd->rq))
> >>>> +		return BLK_STS_RESOURCE;
> >>>
> >>> Hm, this goes through the less interesting requeue path, add to the
> >>> dispatch list and __blk_mq_requeue_request(). blk_mq_requeue_request()
> >>> is the one that I wanted to test since that's the one that needs to call
> >>> the scheduler hook.
> >>
> >> Until recently, it would have :-)
> >>
> >> Both of them are interesting to test, though. Most of the core stall
> >> cases would have been triggered by going through the STS_RESOURCE case.
> >> How about we just make it exercise both? The below patch alternates
> >> between them when we have chosen to requeue.
> > 
> > Works for me. One idle thought, if we set this up to always requeue,
> > then it won't make any progress. Maybe we should limit the number of
> > times each request can be requeued so people (me) don't lock up their
> > test systems? Either way,
> 
> Dunno, that gets into the "doctor it hurts when I shoot myself in the
> foot" territory. Same can be said for the timeout setting. I think we
> should just ignore that.

Ack, fine with me.
diff mbox

Patch

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 287a09611c0f..d12d7a8325ad 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -29,6 +29,7 @@ 
 
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static DECLARE_FAULT_ATTR(null_timeout_attr);
+static DECLARE_FAULT_ATTR(null_requeue_attr);
 #endif
 
 static inline u64 mb_per_tick(int mbps)
@@ -53,6 +54,7 @@  struct nullb_queue {
 	wait_queue_head_t wait;
 	unsigned int queue_depth;
 	struct nullb_device *dev;
+	unsigned int requeue_selection;
 
 	struct nullb_cmd *cmds;
 };
@@ -170,6 +172,9 @@  MODULE_PARM_DESC(home_node, "Home node for the device");
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static char g_timeout_str[80];
 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+
+static char g_requeue_str[80];
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO);
 #endif
 
 static int g_queue_mode = NULL_Q_MQ;
@@ -1380,7 +1385,15 @@  static bool should_timeout_request(struct request *rq)
 	if (g_timeout_str[0])
 		return should_fail(&null_timeout_attr, 1);
 #endif
+	return false;
+}
 
+static bool should_requeue_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_requeue_str[0])
+		return should_fail(&null_requeue_attr, 1);
+#endif
 	return false;
 }
 
@@ -1391,11 +1404,17 @@  static void null_request_fn(struct request_queue *q)
 	while ((rq = blk_fetch_request(q)) != NULL) {
 		struct nullb_cmd *cmd = rq->special;
 
-		if (!should_timeout_request(rq)) {
-			spin_unlock_irq(q->queue_lock);
-			null_handle_cmd(cmd);
-			spin_lock_irq(q->queue_lock);
+		/* just ignore the request */
+		if (should_timeout_request(rq))
+			continue;
+		if (should_requeue_request(rq)) {
+			blk_requeue_request(q, rq);
+			continue;
 		}
+
+		spin_unlock_irq(q->queue_lock);
+		null_handle_cmd(cmd);
+		spin_lock_irq(q->queue_lock);
 	}
 }
 
@@ -1422,10 +1441,23 @@  static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(bd->rq);
 
-	if (!should_timeout_request(bd->rq))
-		return null_handle_cmd(cmd);
+	if (should_requeue_request(bd->rq)) {
+		/*
+		 * Alternate between hitting the core BUSY path, and the
+		 * driver driven requeue path
+		 */
+		nq->requeue_selection++;
+		if (nq->requeue_selection & 1)
+			return BLK_STS_RESOURCE;
+		else {
+			blk_mq_requeue_request(bd->rq, true);
+			return BLK_STS_OK;
+		}
+	}
+	if (should_timeout_request(bd->rq))
+		return BLK_STS_OK;
 
-	return BLK_STS_OK;
+	return null_handle_cmd(cmd);
 }
 
 static const struct blk_mq_ops null_mq_ops = {
@@ -1659,16 +1691,27 @@  static void null_validate_conf(struct nullb_device *dev)
 		dev->mbps = 0;
 }
 
-static bool null_setup_fault(void)
-{
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-	if (!g_timeout_str[0])
+static bool __null_setup_fault(struct fault_attr *attr, char *str)
+{
+	if (!str[0])
 		return true;
 
-	if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+	if (!setup_fault_attr(attr, str))
 		return false;
 
-	null_timeout_attr.verbose = 0;
+	attr->verbose = 0;
+	return true;
+}
+#endif
+
+static bool null_setup_fault(void)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
+		return false;
+	if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
+		return false;
 #endif
 	return true;
 }