kernel BUG at drivers/scsi/qla2xxx/qla_target.c:3319!

Message ID 1498807372.24896.45.camel@haakon3.risingtidesystems.com (mailing list archive)
State New, archived

Commit Message

Nicholas A. Bellinger June 30, 2017, 7:22 a.m. UTC
Hi Pascal & Co,

On Mon, 2017-06-26 at 15:20 +0200, Pascal de Bruijn wrote:

<SNIP>

> In the meantime, I think we now know what's causing the circumstances 
> that let this issue surface.
> 
> We can reliably reproduce the issue when we run 'iozone -a -I' in a VM 
> that's placed on the LIO-provided storage and kick off an md check:
> 
> echo check > /sys/block/md0/md/sync_action
> 
> This is also probably why we were experiencing these issues consistently 
> on the night between Saturday and Sunday, as an md check would be kicked 
> off from cron.
> 
> Since we were experimenting with md tuning udev rules, particularly our 
> sync_speed_min change may have exacerbated things:
> 
> SUBSYSTEM=="block", KERNEL=="md*", ACTION=="change", 
> TEST=="md/sync_speed_min", ATTR{md/sync_speed_min}="100000"
> SUBSYSTEM=="block", KERNEL=="md*", ACTION=="change", 
> TEST=="md/sync_speed_max", ATTR{md/sync_speed_max}="1000000"
> 

So it looks like the backend storage device provided to LIO is reaching
sufficiently high latency to cause ESX ABORT_TASK timeouts to fire.

> > After staring at this, there are two scenarios during ABORT_TASK where
> > this could occur.
> >
> > Namely, there are two cases where tcm_qla2xxx.c calls
> > tcm_qla2xxx_free_cmd() internally during ABORT_TASK:
> >
> >    - During tcm_qla2xxx_handle_data_work() when cmd->aborted is detected.
> >    - During tcm_qla2xxx_aborted_task() when QLA_TGT_STATE_NEW or
> >      QLA_TGT_STATE_DATA_IN are set.
> >
> > My guess is the latter case is happening, which is resulting in
> > tcm_qla2xxx_free_cmd() queuing up qla_tgt_cmd->work from
> > transport_cmd_finish_abort() -> tcm_qla2xxx_aborted_task(), immediately
> > before target_put_sess_cmd() is called from core_tmr_abort_task() to
> > perform the final kref_put(&cmd->kref, ...).
> >
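
To make the suspected ordering concrete, roughly (simplified, not verbatim
driver code):

/*
 *   ABORT_TASK path                          workqueue context
 *   ---------------                          -----------------
 *   core_tmr_abort_task()
 *     transport_cmd_finish_abort()
 *       tcm_qla2xxx_aborted_task()
 *         tcm_qla2xxx_free_cmd()
 *           queue_work(&cmd->work)  ---->    tcm_qla2xxx_complete_free()
 *     target_put_sess_cmd()                    transport_generic_free_cmd()
 *       final kref_put(&cmd->kref, ...)
 *         tcm_qla2xxx_release_cmd()
 *           qlt_free_cmd()
 *             BUG_ON(cmd->cmd_in_wq)  <--   fires while the queued work
 *                                           is still pending
 */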

<SNIP>

> > The second is the following patch to catch this scenario in progress
> > during qlt_free_cmd(), and clear ->cmd_in_wq + dump the state before
> > doing a cancel_work_sync().
> >
> > diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h
> > index d644202..65752af 100644
> > --- a/drivers/scsi/qla2xxx/qla_target.h
> > +++ b/drivers/scsi/qla2xxx/qla_target.h
> > @@ -876,7 +876,7 @@ struct qla_tgt_cmd {
> >   	unsigned int q_full:1;
> >   	unsigned int term_exchg:1;
> >   	unsigned int cmd_sent_to_fw:1;
> > -	unsigned int cmd_in_wq:1;
> > +	unsigned int cmd_in_wq;
> >   	unsigned int aborted:1;
> >   	unsigned int data_work:1;
> >   	unsigned int data_work_free:1;
> > diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
> > index 75aeb9f..3ceb371 100644
> > --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
> > +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
> > @@ -336,6 +336,17 @@ static void tcm_qla2xxx_release_cmd(struct se_cmd *se_cmd)
> >   	}
> >   
> >   	cmd = container_of(se_cmd, struct qla_tgt_cmd, se_cmd);
> > +
> > +	if (cmpxchg(&cmd->cmd_in_wq, 1, 0) == 1) {
> > +		printk("Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1"
> > +			" state: %d data_work: %d data_work_free: %d trc_flags: 0x%08x"
> > +			" se_cmd->se_cmd_flags: 0x%08x se_cmd->transport_state: 0x%08x\n"
> > +			" se_cmd->t_task_cdb: 0x%02x\n", cmd->state, cmd->data_work,
> > +			cmd->data_work_free, cmd->trc_flags, cmd->se_cmd.se_cmd_flags,
> > +			cmd->se_cmd.transport_state, cmd->se_cmd.t_task_cdb[0]);
> > +		cancel_work_sync(&cmd->work);
> > +	}
> > +
> >   	qlt_free_cmd(cmd);
> >   }
> 
> This patch does apply cleanly to v4.9.34; however, it did not compile, as 
> data_work, data_work_free and trc_flags are not yet present. After 
> removing those references it did compile fine.
> 
> Here's the relevant part of dmesg:
> 
> [  103.740391] md: data-check of RAID array md0
> [  103.740395] md: minimum _guaranteed_  speed: 100000 KB/sec/disk.
> [  103.740397] md: using maximum available idle IO bandwidth (but not more than 1000000 KB/sec) for data-check.
> [  103.740416] md: using 128k window, over a total of 405798912k.
> [  435.673247] ABORT_TASK: Found referenced qla2xxx task_tag: 1222180
> [  435.927330] Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1 state: 2 se_cmd->se_cmd_flags: 0x01c00109 se_cmd->transport_state: 0x00000091 se_cmd->t_task_cdb: 0x2a
> [  435.927351] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1222180
> [  565.785280] qla2xxx/21:00:00:24:ff:08:c4:8a: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
> [  702.818402] ABORT_TASK: Found referenced qla2xxx task_tag: 1230964
> [  703.090441] Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1 state: 2 se_cmd->se_cmd_flags: 0x01c00109 se_cmd->transport_state: 0x00000091 se_cmd->t_task_cdb: 0x2a
> [  703.090457] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1230964
> [  888.912452] ABORT_TASK: Found referenced qla2xxx task_tag: 1215316
> [  889.133709] Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1 state: 2 se_cmd->se_cmd_flags: 0x01c00109 se_cmd->transport_state: 0x00000091 se_cmd->t_task_cdb: 0x2a
> [  889.133713] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1215316
> [ 1741.356248] ABORT_TASK: Found referenced qla2xxx task_tag: 1222900
> [ 1741.609044] Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1 state: 2 se_cmd->se_cmd_flags: 0x01c00109 se_cmd->transport_state: 0x00000091 se_cmd->t_task_cdb: 0x2a
> [ 1741.609047] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1222900
> [ 2365.885579] qla2xxx/21:00:00:24:ff:01:fa:9d: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
> [ 2425.729887] ABORT_TASK: Found referenced qla2xxx task_tag: 1242772
> [ 2425.901292] Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1 state: 2 se_cmd->se_cmd_flags: 0x01c00109 se_cmd->transport_state: 0x00000091se_cmd->t_task_cdb: 0x2a
> [ 2425.901296] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1242772
> [ 2778.797321] md: md0: data-check done.
> [ 4165.989290] qla2xxx/21:00:00:24:ff:01:fa:9d: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
> 
> Presumably every "Cancelling..." would have been a lockup?

Fantastic, this is exactly the info I was looking for.  :)

So decoding the 'Canceling outstanding qla_tgt_cmd->cmd_in_wq = 1' bits:

*) qla_tgt_cmd->state: 2

qla_tgt_cmd->state = 2 is QLA_TGT_STATE_DATA_IN, which means FCP
data-out was successfully received, and the WRITE CDB has been submitted
into LIO backend storage.
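
For reference, the qla_tgt_cmd->state values live in qla_target.h (copied
from a v4.x-era tree, so double-check against your own kernel):

#define QLA_TGT_STATE_NEW		0 /* New command + target processing */
#define QLA_TGT_STATE_NEED_DATA		1 /* target needs data to continue */
#define QLA_TGT_STATE_DATA_IN		2 /* Data arrived + target processing */
#define QLA_TGT_STATE_PROCESSED		3 /* target done processing */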

*) se_cmd->se_cmd_flags: 0x01c00109

This bitmask means the following flags are set:
 
   SCF_SUPPORTED_SAM_OPCODE
   SCF_SCSI_DATA_CDB
   SCF_SE_LUN_CMD
   SCF_ACK_KREF
   SCF_USE_CPUID
   SCF_TASK_ATTR_SET

Nothing out of the ordinary here.

*) se_cmd->transport_state: 0x00000091

  CMD_T_ABORTED
  CMD_T_SENT
  CMD_T_DEV_ACTIVE

This means the WRITE command was sent to the backend device, but did not
complete before it was explicitly aborted by the ESX host.

Based on what's in the logs above, this also looks as expected.

*) se_cmd->t_task_cdb: 0x2a

So this command is a WRITE_10 = 0x2a.
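
For anyone who wants to re-check the decode, here's a minimal userspace
sketch; the flag values are copied from include/target/target_core_base.h
as of v4.9-era kernels, so verify them against your own tree:

#include <stdio.h>

#define SCF_SUPPORTED_SAM_OPCODE	0x00000001
#define SCF_SCSI_DATA_CDB		0x00000008
#define SCF_SE_LUN_CMD			0x00000100
#define SCF_ACK_KREF			0x00400000
#define SCF_USE_CPUID			0x00800000
#define SCF_TASK_ATTR_SET		0x01000000

#define CMD_T_ABORTED			(1 << 0)
#define CMD_T_SENT			(1 << 4)
#define CMD_T_DEV_ACTIVE		(1 << 7)

int main(void)
{
	/* Recombine the decoded flags; both should match the logged values. */
	unsigned int flags = SCF_SUPPORTED_SAM_OPCODE | SCF_SCSI_DATA_CDB |
			     SCF_SE_LUN_CMD | SCF_ACK_KREF | SCF_USE_CPUID |
			     SCF_TASK_ATTR_SET;
	unsigned int tstate = CMD_T_ABORTED | CMD_T_SENT | CMD_T_DEV_ACTIVE;

	printf("se_cmd_flags    = 0x%08x\n", flags);	/* 0x01c00109 */
	printf("transport_state = 0x%08x\n", tstate);	/* 0x00000091 */
	return 0;
}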

This would indicate WRITE latency for the LIO backend storage is getting
sufficiently high to trigger ESX host side ABORT_TASK timeouts, which
are much more aggressive than normal Linux host I/O timeouts.

So separate from the qla2xxx BUG_ON discussed here, the high
backend device latency is worth looking into, as usually once the ESX host
starts triggering ABORT_TASK timeouts it ends up taking the datastore
offline.

That said, I'm certain you're hitting the bogus tcm_qla2xxx_free_cmd()
in tcm_qla2xxx_aborted_task(), which based on the initial analysis is
definitely the wrong thing to do.

So here's a patch against v4.12-rc that addresses the scenario you were
able to trigger, and fixes the second scenario as well.

As you noted above, it will take a little massaging to apply to v4.9.x
due to qla_tgt_cmd->cmd_flags becoming qla_tgt_cmd->data_work and
->data_work_free.  But that shouldn't be too hard.  :)

So please test without the initial debug patch on your v4.9.y based
setup.

Himanshu + Quinn, please review, as after further grokking of what Pascal
has observed, these are two obvious bugs.

From 4db6a8145940d0bbd10265020d681961ce2d3238 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger <nab@linux-iscsi.org>
Date: Fri, 30 Jun 2017 00:08:13 -0700
Subject: [PATCH] qla2xxx: Fix incorrect tcm_qla2xxx_free_cmd use during TMR
 ABORT

This patch drops two incorrect usages of tcm_qla2xxx_free_cmd()
during TMR ABORT within tcm_qla2xxx_handle_data_work() and
tcm_qla2xxx_aborted_task(), which were attempting to dispatch
into workqueue context to do tcm_qla2xxx_complete_free() and
subsequently invoke transport_generic_free_cmd().

This is incorrect because during TMR ABORT target-core will
drop the outstanding se_cmd->cmd_kref references once it has
quiesced the se_cmd via transport_wait_for_tasks(), and in
the case of qla2xxx it should not attempt to do its own
transport_generic_free_cmd().

As reported by Pascal, this was originally manifesting as a
BUG_ON(cmd->cmd_in_wq) in qlt_free_cmd() during TMR ABORT
with a LIO backend with sufficiently high WRITE latency to
trigger a host side TMR ABORT_TASK to occur.

In addition, for the case in tcm_qla2xxx_write_pending_status()
and tcm_qla2xxx_handle_data_work() that waits for outstanding
FCP WRITE data transfer to complete before proceeding with a
TMR ABORT, avoid using se_cmd->t_transport_stop_comp that is
already used by transport_wait_for_tasks() and use a qla2xxx
internal struct completion instead.

Reported-by: Pascal de Bruijn <p.debruijn@unilogic.nl>
Cc: Himanshu Madhani <himanshu.madhani@cavium.com>
Cc: Quinn Tran <quinn.tran@cavium.com>
Cc: <stable@vger.kernel.org> # 3.10+
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/scsi/qla2xxx/qla_target.c  |  4 ++++
 drivers/scsi/qla2xxx/qla_target.h  |  1 +
 drivers/scsi/qla2xxx/tcm_qla2xxx.c | 36 +++---------------------------------
 3 files changed, 8 insertions(+), 33 deletions(-)



Comments

Pascal de Bruijn | Unilogic B.V. July 3, 2017, 2:03 p.m. UTC | #1
On 30-06-17 09:22, Nicholas A. Bellinger wrote:
> Hi Pascal & Co,
>
> On Mon, 2017-06-26 at 15:20 +0200, Pascal de Bruijn wrote:
>
> <SNIP>
>
> This would indicate WRITE latency for the LIO backend storage is getting
> sufficiently high to trigger ESX host side ABORT_TASK timeouts, which
> are much more aggressive than normal Linux host I/O timeouts.

Would you happen to know whether the ESX I/O timeouts depend on the disk 
type (SSD vs Non-SSD) advertised by LIO?

And if so, would it be possible to ask LIO to lie about this, or would 
that be inadvisable for other reasons?

> So separate from the qla2xxx BUG_ON discussed here, the high
> backend device latency is worth looking into, as usually once the ESX host
> starts triggering ABORT_TASK timeouts it ends up taking the datastore
> offline.

Presumably this is due to the md check running without any practical 
limit on sync_speed_max, and having a noop scheduler on the SSDs, where 
ionice-ing the resync kernel thread might not be particularly helpful.

> That said, I'm certain you're hitting the bogus tcm_qla2xxx_free_cmd()
> in tcm_qla2xxx_aborted_task(), which based on the initial analysis is
> definitely the wrong thing to do.
>
> So here's a patch against v4.12-rc that addresses the scenario you were
> able to trigger, and fixes the second scenario as well.
>
> As you noted above, it will take a little massaging to apply to v4.9.x
> due to qla_tgt_cmd->cmd_flags becoming qla_tgt_cmd->data_work and
> ->data_work_free.  But that shouldn't be too hard.  :)

Building your proposed solution below (4db6a81459) against the 4.9.x tree
seems to hinge on cherry-picking the following:

 From 1eb42f965cedafb700e9c902ddafb1c51e3117f7 Mon Sep 17 00:00:00 2001
From: Quinn Tran <quinn.tran@cavium.com>
Date: Thu, 19 Jan 2017 22:27:55 -0800
Subject: qla2xxx: Make trace flags more readable

So for our testing we built 4.9.35 without these patches, and could 
still reproduce the issue.

Then we built 4.9.35 with the two patches applied (results below).

> So please test without the initial debug patch on your v4.9.y based
> setup.
>
> Himanshu + Quinn, please review, as after further grokking of what Pascal
> has observed, these are two obvious bugs.
>
>  From 4db6a8145940d0bbd10265020d681961ce2d3238 Mon Sep 17 00:00:00 2001
> From: Nicholas Bellinger <nab@linux-iscsi.org>
> Date: Fri, 30 Jun 2017 00:08:13 -0700
> Subject: [PATCH] qla2xxx: Fix incorrect tcm_qla2xxx_free_cmd use during TMR
>   ABORT
> <SNIP>

So we've had 4.9.35+1eb42f965c+4db6a81459 running on three machines for 
a few days now, all quite stable thus far.

Most of the kernel messages below can be observed multiple times; I've 
only included each one once for the sake of brevity.

On the first machine (connected to two ESX hosts, via two single switch 
fabrics), with md check disabled for the time being:

[  476.568636] qla2xxx/21:00:00:24:ff:23:6b:83: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.

On the second machine (connected to nine ESX hosts, via two multi switch 
fabrics), with md check disabled for the time being:

[  102.436426] qla2xxx/21:00:00:24:ff:3c:d0:95: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
[  438.710613] Detected MISCOMPARE for addr: ffffa0d49425d000 buf: ffffa0d496c63600
[  438.718139] Target/iblock: Send MISCOMPARE check condition and sense
[154986.514515] TARGET_CORE[qla2xxx]: Detected NON_EXISTENT_LUN Access for 0x00000001

On the third machine (connected to a single ESX host, via two multi 
switch fabrics), with md check forced many many times (purposefully with 
a practically unlimited sync_speed_max):

[218406.984078] md: data-check of RAID array md0
[218406.984082] md: minimum _guaranteed_  speed: 10000 KB/sec/disk.
[218406.984083] md: using maximum available idle IO bandwidth (but not more than 1000000 KB/sec) for data-check.
[218406.984102] md: using 128k window, over a total of 405798912k.
[219111.385372] qla2xxx/21:00:00:24:ff:01:fa:9d: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
[219497.980914] ABORT_TASK: Found referenced qla2xxx task_tag: 1195684
[219497.986460] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1195684
[219502.807141] ABORT_TASK: Found referenced qla2xxx task_tag: 1196116
[219503.092716] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1196116
[219503.092725] ABORT_TASK: Sending TMR_TASK_DOES_NOT_EXIST for ref_tag: 1196068
[219508.154863] Detected MISCOMPARE for addr: ffff880f86e98000 buf: ffff880f84fce000
[219508.154865] Target/iblock: Send MISCOMPARE check condition and sense
[220140.288970] ABORT_TASK: Found referenced qla2xxx task_tag: 1243972
[220140.520201] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1243972
[220161.302883] ABORT_TASK: Found referenced qla2xxx task_tag: 1245124
[220161.505205] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1245124
[220176.303282] ABORT_TASK: Found referenced qla2xxx task_tag: 1245700
[220176.462535] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1245700
[220528.353205] md: md0: data-check done

So ABORT_TASK can still be observed, but it does not result in a 
non-functional, not-quite-PANICked machine anymore.

Regards,
Pascal de Bruijn


Madhani, Himanshu July 5, 2017, 5:52 p.m. UTC | #2
Nic, 

> On Jun 30, 2017, at 12:22 AM, Nicholas A. Bellinger <nab@linux-iscsi.org> wrote:
> 
> <SNIP>

Patch looks good. 

Acked-by: Himanshu Madhani <himanshu.madhani@cavium.com>

Thanks,
- Himanshu

Nicholas A. Bellinger July 7, 2017, 4:26 a.m. UTC | #3
On Mon, 2017-07-03 at 16:03 +0200, Pascal de Bruijn wrote:
> On 30-06-17 09:22, Nicholas A. Bellinger wrote:

<SNIP>

> > This would indicate WRITE latency for the LIO backend storage is getting
> > sufficiently high to trigger ESX host side ABORT_TASK timeouts, which
> > are much more aggressive than normal Linux host I/O timeouts.
> 
> Would you happen to know whether the ESX I/O timeouts depend on the disk 
> type (SSD vs Non-SSD) advertised by LIO?
> 
> And if so, would it be possible to ask LIO to lie about this, or would 
> that be inadvisable for other reasons?
> 

Unfortunately, this timeout is completely host OS dependent, and there is
nothing defined in SCSI for a target to explicitly modify it.

With ESX + iSCSI for example, the I/O timeout value before a TMR
ABORT_TASK gets generated is hard-coded (i.e. not user-configurable in ESX
5.x+) to only 5 seconds.  Vendors who ship commercial products usually
end up doing a bunch of tricks in their backend driver to generate a
SAM_STAT_BUSY (i.e. a retriable status) to avoid a TMR ABORT_TASK if the
I/O is taking longer than a few seconds to complete.
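
A rough sketch of that trick, for illustration only; the completion hook,
its arguments, and the threshold below are made up, while
target_complete_cmd() is the real target-core completion call:

#include <linux/ktime.h>
#include <scsi/scsi_proto.h>
#include <target/target_core_backend.h>

#define MY_BUSY_THRESHOLD_MS	2000	/* illustrative threshold, not a real tunable */

/* Hypothetical backend completion path: if the I/O took longer than the
 * threshold, answer SAM_STAT_BUSY so the initiator retries, instead of
 * letting its I/O timeout escalate to a TMR ABORT_TASK. */
static void my_backend_complete_io(struct se_cmd *se_cmd, ktime_t issued)
{
	if (ktime_ms_delta(ktime_get(), issued) > MY_BUSY_THRESHOLD_MS)
		target_complete_cmd(se_cmd, SAM_STAT_BUSY);
	else
		target_complete_cmd(se_cmd, SAM_STAT_GOOD);
}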

IIRC with ESX + FCP, this value is configurable but is specific to the
individual SCSI host LLDs.  Here's the doc for qla2xxx:

http://filedownloads.qlogic.com/files/manual/69771/FC0054606-00.pdf

Himanshu + Quinn, do you recall what this tunable is called..?

> > So separate from the qla2xxx BUG_ON discussed here, the high
> > backend device latency is worth looking into, as usually once the ESX host
> > starts triggering ABORT_TASK timeouts it ends up taking the datastore
> > offline.
> 
> Presumably this is due to the md check running without any practical 
> limit on sync_speed_max, and having a noop scheduler on the SSDs, where 
> ionice-ing the resync kernel thread might not be particularly helpful.

That would most certainly (with spinning media) be driving up backend
latency sufficiently high to trigger ESX's default I/O timeouts.

> > So please test without the initial debug patch on your v4.9.y based
> > setup.
> >
> > Himanshu + Quinn, please review, as after further grokking of what Pascal
> > has observed, these are two obvious bugs.
> >
> >  From 4db6a8145940d0bbd10265020d681961ce2d3238 Mon Sep 17 00:00:00 2001
> > From: Nicholas Bellinger <nab@linux-iscsi.org>
> > Date: Fri, 30 Jun 2017 00:08:13 -0700
> > Subject: [PATCH] qla2xxx: Fix incorrect tcm_qla2xxx_free_cmd use during TMR
> >   ABORT
> > <SNIP>
> 
> So we've had 4.9.35+1eb42f965c+4db6a81459 running on three machines for 
> a few days now, all quite stable thus far.

Great.  Thanks for confirming.

> 
> Most of the kernel messages below can be observed multiple times; I've 
> only included each one once for the sake of brevity.
> 
> On the first machine (connected to two ESX hosts, via two single switch 
> fabrics), with md check disabled for the time being:
> 
> [  476.568636] qla2xxx/21:00:00:24:ff:23:6b:83: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.

Yep, that's a normal warning for ATA_PASSTHROUGH being unsupported from
ESX's smartd.

> 
> On the second machine (connected to nine ESX hosts, via two multi switch 
> fabrics), with md check disabled for the time being:
> 
> [  102.436426] qla2xxx/21:00:00:24:ff:3c:d0:95: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
> [  438.710613] Detected MISCOMPARE for addr: ffffa0d49425d000 buf: ffffa0d496c63600
> [  438.718139] Target/iblock: Send MISCOMPARE check condition and sense
> [154986.514515] TARGET_CORE[qla2xxx]: Detected NON_EXISTENT_LUN Access for 0x00000001

Normal COMPARE_AND_WRITE warnings when multiple ESX hosts are attempting
to get exclusive WRITE access via VAAI Atomic Test & Set (ATS) for a
particular VMFS extent.

> 
> On the third machine (connected to a single ESX host, via two multi 
> switch fabrics), with md check forced many many times (purposefully with 
> a practically unlimited sync_speed_max):
> 
> [218406.984078] md: data-check of RAID array md0
> [218406.984082] md: minimum _guaranteed_  speed: 10000 KB/sec/disk.
> [218406.984083] md: using maximum available idle IO bandwidth (but not more than 1000000 KB/sec) for data-check.
> [218406.984102] md: using 128k window, over a total of 405798912k.
> [219111.385372] qla2xxx/21:00:00:24:ff:01:fa:9d: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
> [219497.980914] ABORT_TASK: Found referenced qla2xxx task_tag: 1195684
> [219497.986460] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1195684
> [219502.807141] ABORT_TASK: Found referenced qla2xxx task_tag: 1196116
> [219503.092716] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1196116
> [219503.092725] ABORT_TASK: Sending TMR_TASK_DOES_NOT_EXIST for ref_tag: 1196068
> [219508.154863] Detected MISCOMPARE for addr: ffff880f86e98000 buf: ffff880f84fce000
> [219508.154865] Target/iblock: Send MISCOMPARE check condition and sense
> [220140.288970] ABORT_TASK: Found referenced qla2xxx task_tag: 1243972
> [220140.520201] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1243972
> [220161.302883] ABORT_TASK: Found referenced qla2xxx task_tag: 1245124
> [220161.505205] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1245124
> [220176.303282] ABORT_TASK: Found referenced qla2xxx task_tag: 1245700
> [220176.462535] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1245700
> [220528.353205] md: md0: data-check done
> 

Btw, the difference between "Found referenced qla2xxx task_tag" and
"Sending TMR_FUNCTION_COMPLETE" tells you how much time elapsed between
when the ABORT_TASK was received by LIO and when your backend storage
completed the I/O back to LIO.

For example, with:

[220161.302883] ABORT_TASK: Found referenced qla2xxx task_tag: 1245124
[220161.505205] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1245124

Which was ~200 milliseconds (220161.505205 - 220161.302883 ≈ 0.202 s).  So
the total amount of time for this particular I/O to your MD device would
be the default ESX FCP LLD host timeout (assuming 5 seconds), plus ~200
milliseconds.

> So ABORT_TASK can still be observed, but it does not result in a 
> non-functional, not-quite-PANICked machine anymore.
> 

Thank alot for the bug report and your continuous testing to get this
resolved.  The patch is queued up in target-pending/for-next with your
Tested-by, and will be CC' to stable so the older v4.x.y and v3.x.y
kernels get this fix as well.

Thanks again.

Pascal de Bruijn | Unilogic B.V. July 10, 2017, 3:04 p.m. UTC | #4
On 07-07-17 06:26, Nicholas A. Bellinger wrote:
> On Mon, 2017-07-03 at 16:03 +0200, Pascal de Bruijn wrote:
>> So ABORT_TASK can still be observed, but it does not result in a
>> non-functional, not-quite-PANICked machine anymore.
> Thanks a lot for the bug report and your continuous testing to get this
> resolved.  The patch is queued up in target-pending/for-next with your
> Tested-by, and will be CC'd to stable so the older v4.x.y and v3.x.y
> kernels get this fix as well.
>
> Thanks again.

I'm afraid we may not be quite there yet after all...

So we had the other two machines run an md check this weekend
as well, again with a ridiculously high sync_speed_max:

Jul  9 04:00:01 myhost kernel: [661309.794774] md: data-check of RAID array md0
Jul  9 04:00:01 myhost kernel: [661309.799173] md: minimum _guaranteed_  speed: 10000 KB/sec/disk.
Jul  9 04:00:01 myhost kernel: [661309.805219] md: using maximum available idle IO bandwidth (but not more than 1000000 KB/sec) for data-check.
Jul  9 04:00:01 myhost kernel: [661309.815194] md: using 128k window, over a total of 3252682752k.
Jul  9 04:00:42 myhost kernel: [661351.076391] qla2xxx/21:00:00:24:ff:4b:8f:58: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:02:01 myhost kernel: [661429.985082] qla2xxx/21:00:00:24:ff:4b:9e:19: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:04:24 myhost kernel: [661573.395245] qla2xxx/50:01:43:80:28:ca:86:36: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:04:57 myhost kernel: [661605.837694] qla2xxx/50:01:43:80:28:ca:86:e6: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:09:19 myhost kernel: [661868.261211] qla2xxx/21:00:00:24:ff:54:9e:ab: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:13:17 myhost kernel: [662105.788459] ABORT_TASK: Found referenced qla2xxx task_tag: 1175332
Jul  9 04:13:17 myhost kernel: [662105.794794] ABORT_TASK: Sending TMR_TASK_DOES_NOT_EXIST for ref_tag: 1175332
Jul  9 04:13:17 myhost kernel: [662105.990584] ABORT_TASK: Found referenced qla2xxx task_tag: 1175380
Jul  9 04:13:18 myhost kernel: [662106.510403] ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for ref_tag: 1175380
Jul  9 04:13:20 myhost kernel: [662108.988526] ABORT_TASK: Found referenced qla2xxx task_tag: 1175620
Jul  9 04:13:31 myhost kernel: [662119.684969] ABORT_TASK: Found referenced qla2xxx task_tag: 1211140
Jul  9 04:16:42 myhost kernel: [662310.617910] qla2xxx/21:00:00:24:ff:92:bf:43: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:18:00 myhost kernel: [662389.415853] qla2xxx/21:00:00:24:ff:54:a1:33: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:18:22 myhost kernel: [662411.066461] qla2xxx/21:00:00:24:ff:92:bf:59: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 04:20:23 myhost kernel: [662531.852833] qla2xxx/21:00:00:24:ff:3c:d0:94: Unsupported SCSI Opcode 0x85, sending CHECK_CONDITION.
Jul  9 07:00:28 myhost kernel: [672137.325166] md: md0: data-check done.

The machine in question was still responsive (it was accepting SSH
logins); however, it seemed the VMware hosts weren't seeing the volume
anymore (presumably due to the heavy IO on the backend).

Also, several (12?) kworkers seemed stuck in a D state.

When my colleague tried to reboot the machine, it got (presumably) stuck on:

/usr/bin/targetctl clear

After which it was forcefully rebooted :)

Sorry we don't have any more detailed info at this point.
We haven't been able to reproduce this on a
different machine yet :(

Regards,
Pascal de Bruijn



Patch

diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index 401e245..8f8ece9 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -4079,6 +4079,8 @@  static struct qla_tgt_cmd *qlt_get_tag(scsi_qla_host_t *vha,
 	cmd = &((struct qla_tgt_cmd *)se_sess->sess_cmd_map)[tag];
 	memset(cmd, 0, sizeof(struct qla_tgt_cmd));
 
+	init_completion(&cmd->write_pending_abort_comp);
+
 	memcpy(&cmd->atio, atio, sizeof(*atio));
 	cmd->state = QLA_TGT_STATE_NEW;
 	cmd->tgt = vha->vha_tgt.qla_tgt;
@@ -5083,6 +5085,8 @@  static int __qlt_send_busy(struct scsi_qla_host *vha,
 
 	qlt_incr_num_pend_cmds(vha);
 	INIT_LIST_HEAD(&cmd->cmd_list);
+	init_completion(&cmd->write_pending_abort_comp);
+
 	memcpy(&cmd->atio, atio, sizeof(*atio));
 
 	cmd->tgt = vha->vha_tgt.qla_tgt;
diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h
index d644202..939e93c5 100644
--- a/drivers/scsi/qla2xxx/qla_target.h
+++ b/drivers/scsi/qla2xxx/qla_target.h
@@ -868,6 +868,7 @@  struct qla_tgt_cmd {
 	unsigned char sense_buffer[TRANSPORT_SENSE_BUFFER];
 
 	spinlock_t cmd_lock;
+	struct completion write_pending_abort_comp;
 	/* to save extra sess dereferences */
 	unsigned int conf_compl_supported:1;
 	unsigned int sg_mapped:1;
diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index 75aeb9f..8c1bf9b 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -415,6 +415,7 @@  static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
 
 static int tcm_qla2xxx_write_pending_status(struct se_cmd *se_cmd)
 {
+	struct qla_tgt_cmd *cmd = container_of(se_cmd, struct qla_tgt_cmd, se_cmd);
 	unsigned long flags;
 	/*
 	 * Check for WRITE_PENDING status to determine if we need to wait for
@@ -424,8 +425,7 @@  static int tcm_qla2xxx_write_pending_status(struct se_cmd *se_cmd)
 	if (se_cmd->t_state == TRANSPORT_WRITE_PENDING ||
 	    se_cmd->t_state == TRANSPORT_COMPLETE_QF_WP) {
 		spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
-		wait_for_completion_timeout(&se_cmd->t_transport_stop_comp,
-						50);
+		wait_for_completion(&cmd->write_pending_abort_comp);
 		return 0;
 	}
 	spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
@@ -501,7 +501,6 @@  static int tcm_qla2xxx_handle_cmd(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd,
 static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
 {
 	struct qla_tgt_cmd *cmd = container_of(work, struct qla_tgt_cmd, work);
-	unsigned long flags;
 
 	/*
 	 * Ensure that the complete FCP WRITE payload has been received.
@@ -509,17 +508,6 @@  static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
 	 */
 	cmd->cmd_in_wq = 0;
 
-	spin_lock_irqsave(&cmd->cmd_lock, flags);
-	cmd->data_work = 1;
-	if (cmd->aborted) {
-		cmd->data_work_free = 1;
-		spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-
-		tcm_qla2xxx_free_cmd(cmd);
-		return;
-	}
-	spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-
 	cmd->vha->tgt_counters.qla_core_ret_ctio++;
 	if (!cmd->write_data_transferred) {
 		/*
@@ -527,7 +515,7 @@  static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
 		 * waiting upon completion in tcm_qla2xxx_write_pending_status()
 		 */
 		if (cmd->se_cmd.transport_state & CMD_T_ABORTED) {
-			complete(&cmd->se_cmd.t_transport_stop_comp);
+			complete(&cmd->write_pending_abort_comp);
 			return;
 		}
 
@@ -753,31 +741,13 @@  static void tcm_qla2xxx_queue_tm_rsp(struct se_cmd *se_cmd)
 	qlt_xmit_tm_rsp(mcmd);
 }
 
-#define DATA_WORK_NOT_FREE(_cmd) (_cmd->data_work && !_cmd->data_work_free)
 static void tcm_qla2xxx_aborted_task(struct se_cmd *se_cmd)
 {
 	struct qla_tgt_cmd *cmd = container_of(se_cmd,
 				struct qla_tgt_cmd, se_cmd);
-	unsigned long flags;
 
 	if (qlt_abort_cmd(cmd))
 		return;
-
-	spin_lock_irqsave(&cmd->cmd_lock, flags);
-	if ((cmd->state == QLA_TGT_STATE_NEW)||
-	    ((cmd->state == QLA_TGT_STATE_DATA_IN) &&
-		DATA_WORK_NOT_FREE(cmd))) {
-		cmd->data_work_free = 1;
-		spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-		/*
-		 * cmd has not reached fw, Use this trigger to free it.
-		 */
-		tcm_qla2xxx_free_cmd(cmd);
-		return;
-	}
-	spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-	return;
-
 }
 
 static void tcm_qla2xxx_clear_sess_lookup(struct tcm_qla2xxx_lport *,