[v4] sd: Check for unaligned partial completion
diff mbox

Message ID 20170215021230.11181-1-damien.lemoal@wdc.com
State Superseded, archived
Headers show

Commit Message

Damien Le Moal Feb. 15, 2017, 2:12 a.m. UTC
Commit "mpt3sas: Force request partial completion alignment" was not
considering the case of REQ_TYPE_FS commands not operating on sector
size units (e.g. REQ_OP_ZONE_REPORT and its 64B aligned partial
replies). This could result is incorrectly retrying (forever) those
commands.

Move the partial completion alignement check of mpt3sas to a generic
implementation in sd_done so that the check comes after good_bytes and
resid corrections done in that function depending on the request
command. The check is added to the initial command switch so that
commands with special payload size handling are ignored.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>

---

Changes from v3:
- Moved check to initial switch-case so that commands with special payload
  handling are ignored.

Changes from v2:
- Fixed good_bytes calculation after correction of unaligned resid
  It should be good_bytes=scsi_buflen() - resid, and not good_bytes-=resid

 drivers/scsi/mpt3sas/mpt3sas_scsih.c | 15 ---------------
 drivers/scsi/sd.c                    | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 15 deletions(-)

Comments

Christoph Hellwig Feb. 15, 2017, 6:34 a.m. UTC | #1
Hi Damien,

this looks reasonable, but we should ask Guilherme and Ram to confirm
it fixes their originally reported issue.  I've added them to Cc.

On Wed, Feb 15, 2017 at 11:12:30AM +0900, Damien Le Moal wrote:
> Commit "mpt3sas: Force request partial completion alignment" was not
> considering the case of REQ_TYPE_FS commands not operating on sector
> size units (e.g. REQ_OP_ZONE_REPORT and its 64B aligned partial
> replies). This could result is incorrectly retrying (forever) those
> commands.
> 
> Move the partial completion alignement check of mpt3sas to a generic
> implementation in sd_done so that the check comes after good_bytes and
> resid corrections done in that function depending on the request
> command. The check is added to the initial command switch so that
> commands with special payload size handling are ignored.
> 
> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
> 
> ---
> 
> Changes from v3:
> - Moved check to initial switch-case so that commands with special payload
>   handling are ignored.
> 
> Changes from v2:
> - Fixed good_bytes calculation after correction of unaligned resid
>   It should be good_bytes=scsi_buflen() - resid, and not good_bytes-=resid
> 
>  drivers/scsi/mpt3sas/mpt3sas_scsih.c | 15 ---------------
>  drivers/scsi/sd.c                    | 20 ++++++++++++++++++++
>  2 files changed, 20 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> index 0b5b423..1961535 100644
> --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> @@ -4658,7 +4658,6 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
>  	struct MPT3SAS_DEVICE *sas_device_priv_data;
>  	u32 response_code = 0;
>  	unsigned long flags;
> -	unsigned int sector_sz;
>  
>  	mpi_reply = mpt3sas_base_get_reply_virt_addr(ioc, reply);
>  	scmd = _scsih_scsi_lookup_get_clear(ioc, smid);
> @@ -4717,20 +4716,6 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
>  	}
>  
>  	xfer_cnt = le32_to_cpu(mpi_reply->TransferCount);
> -
> -	/* In case of bogus fw or device, we could end up having
> -	 * unaligned partial completion. We can force alignment here,
> -	 * then scsi-ml does not need to handle this misbehavior.
> -	 */
> -	sector_sz = scmd->device->sector_size;
> -	if (unlikely(scmd->request->cmd_type == REQ_TYPE_FS && sector_sz &&
> -		     xfer_cnt % sector_sz)) {
> -		sdev_printk(KERN_INFO, scmd->device,
> -		    "unaligned partial completion avoided (xfer_cnt=%u, sector_sz=%u)\n",
> -			    xfer_cnt, sector_sz);
> -		xfer_cnt = round_down(xfer_cnt, sector_sz);
> -	}
> -
>  	scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_cnt);
>  	if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
>  		log_info =  le32_to_cpu(mpi_reply->IOCLogInfo);
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 1f5d92a..d05a328 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1790,6 +1790,8 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>  {
>  	int result = SCpnt->result;
>  	unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
> +	unsigned int sector_size = SCpnt->device->sector_size;
> +	unsigned int resid;
>  	struct scsi_sense_hdr sshdr;
>  	struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
>  	struct request *req = SCpnt->request;
> @@ -1820,6 +1822,24 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>  			scsi_set_resid(SCpnt, blk_rq_bytes(req));
>  		}
>  		break;
> +	default:
> +		/*
> +	 	 * In case of bogus fw or device, we could end up having
> +	 	 * an unaligned partial completion. Check this here and force
> +	 	 * alignment.
> +	 	 */
> +		resid = scsi_get_resid(SCpnt);
> +		if (resid & (sector_size - 1)) {
> +			SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
> +				"Unaligned partial completion (resid=%u, sector_sz=%u)\n",
> +				resid, sector_size));
> +			resid = round_up(resid, sector_size);
> +			if (resid < good_bytes)
> +				good_bytes -= resid;
> +			else
> +				good_bytes = 0;
> +			scsi_set_resid(SCpnt, resid);
> +		}
>  	}
>  
>  	if (result) {
> -- 
> 2.9.3
> 
> Western Digital Corporation (and its subsidiaries) E-mail Confidentiality Notice & Disclaimer:
> 
> This e-mail and any files transmitted with it may contain confidential or legally privileged information of WDC and/or its affiliates, and are intended solely for the use of the individual or entity to which they are addressed. If you are not the intended recipient, any disclosure, copying, distribution or any action taken or omitted to be taken in reliance on it, is prohibited. If you have received this e-mail in error, please notify the sender immediately and delete the e-mail in its entirety from your system.
---end quoted text---
Damien Le Moal Feb. 15, 2017, 6:48 a.m. UTC | #2
Christoph,

On 2/15/17 15:34, Christoph Hellwig wrote:
> this looks reasonable, but we should ask Guilherme and Ram to confirm
> it fixes their originally reported issue.  I've added them to Cc.

Thank you.

Guilherme, Ram,

Please test ! The original fix breaks the zoned block device support
that was newly added to 4.10. So we need a fix for that and your issue
combined before Linus releases 4.10 stable this weekend.

Thanks.

Best regards.
Ram Pai Feb. 15, 2017, 7:06 a.m. UTC | #3
On Wed, Feb 15, 2017 at 03:48:52PM +0900, Damien Le Moal wrote:
> Christoph,
> 
> On 2/15/17 15:34, Christoph Hellwig wrote:
> > this looks reasonable, but we should ask Guilherme and Ram to confirm
> > it fixes their originally reported issue.  I've added them to Cc.
> 
> Thank you.
> 
> Guilherme, Ram,
> 
> Please test ! The original fix breaks the zoned block device support
> that was newly added to 4.10. So we need a fix for that and your issue
> combined before Linus releases 4.10 stable this weekend.

Yes logically it looks correct and should fix our issue. 

I doubt Guilherme has access to the same adapter firmware that exposed
the original problem. We will probably have to induce the erroneous
behavior in the driver to reproduce the issue and verify the fix.

Will let you know,
RP
Bart Van Assche Feb. 15, 2017, 3:10 p.m. UTC | #4
On Wed, 2017-02-15 at 11:12 +0900, Damien Le Moal wrote:
> +			resid = round_up(resid, sector_size);

> +			if (resid < good_bytes)

> +				good_bytes -= resid;

> +			else

> +				good_bytes = 0;

> +			scsi_set_resid(SCpnt, resid);


Hello Damien,

If the Data-Out buffer is smaller than a single logical block, can the above
code cause resid to exceed the Data-Out buffer size? I think we should avoid
to convert a residual overflow into a residual underflow. Additionally, will
round_up() work correctly if resid is negative (residual underflow)? How
about using the following (untested) code instead of the above?

if (resid > 0) {
        resid = min(good_bytes, round_up(resid, sector_size));
        good_bytes -= resid;
        scsi_set_resid(SCpnt, resid);
}

Thanks,

Bart.
Bart Van Assche Feb. 15, 2017, 4:42 p.m. UTC | #5
On Wed, 2017-02-15 at 11:12 +0900, Damien Le Moal wrote:
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 1f5d92a..d05a328 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1790,6 +1790,8 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>  {
>  	int result = SCpnt->result;
>  	unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
> +	unsigned int sector_size = SCpnt->device->sector_size;
> +	unsigned int resid;
>  	struct scsi_sense_hdr sshdr;
>  	struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
>  	struct request *req = SCpnt->request;
> @@ -1820,6 +1822,24 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>  			scsi_set_resid(SCpnt, blk_rq_bytes(req));
>  		}
>  		break;
> +	default:
> +		/*
> +	 	 * In case of bogus fw or device, we could end up having
> +	 	 * an unaligned partial completion. Check this here and force
> +	 	 * alignment.
> +	 	 */
> +		resid = scsi_get_resid(SCpnt);
> +		if (resid & (sector_size - 1)) {
> +			SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
> +				"Unaligned partial completion (resid=%u, sector_sz=%u)\n",
> +				resid, sector_size));
> +			resid = round_up(resid, sector_size);
> +			if (resid < good_bytes)
> +				good_bytes -= resid;
> +			else
> +				good_bytes = 0;
> +			scsi_set_resid(SCpnt, resid);
> +		}
>  	}
>  
>  	if (result) {

An additional concern: what if the size of the Data-Out buffer is not a
multiple of the logical block size? Shouldn't we round down (good_bytes -
resid) instead of rounding up resid?

Thanks,

Bart.
Damien Le Moal Feb. 16, 2017, 12:50 a.m. UTC | #6
Bart,

On 2/16/17 00:10, Bart Van Assche wrote:
> On Wed, 2017-02-15 at 11:12 +0900, Damien Le Moal wrote:
>> +			resid = round_up(resid, sector_size);
>> +			if (resid < good_bytes)
>> +				good_bytes -= resid;
>> +			else
>> +				good_bytes = 0;
>> +			scsi_set_resid(SCpnt, resid);
> 
> Hello Damien,
> 
> If the Data-Out buffer is smaller than a single logical block, can the above
> code cause resid to exceed the Data-Out buffer size? I think we should avoid
> to convert a residual overflow into a residual underflow. Additionally, will
> round_up() work correctly if resid is negative (residual underflow)? How
> about using the following (untested) code instead of the above?
> 
> if (resid > 0) {
>         resid = min(good_bytes, round_up(resid, sector_size));
>         good_bytes -= resid;
>         scsi_set_resid(SCpnt, resid);
> }

Indeed, that would be more solid and cover all possible weird values of
resid. I will resend a v5.

Thanks.
Damien Le Moal Feb. 16, 2017, 12:54 a.m. UTC | #7
Bart,

On 2/16/17 01:42, Bart Van Assche wrote:
> On Wed, 2017-02-15 at 11:12 +0900, Damien Le Moal wrote:
>> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
>> index 1f5d92a..d05a328 100644
>> --- a/drivers/scsi/sd.c
>> +++ b/drivers/scsi/sd.c
>> @@ -1790,6 +1790,8 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>>  {
>>  	int result = SCpnt->result;
>>  	unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
>> +	unsigned int sector_size = SCpnt->device->sector_size;
>> +	unsigned int resid;
>>  	struct scsi_sense_hdr sshdr;
>>  	struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
>>  	struct request *req = SCpnt->request;
>> @@ -1820,6 +1822,24 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>>  			scsi_set_resid(SCpnt, blk_rq_bytes(req));
>>  		}
>>  		break;
>> +	default:
>> +		/*
>> +	 	 * In case of bogus fw or device, we could end up having
>> +	 	 * an unaligned partial completion. Check this here and force
>> +	 	 * alignment.
>> +	 	 */
>> +		resid = scsi_get_resid(SCpnt);
>> +		if (resid & (sector_size - 1)) {
>> +			SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
>> +				"Unaligned partial completion (resid=%u, sector_sz=%u)\n",
>> +				resid, sector_size));
>> +			resid = round_up(resid, sector_size);
>> +			if (resid < good_bytes)
>> +				good_bytes -= resid;
>> +			else
>> +				good_bytes = 0;
>> +			scsi_set_resid(SCpnt, resid);
>> +		}
>>  	}
>>  
>>  	if (result) {
> 
> An additional concern: what if the size of the Data-Out buffer is not a
> multiple of the logical block size? Shouldn't we round down (good_bytes -
> resid) instead of rounding up resid?

The only REQ_TYPE_FS request that may not operate on LBA size aligned
payloads is REQ_OP_ZONE_REPORT which is handles in a different case of
the switch statement. So I think it is safe.

Best regards.
Bart Van Assche Feb. 16, 2017, 1:10 a.m. UTC | #8
On Thu, 2017-02-16 at 09:54 +0900, Damien Le Moal wrote:
> On 2/16/17 01:42, Bart Van Assche wrote:
> > An additional concern: what if the size of the Data-Out buffer is not a
> > multiple of the logical block size? Shouldn't we round down (good_bytes -
> > resid) instead of rounding up resid?
> 
> The only REQ_TYPE_FS request that may not operate on LBA size aligned
> payloads is REQ_OP_ZONE_REPORT which is handles in a different case of
> the switch statement. So I think it is safe.

Hello Damien,

Are you aware that it is the software that submits a request that controls
the buffer length and not the device that processes a request? Submitting
Data-Out buffers with odd sizes is one of the tests in the libiscsi test
suite. See e.g. the source file test-tool/test_write10_residuals.c in that
test suite. The request submitted by the libiscsi test suite reach the
kernel of the target system either through SG IO or through the iSCSI
target driver. When using iSCSI, both the LIO and SCST SCSI target
frameworks translate these requests into REQ_TYPE_FS requests. I think we
should aim not to affect the outcome of libiscsi tests when the underlying
device is a SCSI disk.

Thanks,

Bart.
Damien Le Moal Feb. 16, 2017, 2:52 a.m. UTC | #9
Bart,

On 2/16/17 10:10, Bart Van Assche wrote:
> On Thu, 2017-02-16 at 09:54 +0900, Damien Le Moal wrote:
>> On 2/16/17 01:42, Bart Van Assche wrote:
>>> An additional concern: what if the size of the Data-Out buffer is not a
>>> multiple of the logical block size? Shouldn't we round down (good_bytes -
>>> resid) instead of rounding up resid?
>>
>> The only REQ_TYPE_FS request that may not operate on LBA size aligned
>> payloads is REQ_OP_ZONE_REPORT which is handles in a different case of
>> the switch statement. So I think it is safe.
> 
> Hello Damien,
> 
> Are you aware that it is the software that submits a request that controls
> the buffer length and not the device that processes a request? Submitting
> Data-Out buffers with odd sizes is one of the tests in the libiscsi test
> suite. See e.g. the source file test-tool/test_write10_residuals.c in that
> test suite. The request submitted by the libiscsi test suite reach the
> kernel of the target system either through SG IO or through the iSCSI
> target driver. When using iSCSI, both the LIO and SCST SCSI target
> frameworks translate these requests into REQ_TYPE_FS requests. I think we
> should aim not to affect the outcome of libiscsi tests when the underlying
> device is a SCSI disk.

Thanks for the pointers. I checked libiscsi tests. And from what is done
there, it seems to me that it is basically impossible to distinguished
between a buggy hardware response and an in-purpose (or buggy) not
aligned data-out buffer size.
E.g. the test in test_write10_residuals.c which issues a 1 block write
with a buf size of 10000 and checks that residual is 10000 - block size.
For that case, with both this patch and the original mpt3sas patch, the
rounding up of resid to the block size will break the test.

Yet, it is very surprising that LIO and SCST_SCSI issue the "buggy"
request as is without checking and eventually correcting things first.
So unless LIO and SCST_SCSI are fixed to do that on the request they
send to the device, both patches should be dropped.
Users with buggy mpt3 HBAs will need to update the HBA FW...

So the conclusion may be that we need to drop everything ? The mpt3sas
patch breaks ZBC now, so that must be removed too.

Thoughts ?

Best regards.
Damien Le Moal Feb. 16, 2017, 3 a.m. UTC | #10
Bart,

On 2/16/17 11:52, Damien Le Moal wrote:
> Bart,
> 
> On 2/16/17 10:10, Bart Van Assche wrote:
>> On Thu, 2017-02-16 at 09:54 +0900, Damien Le Moal wrote:
>>> On 2/16/17 01:42, Bart Van Assche wrote:
>>>> An additional concern: what if the size of the Data-Out buffer is not a
>>>> multiple of the logical block size? Shouldn't we round down (good_bytes -
>>>> resid) instead of rounding up resid?
>>>
>>> The only REQ_TYPE_FS request that may not operate on LBA size aligned
>>> payloads is REQ_OP_ZONE_REPORT which is handles in a different case of
>>> the switch statement. So I think it is safe.
>>
>> Hello Damien,
>>
>> Are you aware that it is the software that submits a request that controls
>> the buffer length and not the device that processes a request? Submitting
>> Data-Out buffers with odd sizes is one of the tests in the libiscsi test
>> suite. See e.g. the source file test-tool/test_write10_residuals.c in that
>> test suite. The request submitted by the libiscsi test suite reach the
>> kernel of the target system either through SG IO or through the iSCSI
>> target driver. When using iSCSI, both the LIO and SCST SCSI target
>> frameworks translate these requests into REQ_TYPE_FS requests. I think we
>> should aim not to affect the outcome of libiscsi tests when the underlying
>> device is a SCSI disk.
> 
> Thanks for the pointers. I checked libiscsi tests. And from what is done
> there, it seems to me that it is basically impossible to distinguished
> between a buggy hardware response and an in-purpose (or buggy) not
> aligned data-out buffer size.
> E.g. the test in test_write10_residuals.c which issues a 1 block write
> with a buf size of 10000 and checks that residual is 10000 - block size.
> For that case, with both this patch and the original mpt3sas patch, the
> rounding up of resid to the block size will break the test.
> 
> Yet, it is very surprising that LIO and SCST_SCSI issue the "buggy"
> request as is without checking and eventually correcting things first.
> So unless LIO and SCST_SCSI are fixed to do that on the request they
> send to the device, both patches should be dropped.
> Users with buggy mpt3 HBAs will need to update the HBA FW...
> 
> So the conclusion may be that we need to drop everything ? The mpt3sas
> patch breaks ZBC now, so that must be removed too.
> 
> Thoughts ?

Or we rewrite to this:

if (!(good_bytes & (sector_size - 1)) &&
    resid & (sector_size - 1)) {
        resid = min(good_bytes, round_up(resid, sector_size));
        good_bytes -= resid;
        scsi_set_resid(SCpnt, resid);
}

That is, do the resid correction only and only if the command buffer
size is aligned. But still not sure that is safe in all possible cases.
Bart Van Assche Feb. 16, 2017, 3:28 a.m. UTC | #11
On Thu, 2017-02-16 at 11:52 +0900, Damien Le Moal wrote:
> Thanks for the pointers. I checked libiscsi tests. And from what is done

> there, it seems to me that it is basically impossible to distinguished

> between a buggy hardware response and an in-purpose (or buggy) not

> aligned data-out buffer size.

> E.g. the test in test_write10_residuals.c which issues a 1 block write

> with a buf size of 10000 and checks that residual is 10000 - block size.

> For that case, with both this patch and the original mpt3sas patch, the

> rounding up of resid to the block size will break the test.

> 

> Yet, it is very surprising that LIO and SCST_SCSI issue the "buggy"

> request as is without checking and eventually correcting things first.

> So unless LIO and SCST_SCSI are fixed to do that on the request they

> send to the device, both patches should be dropped.

> Users with buggy mpt3 HBAs will need to update the HBA FW...

> 

> So the conclusion may be that we need to drop everything ? The mpt3sas

> patch breaks ZBC now, so that must be removed too.


Hello Damien,

For the example you cited (bufflen - residual) = block size. This is
why I proposed to round (bufflen - residual) instead of residual. Whether
bufflen or (bufflen - residual) is rounded doesn't matter if bufflen is a
multiple of the logical block size. But it matters if bufflen is not a
multiple of the logical block size.

Bart.
Martin K. Petersen Feb. 16, 2017, 3:36 a.m. UTC | #12
>>>>> "Damien" == Damien Le Moal <damien.lemoal@wdc.com> writes:

Damien,

Damien> So the conclusion may be that we need to drop everything ? The
Damien> mpt3sas patch breaks ZBC now, so that must be removed too.

Nah.

But it's important that we restrict the rounding to TYPE_FS requests
(i.e. I/Os issued by the kernel where we control what's going on). Your
patch already does this by virtue of changing sd_done(). So for that
reason I'm OK with relocating the rounding from mpt3sas to sd.c.

However, I still think it's a fundamental problem that mpt3sas reports
completion in terms of "SAS frame bytes (successfully?) transferred"
instead of "blocks acknowledged by the target". The drive can obviously
not write a partial sector. So that approach to completion reporting
seems completely bogus to me.

Anyway. My main comment is that it seems like the rounding logic would
be better contained in sd_completed_bytes() since that is already doing
sanity checking on what's reported.

Also, I am no fan of perpetuating the arcane SCSI logging stuff now that
we have tracing. So just make it a regular sd_printk().

Thanks!
Damien Le Moal Feb. 16, 2017, 4:16 a.m. UTC | #13
Martin,

On 2/16/17 12:36, Martin K. Petersen wrote:
> Damien,
> 
> Damien> So the conclusion may be that we need to drop everything ? The
> Damien> mpt3sas patch breaks ZBC now, so that must be removed too.
> 
> Nah.
> 
> But it's important that we restrict the rounding to TYPE_FS requests
> (i.e. I/Os issued by the kernel where we control what's going on). Your
> patch already does this by virtue of changing sd_done(). So for that
> reason I'm OK with relocating the rounding from mpt3sas to sd.c.
> 
> However, I still think it's a fundamental problem that mpt3sas reports
> completion in terms of "SAS frame bytes (successfully?) transferred"
> instead of "blocks acknowledged by the target". The drive can obviously
> not write a partial sector. So that approach to completion reporting
> seems completely bogus to me.
> 
> Anyway. My main comment is that it seems like the rounding logic would
> be better contained in sd_completed_bytes() since that is already doing
> sanity checking on what's reported.

Indeed, that would be nicer. But sd_completed_bytes is only called when
sense data (a sense key) was also returned with the command. It is not
clear to me if that is always the case when an unaligned resid is
reported by the bogus mpt3sas HBA.

Handling all possible (bogus) cases may require a more extensive rewrite
of sd_completed_bytes, which may not be the best of choice this late in
the RC cycle.

Should I keep the fix in its current place and defer the move to
sd_completed_bytes to a later patch for 4.11 ?

> Also, I am no fan of perpetuating the arcane SCSI logging stuff now that
> we have tracing. So just make it a regular sd_printk().

Sure, will do.

Thank you.
Damien Le Moal Feb. 16, 2017, 5:17 a.m. UTC | #14
Bart,

On 2/16/17 12:28, Bart Van Assche wrote:
> On Thu, 2017-02-16 at 11:52 +0900, Damien Le Moal wrote:
>> Thanks for the pointers. I checked libiscsi tests. And from what is done
>> there, it seems to me that it is basically impossible to distinguished
>> between a buggy hardware response and an in-purpose (or buggy) not
>> aligned data-out buffer size.
>> E.g. the test in test_write10_residuals.c which issues a 1 block write
>> with a buf size of 10000 and checks that residual is 10000 - block size.
>> For that case, with both this patch and the original mpt3sas patch, the
>> rounding up of resid to the block size will break the test.
>>
>> Yet, it is very surprising that LIO and SCST_SCSI issue the "buggy"
>> request as is without checking and eventually correcting things first.
>> So unless LIO and SCST_SCSI are fixed to do that on the request they
>> send to the device, both patches should be dropped.
>> Users with buggy mpt3 HBAs will need to update the HBA FW...
>>
>> So the conclusion may be that we need to drop everything ? The mpt3sas
>> patch breaks ZBC now, so that must be removed too.
> 
> Hello Damien,
> 
> For the example you cited (bufflen - residual) = block size. This is
> why I proposed to round (bufflen - residual) instead of residual. Whether
> bufflen or (bufflen - residual) is rounded doesn't matter if bufflen is a
> multiple of the logical block size. But it matters if bufflen is not a
> multiple of the logical block size.

To be sure, I went through the scsi submission and completion path of
REQ_TYPE_FS again.

For REQ_OP_DISCARD, REQ_OP_WRITE_SAME, REQ_OP_ZONE_REPORT and
REQ_OP_ZONE_RESET, the resid check in sd_done is skipped thanks to the
special handling for these commands in the switch-case statement. So we
do not need to worry about scsi_buflen() alignment.

For REQ_OP_FLUSH, scsi_buflen() == 0. So no problem either.

All that remains of REQ_TYPE_FS are REQ_OP_READ and REQ_OP_WRITE. For
these, cmd->sdb.length (that is scsi_buflen) is set to count *
sector_size in sd_setup_read_write_cmnd. So scsi_buflen() will always be
aligned. This also means that things like the libiscsi tests with
incorrect data buffer size will actually not be seen as is by sd_done.
For the example of the test_write10_residuals.c of 1 block write using a
10000B buffer, sd_done will see scsi_buflen() of 1 block.

So it seems to me that for the buggy mpt3sas hardware, we really only
need to check resid alignment and correct it if necessary. The code you
proposed

if (resid & (sector_size - 1)) {
        resid = min(good_bytes, round_up(resid, sector_size));
        good_bytes -= resid;
        scsi_set_resid(SCpnt, resid);
}

looks just perfect for this.

Let me know if I missed something.

Best regards.
Guilherme G. Piccoli Feb. 16, 2017, 7:34 p.m. UTC | #15
On 15/02/2017 05:06, Ram Pai wrote:
> On Wed, Feb 15, 2017 at 03:48:52PM +0900, Damien Le Moal wrote:
>> Christoph,
>>
>> On 2/15/17 15:34, Christoph Hellwig wrote:
>>> this looks reasonable, but we should ask Guilherme and Ram to confirm
>>> it fixes their originally reported issue.  I've added them to Cc.
>>
>> Thank you.
>>
>> Guilherme, Ram,
>>
>> Please test ! The original fix breaks the zoned block device support
>> that was newly added to 4.10. So we need a fix for that and your issue
>> combined before Linus releases 4.10 stable this weekend.
> 
> Yes logically it looks correct and should fix our issue. 
> 
> I doubt Guilherme has access to the same adapter firmware that exposed
> the original problem. We will probably have to induce the erroneous
> behavior in the driver to reproduce the issue and verify the fix.

Ram, I do have access but the environment changed a lot, including
disks, setup of the distributed filesystem and tests...and so I wasn't
able to reproduce the issue anymore.

Since the logic of the new approach looks correct, I'm ok with the patch.

Thanks,


Guilherme

> 
> Will let you know,
> RP
>

Patch
diff mbox

diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 0b5b423..1961535 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -4658,7 +4658,6 @@  _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
 	struct MPT3SAS_DEVICE *sas_device_priv_data;
 	u32 response_code = 0;
 	unsigned long flags;
-	unsigned int sector_sz;
 
 	mpi_reply = mpt3sas_base_get_reply_virt_addr(ioc, reply);
 	scmd = _scsih_scsi_lookup_get_clear(ioc, smid);
@@ -4717,20 +4716,6 @@  _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
 	}
 
 	xfer_cnt = le32_to_cpu(mpi_reply->TransferCount);
-
-	/* In case of bogus fw or device, we could end up having
-	 * unaligned partial completion. We can force alignment here,
-	 * then scsi-ml does not need to handle this misbehavior.
-	 */
-	sector_sz = scmd->device->sector_size;
-	if (unlikely(scmd->request->cmd_type == REQ_TYPE_FS && sector_sz &&
-		     xfer_cnt % sector_sz)) {
-		sdev_printk(KERN_INFO, scmd->device,
-		    "unaligned partial completion avoided (xfer_cnt=%u, sector_sz=%u)\n",
-			    xfer_cnt, sector_sz);
-		xfer_cnt = round_down(xfer_cnt, sector_sz);
-	}
-
 	scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_cnt);
 	if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
 		log_info =  le32_to_cpu(mpi_reply->IOCLogInfo);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 1f5d92a..d05a328 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1790,6 +1790,8 @@  static int sd_done(struct scsi_cmnd *SCpnt)
 {
 	int result = SCpnt->result;
 	unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
+	unsigned int sector_size = SCpnt->device->sector_size;
+	unsigned int resid;
 	struct scsi_sense_hdr sshdr;
 	struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
 	struct request *req = SCpnt->request;
@@ -1820,6 +1822,24 @@  static int sd_done(struct scsi_cmnd *SCpnt)
 			scsi_set_resid(SCpnt, blk_rq_bytes(req));
 		}
 		break;
+	default:
+		/*
+	 	 * In case of bogus fw or device, we could end up having
+	 	 * an unaligned partial completion. Check this here and force
+	 	 * alignment.
+	 	 */
+		resid = scsi_get_resid(SCpnt);
+		if (resid & (sector_size - 1)) {
+			SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
+				"Unaligned partial completion (resid=%u, sector_sz=%u)\n",
+				resid, sector_size));
+			resid = round_up(resid, sector_size);
+			if (resid < good_bytes)
+				good_bytes -= resid;
+			else
+				good_bytes = 0;
+			scsi_set_resid(SCpnt, resid);
+		}
 	}
 
 	if (result) {