diff mbox

[RFC] hv_storvsc: error handling.

Message ID DM5PR03MB24903E60DFFC3660DD23C564A02A0@DM5PR03MB2490.namprd03.prod.outlook.com (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

KY Srinivasan March 4, 2017, 11:55 p.m. UTC
> -----Original Message-----

> From: KY Srinivasan

> Sent: Saturday, March 4, 2017 1:40 PM

> To: 'James Bottomley' <jejb@linux.vnet.ibm.com>; Stephen Hemminger

> <stephen@networkplumber.org>

> Cc: Hannes Reinecke <hare@suse.de>; Christoph Hellwig <hch@lst.de>; Jens

> Axboe <axboe@kernel.dk>; Linus Torvalds <torvalds@linux-

> foundation.org>; Martin K. Petersen <martin.petersen@oracle.com>;

> Dexuan Cui <decui@microsoft.com>; Long Li <longli@microsoft.com>; Josh

> Poulson <jopoulso@microsoft.com>; Adrian Suhov (Cloudbase Solutions SRL)

> <v-adsuho@microsoft.com>; linux-scsi@vger.kernel.org; Haiyang Zhang

> <haiyangz@microsoft.com>

> Subject: RE: [RFC] hv_storvsc: error handling.

> 

> 

> 

> > -----Original Message-----

> > From: James Bottomley [mailto:jejb@linux.vnet.ibm.com]

> > Sent: Saturday, March 4, 2017 1:37 PM

> > To: KY Srinivasan <kys@microsoft.com>; Stephen Hemminger

> > <stephen@networkplumber.org>

> > Cc: Hannes Reinecke <hare@suse.de>; Christoph Hellwig <hch@lst.de>;

> Jens

> > Axboe <axboe@kernel.dk>; Linus Torvalds <torvalds@linux-

> > foundation.org>; Martin K. Petersen <martin.petersen@oracle.com>;

> > Dexuan Cui <decui@microsoft.com>; Long Li <longli@microsoft.com>; Josh

> > Poulson <jopoulso@microsoft.com>; Adrian Suhov (Cloudbase Solutions

> SRL)

> > <v-adsuho@microsoft.com>; linux-scsi@vger.kernel.org; Haiyang Zhang

> > <haiyangz@microsoft.com>

> > Subject: Re: [RFC] hv_storvsc: error handling.

> >

> > On Sat, 2017-03-04 at 21:03 +0000, KY Srinivasan wrote:

> > >

> > > > -----Original Message-----

> > > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]

> > > > Sent: Friday, March 3, 2017 4:50 PM

> > > > To: James Bottomley <James.Bottomley@Hansenpartnership.com>

> > > > Cc: Hannes Reinecke <hare@suse.de>; Christoph Hellwig

> <hch@lst.de>;

> > > > James Bottomley <jejb@linux.vnet.ibm.com>; Jens Axboe

> > > > <axboe@kernel.dk>; Linus Torvalds <torvalds@linux-foundation.org>;

> > > > Martin K. Petersen <martin.petersen@oracle.com>; KY Srinivasan

> > > > <kys@microsoft.com>; Dexuan Cui <decui@microsoft.com>; Long Li

> > > > <longli@microsoft.com>; Josh Poulson <jopoulso@microsoft.com>;

> > > > Adrian

> > > > Suhov (Cloudbase Solutions SRL) <v-adsuho@microsoft.com>; linux-

> > > > scsi@vger.kernel.org; Haiyang Zhang <haiyangz@microsoft.com>

> > > > Subject: [RFC] hv_storvsc: error handling.

> > > >

> > > > Needs more testing but this does fix the observed problem.

> > > >

> > > > From: Stephen Hemminger <sthemmin@microsoft.com>

> > > >

> > > > Subject: [PATCH] hv_storvsc: fix error handling

> > > >

> > > > The Hyper-V storvsc SCSI driver was hiding all errors in INQUIRY

> > > > and

> > > > MODE_SENSE commands. This caused the scan process to incorrectly

> > > > think

> > > > devices were present and online.  Also invalid LUN errors were not

> > > > being handled correctly.

> > > >

> > > > This fixes problems booting a GEN2 VM on Hyper-V. It effectively

> > > > reverts commit 4ed51a21c0f69 ("Staging: hv: storvsc: Fixup

> > > > srb and scsi status for INQUIRY and MODE_SENSE")

> > > >

> > > > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>

> > > > ---

> > > >  drivers/scsi/storvsc_drv.c | 48 ++++------------------------------

> > > > ------------

> > > >  1 file changed, 4 insertions(+), 44 deletions(-)

> > > >

> > > > diff --git a/drivers/scsi/storvsc_drv.c

> > > > b/drivers/scsi/storvsc_drv.c

> > > > index 638e5f427c90..8cc241fc54b8 100644

> > > > --- a/drivers/scsi/storvsc_drv.c

> > > > +++ b/drivers/scsi/storvsc_drv.c

> > > > @@ -543,28 +543,6 @@ static void storvsc_host_scan(struct

> > > > work_struct

> > > > *work)

> > > >  	kfree(wrk);

> > > >  }

> > > >

> > > > -static void storvsc_remove_lun(struct work_struct *work)

> > > > -{

> > > > -	struct storvsc_scan_work *wrk;

> > > > -	struct scsi_device *sdev;

> > > > -

> > > > -	wrk = container_of(work, struct storvsc_scan_work, work);

> > > > -	if (!scsi_host_get(wrk->host))

> > > > -		goto done;

> > > > -

> > > > -	sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk

> > > > ->lun);

> > > > -

> > > > -	if (sdev) {

> > > > -		scsi_remove_device(sdev);

> > > > -		scsi_device_put(sdev);

> > > > -	}

> > > > -	scsi_host_put(wrk->host);

> > > > -

> > > > -done:

> > > > -	kfree(wrk);

> > > > -}

> > > > -

> > > > -

> > > >  /*

> > > >   * We can get incoming messages from the host that are not in

> > > > response to

> > > >   * messages that we have sent out. An example of this would be

> > > > messages

> > > > @@ -955,8 +933,7 @@ static void storvsc_handle_error(struct

> > > > vmscsi_request *vm_srb,

> > > >  		}

> > > >  		break;

> > > >  	case SRB_STATUS_INVALID_LUN:

> > > > -		do_work = true;

> > > > -		process_err_fn = storvsc_remove_lun;

> > > > +		set_host_byte(scmnd, DID_NO_CONNECT);

> > > >  		break;

> > > >  	case SRB_STATUS_ABORTED:

> > > >  		if (vm_srb->srb_status &

> > > > SRB_STATUS_AUTOSENSE_VALID

> > > > &&

> > > > @@ -1050,32 +1027,15 @@ static void storvsc_on_io_completion(struct

> > > > storvsc_device *stor_device,

> > > >

> > > >  	stor_pkt = &request->vstor_packet;

> > > >

> > > > -	/*

> > > > -	 * The current SCSI handling on the host side does

> > > > -	 * not correctly handle:

> > > > -	 * INQUIRY command with page code parameter set to 0x80

> > > > -	 * MODE_SENSE command with cmd[2] == 0x1c

> > > > -	 *

> > > > -	 * Setup srb and scsi status so this won't be fatal.

> > > > -	 * We do this so we can distinguish truly fatal failues

> > > > -	 * (srb status == 0x4) and off-line the device in that

> > > > case.

> > > > -	 */

> > > > -

> > > > -	if ((stor_pkt->vm_srb.cdb[0] == INQUIRY) ||

> > > > -	   (stor_pkt->vm_srb.cdb[0] == MODE_SENSE)) {

> > > > -		vstor_packet->vm_srb.scsi_status = 0;

> > > > -		vstor_packet->vm_srb.srb_status =

> > > > SRB_STATUS_SUCCESS;

> > > > -	}

> > > > -

> > > > -

> > > >  	/* Copy over the status...etc */

> > > >  	stor_pkt->vm_srb.scsi_status = vstor_packet

> > > > ->vm_srb.scsi_status;

> > > >  	stor_pkt->vm_srb.srb_status = vstor_packet

> > > > ->vm_srb.srb_status;

> > > >  	stor_pkt->vm_srb.sense_info_length =

> > > >  	vstor_packet->vm_srb.sense_info_length;

> > > >

> > > > -	if (vstor_packet->vm_srb.scsi_status != 0 ||

> > > > -	    vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS)

> > > > +	if (stor_pkt->vm_srb.cdb[0] != INQUIRY &&

> > > > +	    (vstor_packet->vm_srb.scsi_status != 0 ||

> > > > +	     vstor_packet->vm_srb.srb_status !=

> > > > SRB_STATUS_SUCCESS))

> > > >  		storvsc_log(device, STORVSC_LOGGING_WARN,

> > > >  			"cmd 0x%x scsi status 0x%x srb status

> > > > 0x%x\n",

> > > >  			stor_pkt->vm_srb.cdb[0],

> > > > --

> > >

> > > This patch gets rid of the ability to "hot remove" LUNs. I don't

> > > think that can be part of any

> > > solution. The INQUIRY hack I put in a long time ago was to deal with

> > > host bugs on prior versions of

> > > Windows server. WS2016 should not be trigerring this code. Stephen,

> > > could you please test this patch -

> > > a quick hack:

> > >

> > > From b97f24f224a71a6e745c42e5640045a553eb407c Mon Sep 17 00:00:00

> > > 2001

> > > From: K. Y. Srinivasan <kys@microsoft.com>

> > > Date: Sat, 4 Mar 2017 14:00:46 -0700

> > > Subject: [PATCH 1/1] scsi: storvsc: Fix a bug in LUN removal code

> > > Reply-To: kys@microsoft.com

> > >

> > > Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>

> > > ---

> > >  drivers/scsi/storvsc_drv.c |   13 +++++++++++++

> > >  1 files changed, 13 insertions(+), 0 deletions(-)

> > >

> > > diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c

> > > index 05526b7..27eb682 100644

> > > --- a/drivers/scsi/storvsc_drv.c

> > > +++ b/drivers/scsi/storvsc_drv.c

> > > @@ -885,6 +885,7 @@ static void storvsc_handle_error(struct

> > > vmscsi_request *vm_srb,

> > >         struct storvsc_scan_work *wrk;

> > >         void (*process_err_fn)(struct work_struct *work);

> > >         bool do_work = false;

> > > +       struct scsi_device *sdev;

> > >

> > >         switch (SRB_STATUS(vm_srb->srb_status)) {

> > >         case SRB_STATUS_ERROR:

> > > @@ -911,6 +912,18 @@ static void storvsc_handle_error(struct

> > > vmscsi_request *vm_srb,

> > >                 }

> > >                 break;

> > >         case SRB_STATUS_INVALID_LUN:

> > > +               if (!scsi_host_get(host)) {

> > > +                       set_host_byte(scmnd, DID_NO_CONNECT);

> > > +                       break;

> > > +               }

> > > +

> > > +               sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id,

> > > wrk->lun);

> > > +

> > > +               if (!sdev) {

> > > +                       set_host_byte(scmnd, DID_NO_CONNECT);

> > > +                       break;

> > > +               }

> > > +

> >

> > You're now getting two references (one to the host and one to the

> > device) that you don't put either in error handling or in

> > storvsc_remove_lun().

> >

> > Probably you should eliminate the scsi_host_get and scsi_device_lookup

> > in storvsc_remove_lun() (making it basically remove device put device

> > put host) and add a host put in the !sdev if above.

> 

> Yes; this was a quick hack to prove a theory. This should work (I hope); I will

> cleanup and resend once

> Stephen verifies.


Stephen,

Please test this patch. Have addressed the ref count issue that James pointed out.

From 6d57fb68c40165e61d84b6766b2263747d5e241d Mon Sep 17 00:00:00 2001
From: K. Y. Srinivasan <kys@microsoft.com>

Date: Sat, 4 Mar 2017 15:57:26 -0700
Subject: [PATCH 1/1] scsi: storvsc: Fix a bug in LUN removal code
Reply-To: kys@microsoft.com

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>

---
 drivers/scsi/storvsc_drv.c |   33 +++++++++++++++++++++++----------
 1 files changed, 23 insertions(+), 10 deletions(-)

--
1.7.1

> 

> K. Y

> >

> > James
diff mbox

Patch

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 05526b7..2bd132d 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,7 @@  struct hv_host_device {
 struct storvsc_scan_work {
        struct work_struct work;
        struct Scsi_Host *host;
+       struct scsi_device *sdev;
        u8 lun;
        u8 tgt_id;
 };
@@ -531,24 +532,20 @@  static void storvsc_host_scan(struct work_struct *work)
        kfree(wrk);
 }

+/*
+ * On Entry we have a reference on both the host and the device.
+ * Drop them here.
+ */
 static void storvsc_remove_lun(struct work_struct *work)
 {
        struct storvsc_scan_work *wrk;
-       struct scsi_device *sdev;

        wrk = container_of(work, struct storvsc_scan_work, work);
-       if (!scsi_host_get(wrk->host))
-               goto done;
-
-       sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun);

-       if (sdev) {
-               scsi_remove_device(sdev);
-               scsi_device_put(sdev);
-       }
+       scsi_remove_device(wrk->sdev);
+       scsi_device_put(wrk->sdev);
        scsi_host_put(wrk->host);

-done:
        kfree(wrk);
 }

@@ -885,6 +882,7 @@  static void storvsc_handle_error(struct vmscsi_request *vm_srb,
        struct storvsc_scan_work *wrk;
        void (*process_err_fn)(struct work_struct *work);
        bool do_work = false;
+       struct scsi_device *sdev;

        switch (SRB_STATUS(vm_srb->srb_status)) {
        case SRB_STATUS_ERROR:
@@ -911,9 +909,23 @@  static void storvsc_handle_error(struct vmscsi_request *vm_srb,
                }
                break;
        case SRB_STATUS_INVALID_LUN:
+               if (!scsi_host_get(host))
+                       goto host_not_known;
+
+               sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun);
+
+               if (!sdev)
+                       goto lun_not_known;
+
                do_work = true;
                process_err_fn = storvsc_remove_lun;
                break;
+
+lun_not_known:
+               scsi_host_put(host);
+host_not_known:
+               set_host_byte(scmnd, DID_NO_CONNECT);
+               break;
        case SRB_STATUS_ABORTED:
                if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID &&
                    (asc == 0x2a) && (ascq == 0x9)) {
@@ -939,6 +951,7 @@  static void storvsc_handle_error(struct vmscsi_request *vm_srb,
                return;
        }

+       wrk->sdev = sdev;
        wrk->host = host;
        wrk->lun = vm_srb->lun;
        wrk->tgt_id = vm_srb->target_id;