diff mbox series

[RESEND,v2,09/11] scsi: scsi_debug: Drop sdebug_dev_info.num_in_q

Message ID 20230313093114.1498305-10-john.g.garry@oracle.com (mailing list archive)
State Accepted
Headers show
Series scsi_debug: Some minor improvements | expand

Commit Message

John Garry March 13, 2023, 9:31 a.m. UTC
In schedule_resp(), under certain conditions we check whether the
per-device queue is full (num_in_q == queue depth - 1) and we may inject a
"task set full" (TSF) error if it is.

However how we read num_in_q is racy - many threads may see the same
"queue is full" value (and also issue a TSF).

There is per-queue locking in reading per-device num_in_q, but that would
not help.

Replace how we read num_in_q at this location with a call to
scsi_device_busy(). Calling scsi_device_busy() is likewise racy (as reading
num_in_q), so nothing lost or gained. Calling scsi_device_busy() is also
slow as it needs to read all bits in the per-device budget bitmap, but we
can live with that since we're just a simulator and it's only under
a certain configs which we would see this.

Also move the "task set full" print earlier as it would only be called
now under this condition. However, previously it may not have been
called - like returning early - but keep it simple and always call it.

At this point we can drop sdebug_dev_info.num_in_q - it is difficult to
maintain properly and adds extra normal case command processing.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/scsi/scsi_debug.c | 63 ++++++++++-----------------------------
 1 file changed, 16 insertions(+), 47 deletions(-)

Comments

kernel test robot March 20, 2023, 5:31 a.m. UTC | #1
Greeting,

FYI, we noticed blktests.scsi/004.fail due to commit (built with gcc-11):

commit: 76fe86662c283bf322ac1e032c46ddaa0da5cb16 ("[PATCH RESEND v2 09/11] scsi: scsi_debug: Drop sdebug_dev_info.num_in_q")
url: https://github.com/intel-lab-lkp/linux/commits/John-Garry/scsi-scsi_debug-Don-t-hold-driver-host-struct-pointer-in-host-hostdata/20230313-173528
base: https://git.kernel.org/cgit/linux/kernel/git/mkp/scsi.git for-next
patch link: https://lore.kernel.org/all/20230313093114.1498305-10-john.g.garry@oracle.com/
patch subject: [PATCH RESEND v2 09/11] scsi: scsi_debug: Drop sdebug_dev_info.num_in_q

in testcase: blktests
version: blktests-x86_64-e3f2ca5-1_20230228
with following parameters:

	disk: 1HDD
	test: scsi-group-02



on test machine: 16 threads 1 sockets Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz (Broadwell-DE) with 48G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):




If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202303201334.18b30edc-oliver.sang@intel.com


2023-03-18 00:18:58 sed "s:^:scsi/:" /lkp/benchmarks/blktests/tests/scsi-group-02
2023-03-18 00:18:58 ./check scsi/004 scsi/005
scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out command)
scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out command) [failed]
    runtime    ...  1.467s
    --- tests/scsi/004.out	2023-02-28 16:52:49.000000000 +0000
    +++ /lkp/benchmarks/blktests/results/nodev/scsi/004.out.bad	2023-03-18 00:19:00.232079954 +0000
    @@ -1,3 +1,2 @@
     Running scsi/004
    -Input/output error
     Test complete



To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
John Garry March 20, 2023, 11:41 a.m. UTC | #2
On 20/03/2023 05:31, kernel test robot wrote:
> 
> Greeting,
> 
> FYI, we noticed blktests.scsi/004.fail due to commit (built with gcc-11):
> 
> commit: 76fe86662c283bf322ac1e032c46ddaa0da5cb16 ("[PATCH RESEND v2 09/11] scsi: scsi_debug: Drop sdebug_dev_info.num_in_q")
> url: https://github.com/intel-lab-lkp/linux/commits/John-Garry/scsi-scsi_debug-Don-t-hold-driver-host-struct-pointer-in-host-hostdata/20230313-173528
> base: https://git.kernel.org/cgit/linux/kernel/git/mkp/scsi.git for-next
> patch link: https://lore.kernel.org/all/20230313093114.1498305-10-john.g.garry@oracle.com/
> patch subject: [PATCH RESEND v2 09/11] scsi: scsi_debug: Drop sdebug_dev_info.num_in_q
> 
> in testcase: blktests
> version: blktests-x86_64-e3f2ca5-1_20230228
> with following parameters:
> 
> 	disk: 1HDD
> 	test: scsi-group-02
> 
> 
> 
> on test machine: 16 threads 1 sockets Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz (Broadwell-DE) with 48G memory
> 
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
> 
> 
> 
> 
> If you fix the issue, kindly add following tag
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Link: https://lore.kernel.org/oe-lkp/202303201334.18b30edc-oliver.sang@intel.com
> 
> 
> 2023-03-18 00:18:58 sed "s:^:scsi/:" /lkp/benchmarks/blktests/tests/scsi-group-02
> 2023-03-18 00:18:58 ./check scsi/004 scsi/005
> scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out command)
> scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out command) [failed]
>      runtime    ...  1.467s
>      --- tests/scsi/004.out	2023-02-28 16:52:49.000000000 +0000
>      +++ /lkp/benchmarks/blktests/results/nodev/scsi/004.out.bad	2023-03-18 00:19:00.232079954 +0000
>      @@ -1,3 +1,2 @@
>       Running scsi/004
>      -Input/output error
>       Test complete
> 

I'll check this now.

> 
> 
> To reproduce:
> 
>          git clone https://github.com/intel/lkp-tests.git
>          cd lkp-tests
>          sudo bin/lkp install job.yaml           # job file is attached in this email
>          bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
>          sudo bin/lkp run generated-yaml-file
> 
>          # if come across any failure that blocks the test,
>          # please remove ~/.lkp and /lkp dir to run from a clean state.
> 
> 
>
John Garry March 22, 2023, 1:36 p.m. UTC | #3
On 20/03/2023 11:41, John Garry wrote:
>> scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out 
>> command) [failed]
>>      runtime    ...  1.467s
>>      --- tests/scsi/004.out    2023-02-28 16:52:49.000000000 +0000
>>      +++ /lkp/benchmarks/blktests/results/nodev/scsi/004.out.bad    
>> 2023-03-18 00:19:00.232079954 +0000
>>      @@ -1,3 +1,2 @@
>>       Running scsi/004
>>      -Input/output error
>>       Test complete
>>
> 
> I'll check this now.

This should fix it:

--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5580,7 +5580,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, 
struct sdebug_dev_info *devip,
                 int num_in_q = scsi_device_busy(sdp);
                 int qdepth = cmnd->device->queue_depth;

-               if ((num_in_q == (qdepth - 1)) &&
+               if ((num_in_q == qdepth) &&
                     (atomic_inc_return(&sdebug_a_tsf) >=


I'll send that change in my other scsi_debug series after I test further.
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 0d515bac93bf..449b460e4c1b 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -288,7 +288,6 @@  struct sdebug_dev_info {
 	uuid_t lu_name;
 	struct sdebug_host_info *sdbg_host;
 	unsigned long uas_bm[1];
-	atomic_t num_in_q;
 	atomic_t stopped;	/* 1: by SSU, 2: device start */
 	bool used;
 
@@ -4931,7 +4930,6 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
 	struct scsi_cmnd *scp;
-	struct sdebug_dev_info *devip;
 
 	if (unlikely(aborted))
 		sd_dp->aborted = false;
@@ -4956,11 +4954,7 @@  static void sdebug_q_cmd_complete(struct sdebug_defer *sd_dp)
 		       sd_dp->sqa_idx, qc_idx, sd_dp->hc_idx);
 		return;
 	}
-	devip = (struct sdebug_dev_info *)scp->device->hostdata;
-	if (likely(devip))
-		atomic_dec(&devip->num_in_q);
-	else
-		pr_err("devip=NULL\n");
+
 	if (unlikely(atomic_read(&retired_max_queue) > 0))
 		retiring = 1;
 
@@ -5192,7 +5186,6 @@  static struct sdebug_dev_info *find_build_dev_info(struct scsi_device *sdev)
 	open_devip->target = sdev->id;
 	open_devip->lun = sdev->lun;
 	open_devip->sdbg_host = sdbg_host;
-	atomic_set(&open_devip->num_in_q, 0);
 	set_bit(SDEBUG_UA_POOCCUR, open_devip->uas_bm);
 	open_devip->used = true;
 	return open_devip;
@@ -5263,7 +5256,6 @@  static bool stop_queued_cmnd(struct scsi_cmnd *cmnd)
 	enum sdeb_defer_type l_defer_t;
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
-	struct sdebug_dev_info *devip;
 	struct sdebug_defer *sd_dp;
 
 	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
@@ -5278,10 +5270,6 @@  static bool stop_queued_cmnd(struct scsi_cmnd *cmnd)
 				if (cmnd != sqcp->a_cmnd)
 					continue;
 				/* found */
-				devip = (struct sdebug_dev_info *)
-						cmnd->device->hostdata;
-				if (devip)
-					atomic_dec(&devip->num_in_q);
 				sqcp->a_cmnd = NULL;
 				sd_dp = sqcp->sd_dp;
 				if (sd_dp) {
@@ -5308,7 +5296,6 @@  static void stop_all_queued(void)
 	enum sdeb_defer_type l_defer_t;
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
-	struct sdebug_dev_info *devip;
 	struct sdebug_defer *sd_dp;
 
 	for (j = 0, sqp = sdebug_q_arr; j < submit_queues; ++j, ++sqp) {
@@ -5318,10 +5305,6 @@  static void stop_all_queued(void)
 				sqcp = &sqp->qc_arr[k];
 				if (sqcp->a_cmnd == NULL)
 					continue;
-				devip = (struct sdebug_dev_info *)
-					sqcp->a_cmnd->device->hostdata;
-				if (devip)
-					atomic_dec(&devip->num_in_q);
 				sqcp->a_cmnd = NULL;
 				sd_dp = sqcp->sd_dp;
 				if (sd_dp) {
@@ -5565,9 +5548,8 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 			 int delta_jiff, int ndelay)
 {
 	bool new_sd_dp;
-	bool inject = false;
 	bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED;
-	int k, num_in_q, qdepth;
+	int k;
 	unsigned long iflags;
 	u64 ns_from_boot = 0;
 	struct sdebug_queue *sqp;
@@ -5591,16 +5573,21 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 		spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 		return SCSI_MLQUEUE_HOST_BUSY;
 	}
-	num_in_q = atomic_read(&devip->num_in_q);
-	qdepth = cmnd->device->queue_depth;
+
 	if (unlikely(sdebug_every_nth && (SDEBUG_OPT_RARE_TSF & sdebug_opts) &&
 		     (scsi_result == 0))) {
+		int num_in_q = scsi_device_busy(sdp);
+		int qdepth = cmnd->device->queue_depth;
+
 		if ((num_in_q == (qdepth - 1)) &&
 		    (atomic_inc_return(&sdebug_a_tsf) >=
 		     abs(sdebug_every_nth))) {
 			atomic_set(&sdebug_a_tsf, 0);
-			inject = true;
 			scsi_result = device_qfull_result;
+
+			if (unlikely(SDEBUG_OPT_Q_NOISE & sdebug_opts))
+				sdev_printk(KERN_INFO, sdp, "%s: num_in_q=%d +1, <inject> status: TASK SET FULL\n",
+					    __func__, num_in_q);
 		}
 	}
 
@@ -5616,7 +5603,6 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 		goto respond_in_thread;
 	}
 	set_bit(k, sqp->in_use_bm);
-	atomic_inc(&devip->num_in_q);
 	sqcp = &sqp->qc_arr[k];
 	sqcp->a_cmnd = cmnd;
 	cmnd->host_scribble = (unsigned char *)sqcp;
@@ -5626,7 +5612,6 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 	if (!sd_dp) {
 		sd_dp = kzalloc(sizeof(*sd_dp), GFP_ATOMIC);
 		if (!sd_dp) {
-			atomic_dec(&devip->num_in_q);
 			clear_bit(k, sqp->in_use_bm);
 			return SCSI_MLQUEUE_HOST_BUSY;
 		}
@@ -5686,7 +5671,6 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 				if (kt <= d) {	/* elapsed duration >= kt */
 					spin_lock_irqsave(&sqp->qc_lock, iflags);
 					sqcp->a_cmnd = NULL;
-					atomic_dec(&devip->num_in_q);
 					clear_bit(k, sqp->in_use_bm);
 					spin_unlock_irqrestore(&sqp->qc_lock, iflags);
 					if (new_sd_dp)
@@ -5762,9 +5746,7 @@  static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
 			sd_dp->aborted = false;
 		}
 	}
-	if (unlikely((SDEBUG_OPT_Q_NOISE & sdebug_opts) && scsi_result == device_qfull_result))
-		sdev_printk(KERN_INFO, sdp, "%s: num_in_q=%d +1, %s%s\n", __func__,
-			    num_in_q, (inject ? "<inject> " : ""), "status: TASK SET FULL");
+
 	return 0;
 
 respond_in_thread:	/* call back to mid-layer using invocation thread */
@@ -7369,17 +7351,12 @@  static void sdebug_do_remove_host(bool the_end)
 
 static int sdebug_change_qdepth(struct scsi_device *sdev, int qdepth)
 {
-	int num_in_q = 0;
-	struct sdebug_dev_info *devip;
+	struct sdebug_dev_info *devip = sdev->hostdata;
 
-	block_unblock_all_queues(true);
-	devip = (struct sdebug_dev_info *)sdev->hostdata;
-	if (NULL == devip) {
-		block_unblock_all_queues(false);
+	if (!devip)
 		return	-ENODEV;
-	}
-	num_in_q = atomic_read(&devip->num_in_q);
 
+	block_unblock_all_queues(true);
 	if (qdepth > SDEBUG_CANQUEUE) {
 		qdepth = SDEBUG_CANQUEUE;
 		pr_warn("%s: requested qdepth [%d] exceeds canqueue [%d], trim\n", __func__,
@@ -7390,10 +7367,8 @@  static int sdebug_change_qdepth(struct scsi_device *sdev, int qdepth)
 	if (qdepth != sdev->queue_depth)
 		scsi_change_queue_depth(sdev, qdepth);
 
-	if (SDEBUG_OPT_Q_NOISE & sdebug_opts) {
-		sdev_printk(KERN_INFO, sdev, "%s: qdepth=%d, num_in_q=%d\n",
-			    __func__, qdepth, num_in_q);
-	}
+	if (SDEBUG_OPT_Q_NOISE & sdebug_opts)
+		sdev_printk(KERN_INFO, sdev, "%s: qdepth=%d\n", __func__, qdepth);
 	block_unblock_all_queues(false);
 	return sdev->queue_depth;
 }
@@ -7495,7 +7470,6 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 	struct sdebug_queue *sqp;
 	struct sdebug_queued_cmd *sqcp;
 	struct scsi_cmnd *scp;
-	struct sdebug_dev_info *devip;
 	struct sdebug_defer *sd_dp;
 
 	sqp = sdebug_q_arr + queue_num;
@@ -7533,11 +7507,6 @@  static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
 
 		} else		/* ignoring non REQ_POLLED requests */
 			continue;
-		devip = (struct sdebug_dev_info *)scp->device->hostdata;
-		if (likely(devip))
-			atomic_dec(&devip->num_in_q);
-		else
-			pr_err("devip=NULL from %s\n", __func__);
 		if (unlikely(atomic_read(&retired_max_queue) > 0))
 			retiring = true;