diff mbox

[27/36] scsi_dh_alua: Use workqueue for RTPG

Message ID 1443523658-87622-28-git-send-email-hare@suse.de (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Hannes Reinecke Sept. 29, 2015, 10:47 a.m. UTC
The current ALUA device_handler has two drawbacks:
- We're sending a 'SET TARGET PORT GROUPS' command to every LUN,
  disregarding the fact that several LUNs might be in a port group
  and will be automatically switched whenever _any_ LUN within
  that port group receives the command.
- Whenever a LUN is in 'transitioning' mode we cannot block I/O
  to that LUN, instead the controller has to abort the command.
  This leads to increased traffic across the wire and heavy load
  on the controller during switchover.

With this patch the RTPG handling is moved to a per-portgroup
workqueue. This reduces the number of 'REPORT TARGET PORT GROUPS'
and 'SET TARGET PORT GROUPS' sent to the controller as we're sending
them now per port group, and not per device as previously.
It also allows us to block I/O to any LUN / port group found to be
in 'transitioning' ALUA mode, as the workqueue item will be requeued
until the controller moves out of transitioning.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/scsi/device_handler/scsi_dh_alua.c | 327 +++++++++++++++++++++++------
 1 file changed, 267 insertions(+), 60 deletions(-)

Comments

kernel test robot Sept. 29, 2015, 1:27 p.m. UTC | #1
Hi Hannes,

[auto build test results on v4.3-rc3 -- if it's inappropriate base, please ignore]

reproduce:
  # apt-get install sparse
  make ARCH=x86_64 allmodconfig
  make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   drivers/scsi/device_handler/scsi_dh_alua.c:142:16: sparse: Variable length array is used.
   drivers/scsi/device_handler/scsi_dh_alua.c:171:16: sparse: Variable length array is used.
   drivers/scsi/device_handler/scsi_dh_alua.c:194:24: sparse: symbol 'alua_lookup_pg' was not declared. Should it be static?
   drivers/scsi/device_handler/scsi_dh_alua.c:222:24: sparse: symbol 'alua_get_pg' was not declared. Should it be static?
>> drivers/scsi/device_handler/scsi_dh_alua.c:856:14: sparse: incompatible types in comparison expression (different address spaces)
   drivers/scsi/device_handler/scsi_dh_alua.c:900:14: sparse: incompatible types in comparison expression (different address spaces)
   drivers/scsi/device_handler/scsi_dh_alua.c:935:14: sparse: incompatible types in comparison expression (different address spaces)

vim +856 drivers/scsi/device_handler/scsi_dh_alua.c

   840		struct alua_dh_data *h = sdev->handler_data;
   841		struct alua_port_group *pg = NULL;
   842		unsigned int optimize = 0, argc;
   843		const char *p = params;
   844		int result = SCSI_DH_OK;
   845		unsigned long flags;
   846	
   847		if ((sscanf(params, "%u", &argc) != 1) || (argc != 1))
   848			return -EINVAL;
   849	
   850		while (*p++)
   851			;
   852		if ((sscanf(p, "%u", &optimize) != 1) || (optimize > 1))
   853			return -EINVAL;
   854	
   855		rcu_read_lock();
 > 856		pg = rcu_dereference(h->pg);
   857		if (!pg) {
   858			rcu_read_unlock();
   859			return -ENXIO;
   860		}
   861		spin_lock_irqsave(&pg->lock, flags);
   862		if (optimize)
   863			pg->flags |= ALUA_OPTIMIZE_STPG;
   864		else

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Oct. 1, 2015, 11:34 p.m. UTC | #2
On 09/29/2015 03:48 AM, Hannes Reinecke wrote:
> +static void alua_rtpg_work(struct work_struct *work)
> +{
> +	struct alua_port_group *pg =
> +		container_of(work, struct alua_port_group, rtpg_work.work);
> +	struct scsi_device *sdev;
> +	LIST_HEAD(qdata_list);
> +	int err = SCSI_DH_OK;
> +	struct alua_queue_data *qdata, *tmp;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pg->lock, flags);
> +	sdev = pg->rtpg_sdev;
> +	if (!sdev) {
> +		WARN_ON(pg->flags & ALUA_PG_RUN_RTPG ||
> +			pg->flags & ALUA_PG_RUN_STPG);
> +		spin_unlock_irqrestore(&pg->lock, flags);
> +		return;
> +	}
> +	pg->flags |= ALUA_PG_RUNNING;
> +	if (pg->flags & ALUA_PG_RUN_RTPG) {
> +		spin_unlock_irqrestore(&pg->lock, flags);
> +		err = alua_rtpg(sdev, pg);
> +		spin_lock_irqsave(&pg->lock, flags);
> +		if (err == SCSI_DH_RETRY) {
> +			pg->flags &= ~ALUA_PG_RUNNING;
> +			spin_unlock_irqrestore(&pg->lock, flags);
> +			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
> +					   pg->interval * HZ);
> +			return;
> +		}
> +		pg->flags &= ~ALUA_PG_RUN_RTPG;
> +		if (err != SCSI_DH_OK)
> +			pg->flags &= ~ALUA_PG_RUN_STPG;
> +	}
> +	if (pg->flags & ALUA_PG_RUN_STPG) {
> +		spin_unlock_irqrestore(&pg->lock, flags);
> +		err = alua_stpg(sdev, pg);
> +		spin_lock_irqsave(&pg->lock, flags);
> +		pg->flags &= ~ALUA_PG_RUN_STPG;
> +		if (err == SCSI_DH_RETRY) {
> +			pg->flags |= ALUA_PG_RUN_RTPG;
> +			pg->interval = 0;
> +			pg->flags &= ~ALUA_PG_RUNNING;
> +			spin_unlock_irqrestore(&pg->lock, flags);
> +			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
> +					   pg->interval * HZ);
> +			return;
> +		}
> +	}
> +
> +	list_splice_init(&pg->rtpg_list, &qdata_list);
> +	pg->rtpg_sdev = NULL;
> +	spin_unlock_irqrestore(&pg->lock, flags);
> +
> +	list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
> +		list_del(&qdata->entry);
> +		if (qdata->callback_fn)
> +			qdata->callback_fn(qdata->callback_data, err);
> +		kfree(qdata);
> +	}
> +	spin_lock_irqsave(&pg->lock, flags);
> +	pg->flags &= ~ALUA_PG_RUNNING;
> +	spin_unlock_irqrestore(&pg->lock, flags);
> +	scsi_device_put(sdev);
> +	kref_put(&pg->kref, release_port_group);
> +}

With this patch series applied kmemleak reports several leaks that were 
not reported without this patch. Is scsi_device_put() + kref_put() 
always called before this function returns without requeueing the work 
item ? Shouldn't the return value of queue_delayed_work() be checked ? 
The leaks reported by kmemleak are:

unreferenced object 0xffff880423d31728 (size 128):
   comm "kworker/2:3", pid 3589, jiffies 4294946634 (age 501.720s)
   hex dump (first 32 bytes):
     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
   backtrace:
     [<ffffffff814ed75e>] kmemleak_alloc+0x4e/0xb0
     [<ffffffff811a9128>] kmem_cache_alloc_trace+0xc8/0x210
     [<ffffffffa0269d78>] alua_rtpg_work+0x1b8/0xc10 [scsi_dh_alua]
     [<ffffffff8108d9a8>] process_one_work+0x1d8/0x610
     [<ffffffff8108def4>] worker_thread+0x114/0x460
     [<ffffffff810941e8>] kthread+0xf8/0x110
     [<ffffffff814f5c2f>] ret_from_fork+0x3f/0x70
     [<ffffffffffffffff>] 0xffffffffffffffff
unreferenced object 0xffff88042ba00150 (size 8):
   comm "srp_daemon", pid 608, jiffies 4294946454 (age 503.510s)
   hex dump (first 8 bytes):
     68 6f 73 74 31 35 00 a5                          host15..
   backtrace:
     [<ffffffff814ed75e>] kmemleak_alloc+0x4e/0xb0
     [<ffffffff811abd53>] __kmalloc_track_caller+0xe3/0x240
     [<ffffffff81295ce2>] kvasprintf+0x52/0x80
     [<ffffffff8128adbf>] kobject_set_name_vargs+0x1f/0x60
     [<ffffffff81382217>] dev_set_name+0x47/0x50
     [<ffffffffa000dcaa>] scsi_host_alloc+0x32a/0x4b0 [scsi_mod]
     [<ffffffffa0279524>] srp_create_target+0x54/0x1410 [ib_srp]
     [<ffffffff81381bf8>] dev_attr_store+0x18/0x30
     [<ffffffff812300f4>] sysfs_kf_write+0x44/0x60
     [<ffffffff8122f724>] kernfs_fop_write+0x144/0x190
     [<ffffffff811b8788>] __vfs_write+0x28/0xe0
     [<ffffffff811b8e19>] vfs_write+0xa9/0x190
     [<ffffffff811b9b19>] SyS_write+0x49/0xa0
     [<ffffffff814f5876>] entry_SYSCALL_64_fastpath+0x16/0x7a
     [<ffffffffffffffff>] 0xffffffffffffffff

Bart.

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Reinecke Oct. 2, 2015, 5:59 a.m. UTC | #3
On 10/02/2015 01:34 AM, Bart Van Assche wrote:
> On 09/29/2015 03:48 AM, Hannes Reinecke wrote:
>> +static void alua_rtpg_work(struct work_struct *work)
>> +{
>> +    struct alua_port_group *pg =
>> +        container_of(work, struct alua_port_group, rtpg_work.work);
>> +    struct scsi_device *sdev;
>> +    LIST_HEAD(qdata_list);
>> +    int err = SCSI_DH_OK;
>> +    struct alua_queue_data *qdata, *tmp;
>> +    unsigned long flags;
>> +
>> +    spin_lock_irqsave(&pg->lock, flags);
>> +    sdev = pg->rtpg_sdev;
>> +    if (!sdev) {
>> +        WARN_ON(pg->flags & ALUA_PG_RUN_RTPG ||
>> +            pg->flags & ALUA_PG_RUN_STPG);
>> +        spin_unlock_irqrestore(&pg->lock, flags);
>> +        return;
>> +    }
>> +    pg->flags |= ALUA_PG_RUNNING;
>> +    if (pg->flags & ALUA_PG_RUN_RTPG) {
>> +        spin_unlock_irqrestore(&pg->lock, flags);
>> +        err = alua_rtpg(sdev, pg);
>> +        spin_lock_irqsave(&pg->lock, flags);
>> +        if (err == SCSI_DH_RETRY) {
>> +            pg->flags &= ~ALUA_PG_RUNNING;
>> +            spin_unlock_irqrestore(&pg->lock, flags);
>> +            queue_delayed_work(kaluad_wq, &pg->rtpg_work,
>> +                       pg->interval * HZ);
>> +            return;
>> +        }
>> +        pg->flags &= ~ALUA_PG_RUN_RTPG;
>> +        if (err != SCSI_DH_OK)
>> +            pg->flags &= ~ALUA_PG_RUN_STPG;
>> +    }
>> +    if (pg->flags & ALUA_PG_RUN_STPG) {
>> +        spin_unlock_irqrestore(&pg->lock, flags);
>> +        err = alua_stpg(sdev, pg);
>> +        spin_lock_irqsave(&pg->lock, flags);
>> +        pg->flags &= ~ALUA_PG_RUN_STPG;
>> +        if (err == SCSI_DH_RETRY) {
>> +            pg->flags |= ALUA_PG_RUN_RTPG;
>> +            pg->interval = 0;
>> +            pg->flags &= ~ALUA_PG_RUNNING;
>> +            spin_unlock_irqrestore(&pg->lock, flags);
>> +            queue_delayed_work(kaluad_wq, &pg->rtpg_work,
>> +                       pg->interval * HZ);
>> +            return;
>> +        }
>> +    }
>> +
>> +    list_splice_init(&pg->rtpg_list, &qdata_list);
>> +    pg->rtpg_sdev = NULL;
>> +    spin_unlock_irqrestore(&pg->lock, flags);
>> +
>> +    list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
>> +        list_del(&qdata->entry);
>> +        if (qdata->callback_fn)
>> +            qdata->callback_fn(qdata->callback_data, err);
>> +        kfree(qdata);
>> +    }
>> +    spin_lock_irqsave(&pg->lock, flags);
>> +    pg->flags &= ~ALUA_PG_RUNNING;
>> +    spin_unlock_irqrestore(&pg->lock, flags);
>> +    scsi_device_put(sdev);
>> +    kref_put(&pg->kref, release_port_group);
>> +}
> 
> With this patch series applied kmemleak reports several leaks that
> were not reported without this patch. Is scsi_device_put() +
> kref_put() always called before this function returns without
> requeueing the work item ? Shouldn't the return value of
> queue_delayed_work() be checked ? The leaks reported by kmemleak are:
> 
Yes, you are right. I need to check queue_delayed_work() and issue
a scsi_device_put()/kref_put() if the item is already queued.

Cheers,

Hannes
diff mbox

Patch

diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 4113b76..1b7153f 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -59,13 +59,23 @@ 
 #define ALUA_RTPG_SIZE			128
 #define ALUA_FAILOVER_TIMEOUT		60
 #define ALUA_FAILOVER_RETRIES		5
+#define ALUA_RTPG_DELAY_MSECS		5
 
 /* device handler flags */
-#define ALUA_OPTIMIZE_STPG		1
-#define ALUA_RTPG_EXT_HDR_UNSUPP	2
+#define ALUA_OPTIMIZE_STPG		0x01
+#define ALUA_RTPG_EXT_HDR_UNSUPP	0x02
+/* State machine flags */
+#define ALUA_PG_RUN_RTPG		0x10
+#define ALUA_PG_RUN_STPG		0x20
+#define ALUA_PG_RUNNING			0x40
+
+static uint optimize_stpg;
+module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than sending a STPG, when implicit TPGS is supported (0=No,1=Yes). Default is 0.");
 
 static LIST_HEAD(port_group_list);
 static DEFINE_SPINLOCK(port_group_lock);
+static struct workqueue_struct *kaluad_wq;
 
 struct alua_port_group {
 	struct kref		kref;
@@ -78,12 +88,25 @@  struct alua_port_group {
 	int			pref;
 	unsigned		flags; /* used for optimizing STPG */
 	unsigned char		transition_tmo;
+	unsigned long		expiry;
+	unsigned long		interval;
+	struct delayed_work	rtpg_work;
+	spinlock_t		lock;
+	struct list_head	rtpg_list;
+	struct scsi_device	*rtpg_sdev;
 };
 
 struct alua_dh_data {
 	struct alua_port_group	*pg;
 	int			rel_port;
+	spinlock_t		pg_lock;
 	struct scsi_device	*sdev;
+	int			init_error;
+	struct mutex		init_mutex;
+};
+
+struct alua_queue_data {
+	struct list_head	entry;
 	activate_complete	callback_fn;
 	void			*callback_data;
 };
@@ -91,8 +114,10 @@  struct alua_dh_data {
 #define ALUA_POLICY_SWITCH_CURRENT	0
 #define ALUA_POLICY_SWITCH_ALL		1
 
-static int alua_rtpg(struct scsi_device *, struct alua_port_group *);
-static char print_alua_state(int);
+static void alua_rtpg_work(struct work_struct *work);
+static void alua_rtpg_queue(struct alua_port_group *pg,
+			    struct scsi_device *sdev,
+			    struct alua_queue_data *qdata);
 
 static void release_port_group(struct kref *kref)
 {
@@ -103,6 +128,7 @@  static void release_port_group(struct kref *kref)
 	spin_lock(&port_group_lock);
 	list_del(&pg->node);
 	spin_unlock(&port_group_lock);
+	WARN_ON(pg->rtpg_sdev);
 	kfree(pg);
 }
 
@@ -168,7 +194,7 @@  static int submit_stpg(struct scsi_device *sdev, int group_id,
 struct alua_port_group *alua_lookup_pg(char *id_str, size_t id_size,
 				       int group_id)
 {
-	struct alua_port_group *pg = NULL;
+	struct alua_port_group *pg;
 
 	list_for_each_entry(pg, &port_group_list, node) {
 		if (pg->group_id != group_id)
@@ -218,18 +244,26 @@  struct alua_port_group *alua_get_pg(struct scsi_device *sdev,
 	pg->group_id = group_id;
 	pg->tpgs = tpgs;
 	pg->state = TPGS_STATE_OPTIMIZED;
+	if (optimize_stpg)
+		pg->flags |= ALUA_OPTIMIZE_STPG;
 	kref_init(&pg->kref);
+	INIT_DELAYED_WORK(&pg->rtpg_work, alua_rtpg_work);
+	INIT_LIST_HEAD(&pg->rtpg_list);
+	INIT_LIST_HEAD(&pg->node);
+	spin_lock_init(&pg->lock);
 
 	/* Re-check list again to catch concurrent updates */
 	spin_lock(&port_group_lock);
 	tmp_pg = alua_lookup_pg(id_str, id_size, group_id);
 	if (tmp_pg) {
 		spin_unlock(&port_group_lock);
-		kfree(pg);
-		return tmp_pg;
+		kref_put(&pg->kref, release_port_group);
+		pg = tmp_pg;
+		tmp_pg = NULL;
+	} else {
+		list_add(&pg->node, &port_group_list);
+		spin_unlock(&port_group_lock);
 	}
-	list_add(&pg->node, &port_group_list);
-	spin_unlock(&port_group_lock);
 
 	return pg;
 }
@@ -293,7 +327,7 @@  static int alua_check_tpgs(struct scsi_device *sdev)
  * Extract the relative target port and the target port group
  * descriptor from the list of identificators.
  *
- * Returns 0 or SCSI_DH_ error code on failure.
+ * Returns the target port group id or -1 on failure
  */
 static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
 			  int tpgs)
@@ -301,6 +335,8 @@  static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
 	int rel_port = -1, group_id;
 	char id_str[256];
 	int id_size;
+	struct alua_port_group *pg = NULL, *old_pg = NULL;
+	bool pg_found = false;
 
 	group_id = scsi_vpd_tpg_id(sdev, &rel_port);
 	if (group_id < 0) {
@@ -332,11 +368,37 @@  static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
 		    "rel port %02x\n", ALUA_DH_NAME,
 		    id_str, group_id, h->rel_port);
 
-	h->pg = alua_get_pg(sdev, group_id, tpgs, id_str, id_size);
-	if (!h->pg)
+	pg = alua_get_pg(sdev, group_id, tpgs, id_str, id_size);
+	if (!pg)
 		return SCSI_DH_NOMEM;
 
-	return alua_rtpg(sdev, h->pg);
+	/* Check for existing port_group references */
+	spin_lock(&h->pg_lock);
+	if (h->pg) {
+		old_pg = pg;
+		/* port_group has changed. Update to new port group */
+		if (h->pg != pg) {
+			old_pg = h->pg;
+			rcu_assign_pointer(h->pg, pg);
+			h->pg->expiry = 0;
+			pg_found = true;
+		}
+	} else {
+		rcu_assign_pointer(h->pg, pg);
+		pg_found = true;
+	}
+	alua_rtpg_queue(h->pg, sdev, NULL);
+	spin_unlock(&h->pg_lock);
+
+	if (pg_found)
+		synchronize_rcu();
+	if (old_pg) {
+		if (old_pg->rtpg_sdev)
+			flush_delayed_work(&old_pg->rtpg_work);
+		kref_put(&old_pg->kref, release_port_group);
+	}
+
+	return SCSI_DH_OK;
 }
 
 static char print_alua_state(int state)
@@ -430,14 +492,15 @@  static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 	int len, k, off, valid_states = 0, bufflen = ALUA_RTPG_SIZE;
 	unsigned char *ucp, *buff;
 	unsigned err, retval;
-	unsigned long expiry, interval = 0;
 	unsigned int tpg_desc_tbl_off;
 	unsigned char orig_transition_tmo;
 
-	if (!pg->transition_tmo)
-		expiry = round_jiffies_up(jiffies + ALUA_FAILOVER_TIMEOUT * HZ);
-	else
-		expiry = round_jiffies_up(jiffies + pg->transition_tmo * HZ);
+	if (!pg->expiry) {
+		if (!pg->transition_tmo)
+			pg->expiry = round_jiffies_up(jiffies + ALUA_FAILOVER_TIMEOUT * HZ);
+		else
+			pg->expiry = round_jiffies_up(jiffies + pg->transition_tmo * HZ);
+	}
 
 	buff = kzalloc(bufflen, GFP_KERNEL);
 	if (!buff)
@@ -480,16 +543,18 @@  static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 			err = SCSI_DH_RETRY;
 		else if (sense_hdr.sense_key == UNIT_ATTENTION)
 			err = SCSI_DH_RETRY;
-		if (err == SCSI_DH_RETRY && time_before(jiffies, expiry)) {
+		if (err == SCSI_DH_RETRY &&
+		    pg->expiry != 0 && time_before(jiffies, pg->expiry)) {
 			sdev_printk(KERN_ERR, sdev, "%s: rtpg retry\n",
 				    ALUA_DH_NAME);
 			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
-			goto retry;
+			return err;
 		}
 		sdev_printk(KERN_ERR, sdev, "%s: rtpg failed\n",
 			    ALUA_DH_NAME);
 		scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
 		kfree(buff);
+		pg->expiry = 0;
 		return SCSI_DH_IO;
 	}
 
@@ -504,6 +569,7 @@  static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 			sdev_printk(KERN_WARNING, sdev,
 				    "%s: kmalloc buffer failed\n",__func__);
 			/* Temporary failure, bypass */
+			pg->expiry = 0;
 			return SCSI_DH_DEV_TEMP_BUSY;
 		}
 		goto retry;
@@ -519,7 +585,7 @@  static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 		sdev_printk(KERN_INFO, sdev,
 			    "%s: transition timeout set to %d seconds\n",
 			    ALUA_DH_NAME, pg->transition_tmo);
-		expiry = jiffies + pg->transition_tmo * HZ;
+		pg->expiry = jiffies + pg->transition_tmo * HZ;
 	}
 
 	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR)
@@ -553,23 +619,26 @@  static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
 
 	switch (pg->state) {
 	case TPGS_STATE_TRANSITIONING:
-		if (time_before(jiffies, expiry)) {
+		if (time_before(jiffies, pg->expiry)) {
 			/* State transition, retry */
-			interval += 2000;
-			msleep(interval);
-			goto retry;
+			pg->interval = 2;
+			err = SCSI_DH_RETRY;
+		} else {
+			/* Transitioning time exceeded, set port to standby */
+			err = SCSI_DH_IO;
+			pg->state = TPGS_STATE_STANDBY;
+			pg->expiry = 0;
 		}
-		/* Transitioning time exceeded, set port to standby */
-		err = SCSI_DH_RETRY;
-		pg->state = TPGS_STATE_STANDBY;
 		break;
 	case TPGS_STATE_OFFLINE:
 		/* Path unusable */
 		err = SCSI_DH_DEV_OFFLINED;
+		pg->expiry = 0;
 		break;
 	default:
 		/* Useable path if active */
 		err = SCSI_DH_OK;
+		pg->expiry = 0;
 		break;
 	}
 	kfree(buff);
@@ -637,6 +706,106 @@  static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
 	return SCSI_DH_RETRY;
 }
 
+static void alua_rtpg_work(struct work_struct *work)
+{
+	struct alua_port_group *pg =
+		container_of(work, struct alua_port_group, rtpg_work.work);
+	struct scsi_device *sdev;
+	LIST_HEAD(qdata_list);
+	int err = SCSI_DH_OK;
+	struct alua_queue_data *qdata, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pg->lock, flags);
+	sdev = pg->rtpg_sdev;
+	if (!sdev) {
+		WARN_ON(pg->flags & ALUA_PG_RUN_RTPG ||
+			pg->flags & ALUA_PG_RUN_STPG);
+		spin_unlock_irqrestore(&pg->lock, flags);
+		return;
+	}
+	pg->flags |= ALUA_PG_RUNNING;
+	if (pg->flags & ALUA_PG_RUN_RTPG) {
+		spin_unlock_irqrestore(&pg->lock, flags);
+		err = alua_rtpg(sdev, pg);
+		spin_lock_irqsave(&pg->lock, flags);
+		if (err == SCSI_DH_RETRY) {
+			pg->flags &= ~ALUA_PG_RUNNING;
+			spin_unlock_irqrestore(&pg->lock, flags);
+			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
+					   pg->interval * HZ);
+			return;
+		}
+		pg->flags &= ~ALUA_PG_RUN_RTPG;
+		if (err != SCSI_DH_OK)
+			pg->flags &= ~ALUA_PG_RUN_STPG;
+	}
+	if (pg->flags & ALUA_PG_RUN_STPG) {
+		spin_unlock_irqrestore(&pg->lock, flags);
+		err = alua_stpg(sdev, pg);
+		spin_lock_irqsave(&pg->lock, flags);
+		pg->flags &= ~ALUA_PG_RUN_STPG;
+		if (err == SCSI_DH_RETRY) {
+			pg->flags |= ALUA_PG_RUN_RTPG;
+			pg->interval = 0;
+			pg->flags &= ~ALUA_PG_RUNNING;
+			spin_unlock_irqrestore(&pg->lock, flags);
+			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
+					   pg->interval * HZ);
+			return;
+		}
+	}
+
+	list_splice_init(&pg->rtpg_list, &qdata_list);
+	pg->rtpg_sdev = NULL;
+	spin_unlock_irqrestore(&pg->lock, flags);
+
+	list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
+		list_del(&qdata->entry);
+		if (qdata->callback_fn)
+			qdata->callback_fn(qdata->callback_data, err);
+		kfree(qdata);
+	}
+	spin_lock_irqsave(&pg->lock, flags);
+	pg->flags &= ~ALUA_PG_RUNNING;
+	spin_unlock_irqrestore(&pg->lock, flags);
+	scsi_device_put(sdev);
+	kref_put(&pg->kref, release_port_group);
+}
+
+static void alua_rtpg_queue(struct alua_port_group *pg,
+			    struct scsi_device *sdev,
+			    struct alua_queue_data *qdata)
+{
+	int start_queue = 0;
+	unsigned long flags;
+
+	if (!pg)
+		return;
+
+	spin_lock_irqsave(&pg->lock, flags);
+	if (qdata) {
+		list_add_tail(&qdata->entry, &pg->rtpg_list);
+		pg->flags |= ALUA_PG_RUN_STPG;
+	}
+	if (pg->rtpg_sdev == NULL) {
+		pg->interval = 0;
+		pg->flags |= ALUA_PG_RUN_RTPG;
+		kref_get(&pg->kref);
+		pg->rtpg_sdev = sdev;
+		scsi_device_get(sdev);
+		start_queue = 1;
+	}
+	spin_unlock_irqrestore(&pg->lock, flags);
+
+	if (start_queue &&
+	    !queue_delayed_work(kaluad_wq, &pg->rtpg_work,
+				msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS))) {
+		scsi_device_put(sdev);
+		kref_put(&pg->kref, release_port_group);
+	}
+}
+
 /*
  * alua_initialize - Initialize ALUA state
  * @sdev: the device to be initialized
@@ -648,10 +817,12 @@  static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
 {
 	int err = SCSI_DH_DEV_UNSUPP, tpgs;
 
+	mutex_lock(&h->init_mutex);
 	tpgs = alua_check_tpgs(sdev);
 	if (tpgs != TPGS_MODE_NONE)
 		err = alua_check_vpd(sdev, h, tpgs);
-
+	h->init_error = err;
+	mutex_unlock(&h->init_mutex);
 	return err;
 }
 
@@ -671,6 +842,7 @@  static int alua_set_params(struct scsi_device *sdev, const char *params)
 	unsigned int optimize = 0, argc;
 	const char *p = params;
 	int result = SCSI_DH_OK;
+	unsigned long flags;
 
 	if ((sscanf(params, "%u", &argc) != 1) || (argc != 1))
 		return -EINVAL;
@@ -680,22 +852,23 @@  static int alua_set_params(struct scsi_device *sdev, const char *params)
 	if ((sscanf(p, "%u", &optimize) != 1) || (optimize > 1))
 		return -EINVAL;
 
-	pg = h->pg;
-	if (!pg)
+	rcu_read_lock();
+	pg = rcu_dereference(h->pg);
+	if (!pg) {
+		rcu_read_unlock();
 		return -ENXIO;
-
+	}
+	spin_lock_irqsave(&pg->lock, flags);
 	if (optimize)
 		pg->flags |= ALUA_OPTIMIZE_STPG;
 	else
 		pg->flags |= ~ALUA_OPTIMIZE_STPG;
+	spin_unlock_irqrestore(&pg->lock, flags);
+	rcu_read_unlock();
 
 	return result;
 }
 
-static uint optimize_stpg;
-module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than sending a STPG, when implicit TPGS is supported (0=No,1=Yes). Default is 0.");
-
 /*
  * alua_activate - activate a path
  * @sdev: device on the path to be activated
@@ -711,24 +884,34 @@  static int alua_activate(struct scsi_device *sdev,
 {
 	struct alua_dh_data *h = sdev->handler_data;
 	int err = SCSI_DH_OK;
+	struct alua_queue_data *qdata;
+	struct alua_port_group *pg;
 
-	if (!h->pg)
+	qdata = kzalloc(sizeof(*qdata), GFP_KERNEL);
+	if (!qdata) {
+		err = SCSI_DH_RES_TEMP_UNAVAIL;
 		goto out;
+	}
+	qdata->callback_fn = fn;
+	qdata->callback_data = data;
 
-	kref_get(&h->pg->kref);
-
-	if (optimize_stpg)
-		h->pg->flags |= ALUA_OPTIMIZE_STPG;
-
-	err = alua_rtpg(sdev, h->pg);
-	if (err != SCSI_DH_OK) {
-		kref_put(&h->pg->kref, release_port_group);
+	mutex_lock(&h->init_mutex);
+	rcu_read_lock();
+	pg = rcu_dereference(h->pg);
+	if (!pg) {
+		rcu_read_unlock();
+		kfree(qdata);
+		err = h->init_error;
+		mutex_unlock(&h->init_mutex);
 		goto out;
 	}
-	err = alua_stpg(sdev, h->pg);
-	if (err == SCSI_DH_RETRY)
-		err = alua_rtpg(sdev, h->pg);
-	kref_put(&h->pg->kref, release_port_group);
+	mutex_unlock(&h->init_mutex);
+	fn = NULL;
+	kref_get(&pg->kref);
+	rcu_read_unlock();
+
+	alua_rtpg_queue(pg, sdev, qdata);
+	kref_put(&pg->kref, release_port_group);
 out:
 	if (fn)
 		fn(data, err);
@@ -744,14 +927,19 @@  out:
 static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct alua_dh_data *h = sdev->handler_data;
-	int state;
+	struct alua_port_group *pg;
+	int state = TPGS_STATE_OPTIMIZED;
 	int ret = BLKPREP_OK;
 
-	if (!h->pg)
-		return ret;
-	kref_get(&h->pg->kref);
-	state = h->pg->state;
-	kref_put(&h->pg->kref, release_port_group);
+	rcu_read_lock();
+	pg = rcu_dereference(h->pg);
+	if (pg) {
+		state = pg->state;
+		/* Defer I/O while rtpg_work is active */
+		if (pg->rtpg_sdev)
+			state = TPGS_STATE_TRANSITIONING;
+	}
+	rcu_read_unlock();
 	if (state == TPGS_STATE_TRANSITIONING)
 		ret = BLKPREP_DEFER;
 	else if (state != TPGS_STATE_OPTIMIZED &&
@@ -776,10 +964,13 @@  static int alua_bus_attach(struct scsi_device *sdev)
 	h = kzalloc(sizeof(*h) , GFP_KERNEL);
 	if (!h)
 		return -ENOMEM;
-	h->pg = NULL;
+	spin_lock_init(&h->pg_lock);
+	rcu_assign_pointer(h->pg, NULL);
 	h->rel_port = -1;
+	h->init_error = SCSI_DH_OK;
 	h->sdev = sdev;
 
+	mutex_init(&h->init_mutex);
 	err = alua_initialize(sdev, h);
 	if (err == SCSI_DH_NOMEM)
 		ret = -ENOMEM;
@@ -800,10 +991,18 @@  failed:
 static void alua_bus_detach(struct scsi_device *sdev)
 {
 	struct alua_dh_data *h = sdev->handler_data;
+	struct alua_port_group *pg;
 
-	if (h->pg) {
-		kref_put(&h->pg->kref, release_port_group);
-		h->pg = NULL;
+	spin_lock(&h->pg_lock);
+	pg = h->pg;
+	rcu_assign_pointer(h->pg, NULL);
+	h->sdev = NULL;
+	spin_unlock(&h->pg_lock);
+	if (pg) {
+		synchronize_rcu();
+		if (pg->rtpg_sdev)
+			flush_delayed_work(&pg->rtpg_work);
+		kref_put(&pg->kref, release_port_group);
 	}
 	sdev->handler_data = NULL;
 	kfree(h);
@@ -824,16 +1023,24 @@  static int __init alua_init(void)
 {
 	int r;
 
+	kaluad_wq = create_workqueue("kaluad_wq");
+	if (!kaluad_wq) {
+		/* Temporary failure, bypass */
+		return SCSI_DH_DEV_TEMP_BUSY;
+	}
 	r = scsi_register_device_handler(&alua_dh);
-	if (r != 0)
+	if (r != 0) {
 		printk(KERN_ERR "%s: Failed to register scsi device handler",
 			ALUA_DH_NAME);
+		destroy_workqueue(kaluad_wq);
+	}
 	return r;
 }
 
 static void __exit alua_exit(void)
 {
 	scsi_unregister_device_handler(&alua_dh);
+	destroy_workqueue(kaluad_wq);
 }
 
 module_init(alua_init);