diff mbox

[4/6] dm-multipath: remove process_queued_ios()

Message ID 1391415493-102943-5-git-send-email-hare@suse.de (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Hannes Reinecke Feb. 3, 2014, 8:18 a.m. UTC
Doesn't serve any real purpose anymore; dm_table_run_queue()
will already move things to a workqueue, so we don't need
to do it ourselves.
We only need to take care to add a small delay when calling
__pg_init_all_paths() to move processing off to a workqueue;
pg_init_done() is run from an interrupt context and needs to
complete as fast as possible.

Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/md/dm-mpath.c | 59 +++++++++++++++------------------------------------
 1 file changed, 17 insertions(+), 42 deletions(-)

Comments

Junichi Nomura Feb. 3, 2014, 11:30 a.m. UTC | #1
On 02/03/14 17:18, Hannes Reinecke wrote:
> Doesn't serve any real purpose anymore; dm_table_run_queue()
> will already move things to a workqueue, so we don't need
> to do it ourselves.
> We only need to take care to add a small delay when calling
> __pg_init_all_paths() to move processing off to a workqueue;
> pg_init_done() is run from an interrupt context and needs to
> complete as fast as possible.

I think more explanation is needed for the patch description.
As far as I understand, the change is based on the following reasoning:

  process_queued_ios() has served 3 functions:
    1) select pg and pgpath if none is selected
    2) start pg_init if requested
    3) dispatch queued IOs when pg is ready

  Basically, a call to queue_work(process_queued_ios) can be replaced by
  dm_table_run_queue(), which runs request queue and ends up calling
  map_io(), which does 1), 2) and 3).

  Exception is when !pg_ready() (= either pg_init is running or requested),
  multipath_busy() prevents map_io() being called from request_fn.

  If pg_init is running, it should be ok as far as pg_init_done() does
  the right thing when pg_init is completed. I.e. restart pg_init if
  !pg_ready() or call dm_table_run_queue() to kick map_io().

  If pg_init is requested, we have to make sure the request is detected
  and pg_init will be started.
  pg_init is requested in 3 places:
    a) __choose_pgpath() in map_io()
    b) __choose_pgpath() in multipath_ioctl()
    c) pg_init retry in pg_init_done()
  a) is ok because map_io() calls __pg_init_all_paths(), which does 2).
  b) needs a call to __pg_init_all_paths(), which does 2).
  c) needs a call to __pg_init_all_paths(), which does 2).


By writing the above, I found possible bugs related to 1):

> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>  
>  	if (!m->pg_init_required)
>  		m->queue_io = 0;
> -
> -	m->pg_init_delay_retry = delay_retry;
> -	queue_work(kmultipathd, &m->process_queued_ios);
> +	else {
> +		m->pg_init_delay_retry = delay_retry;
> +		__pg_init_all_paths(m, 50/HZ);
> +		goto out;
> +	}
>  
>  	/*
>  	 * Wake up any thread waiting to suspend.

It is possible that m->current_pg is NULL.
(E.g. pg_init failed for current_pgpath, bypass_pg() was called, etc.)
__pg_init_all_paths() will cause oops in such a case.

So how about doing this in pg_init_done():

	if (m->pg_init_required) {
		m->pg_init_delay_retry = delay_retry;
		if (__pg_init_all_paths(m))
			goto out;
	}

	/* pg_init successfully completed */
	m->queue_io = 0;

and in __pg_init_all_paths(), do something like:

	m->pg_init_required = 0;
	...
	if (!m->current_pg)
		return 0;
	...
	return m->pg_init_in_progress;


> @@ -1593,8 +1563,13 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
>  	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
>  		r = scsi_verify_blk_ioctl(NULL, cmd);
>  
> -	if (r == -ENOTCONN && !fatal_signal_pending(current))
> -		queue_work(kmultipathd, &m->process_queued_ios);
> +	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
> +		spin_lock_irqsave(&m->lock, flags);
> +		if (m->current_pgpath && m->pg_init_required)
> +			__pg_init_all_paths(m, 0);
> +		spin_unlock_irqrestore(&m->lock, flags);
> +		dm_table_run_md_queue_async(m->ti->table);
> +	}
>  
>  	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
>  }

Similarly, m->current_pgpath can be NULL here while pg_init_required.
Then pg_init_required is left uncleared and all IOs in the queue will
stall until somebody calls multipath_ioctl() to redo pg selection.
Hannes Reinecke Feb. 3, 2014, 11:39 a.m. UTC | #2
On 02/03/2014 12:30 PM, Junichi Nomura wrote:
> On 02/03/14 17:18, Hannes Reinecke wrote:
>> Doesn't serve any real purpose anymore; dm_table_run_queue()
>> will already move things to a workqueue, so we don't need
>> to do it ourselves.
>> We only need to take care to add a small delay when calling
>> __pg_init_all_paths() to move processing off to a workqueue;
>> pg_init_done() is run from an interrupt context and needs to
>> complete as fast as possible.
> 
> I think more explanation is needed for the patch description.
> As far as I understand, the change is based on the following reasoning:
> 
>   process_queued_ios() has served 3 functions:
>     1) select pg and pgpath if none is selected
>     2) start pg_init if requested
>     3) dispatch queued IOs when pg is ready
> 
>   Basically, a call to queue_work(process_queued_ios) can be replaced by
>   dm_table_run_queue(), which runs request queue and ends up calling
>   map_io(), which does 1), 2) and 3).
> 
Yes.

>   Exception is when !pg_ready() (= either pg_init is running or requested),
>   multipath_busy() prevents map_io() being called from request_fn.
> 
>   If pg_init is running, it should be ok as far as pg_init_done() does
>   the right thing when pg_init is completed. I.e. restart pg_init if
>   !pg_ready() or call dm_table_run_queue() to kick map_io().
> 
>   If pg_init is requested, we have to make sure the request is detected
>   and pg_init will be started.
>   pg_init is requested in 3 places:
>     a) __choose_pgpath() in map_io()
>     b) __choose_pgpath() in multipath_ioctl()
>     c) pg_init retry in pg_init_done()
>   a) is ok because map_io() calls __pg_init_all_paths(), which does 2).
>   b) needs a call to __pg_init_all_paths(), which does 2).
>   c) needs a call to __pg_init_all_paths(), which does 2).
> 
> 
> By writing the above, I found possible bugs related to 1):
> 
>> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>>  
>>  	if (!m->pg_init_required)
>>  		m->queue_io = 0;
>> -
>> -	m->pg_init_delay_retry = delay_retry;
>> -	queue_work(kmultipathd, &m->process_queued_ios);
>> +	else {
>> +		m->pg_init_delay_retry = delay_retry;
>> +		__pg_init_all_paths(m, 50/HZ);
>> +		goto out;
>> +	}
>>  
>>  	/*
>>  	 * Wake up any thread waiting to suspend.
> 
> It is possible that m->current_pg is NULL.
> (E.g. pg_init failed for current_pgpath, bypass_pg() was called, etc.)
> __pg_init_all_paths() will cause oops in such a case.
> 
Ah. Right.

> So how about doing this in pg_init_done():
> 
> 	if (m->pg_init_required) {
> 		m->pg_init_delay_retry = delay_retry;
> 		if (__pg_init_all_paths(m))
> 			goto out;
> 	}
> 
> 	/* pg_init successfully completed */
> 	m->queue_io = 0;
> 
> and in __pg_init_all_paths(), do something like:
> 
> 	m->pg_init_required = 0;
> 	...
> 	if (!m->current_pg)
> 		return 0;
> 	...
> 	return m->pg_init_in_progress;
> 
> 
Hmm. That still wouldn't be doing the right thing.
'fail_path' in pg_init_done() might be setting current_pg to NULL,
but this doesn't mean that the entire path group is invalid.
I just means that this particular path is invalid, and we still
might need to retry pg_init for the other paths.

>> @@ -1593,8 +1563,13 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
>>  	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
>>  		r = scsi_verify_blk_ioctl(NULL, cmd);
>>  
>> -	if (r == -ENOTCONN && !fatal_signal_pending(current))
>> -		queue_work(kmultipathd, &m->process_queued_ios);
>> +	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
>> +		spin_lock_irqsave(&m->lock, flags);
>> +		if (m->current_pgpath && m->pg_init_required)
>> +			__pg_init_all_paths(m, 0);
>> +		spin_unlock_irqrestore(&m->lock, flags);
>> +		dm_table_run_md_queue_async(m->ti->table);
>> +	}
>>  
>>  	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
>>  }
> 
> Similarly, m->current_pgpath can be NULL here while pg_init_required.
> Then pg_init_required is left uncleared and all IOs in the queue will
> stall until somebody calls multipath_ioctl() to redo pg selection.
> 
Ok, correct. Will be fixing it up.

Thanks for the review.

Cheers,

Hannes
Junichi Nomura Feb. 3, 2014, 12:08 p.m. UTC | #3
On 02/03/14 17:18, Hannes Reinecke wrote:
> We only need to take care to add a small delay when calling
> __pg_init_all_paths() to move processing off to a workqueue;
> pg_init_done() is run from an interrupt context and needs to
> complete as fast as possible.
...
> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>  
>  	if (!m->pg_init_required)
>  		m->queue_io = 0;
> -
> -	m->pg_init_delay_retry = delay_retry;
> -	queue_work(kmultipathd, &m->process_queued_ios);
> +	else {
> +		m->pg_init_delay_retry = delay_retry;
> +		__pg_init_all_paths(m, 50/HZ);
> +		goto out;
> +	}
>  

I forgot to comment on this.
Adding delay to queue_work() doesn't make it fast.
So I couldn't see why this "50/HZ" delay has to be added
and where this value comes.
Hannes Reinecke Feb. 3, 2014, 12:18 p.m. UTC | #4
On 02/03/2014 01:08 PM, Junichi Nomura wrote:
> On 02/03/14 17:18, Hannes Reinecke wrote:
>> We only need to take care to add a small delay when calling
>> __pg_init_all_paths() to move processing off to a workqueue;
>> pg_init_done() is run from an interrupt context and needs to
>> complete as fast as possible.
> ...
>> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>>  
>>  	if (!m->pg_init_required)
>>  		m->queue_io = 0;
>> -
>> -	m->pg_init_delay_retry = delay_retry;
>> -	queue_work(kmultipathd, &m->process_queued_ios);
>> +	else {
>> +		m->pg_init_delay_retry = delay_retry;
>> +		__pg_init_all_paths(m, 50/HZ);
>> +		goto out;
>> +	}
>>  
> 
> I forgot to comment on this.
> Adding delay to queue_work() doesn't make it fast.
> So I couldn't see why this "50/HZ" delay has to be added
> and where this value comes.
> 
Well, it wasn't probably the best choice of words.

Thing is, without a delay the workqueue item will be executed
directly (cf __queue_delayed_work()).
But pg_init_done() is run from an interrupt context, and as such any
memory allocations have to be atomic.
So if we were to call queue_delayed_work() without any delay
we will end up calling scsi_dh_activate from an interrupt context,
too, but there we most definitely do _not_ have only atomic allocations.
Hence the delay.

Cheers,

Hannes
Junichi Nomura Feb. 3, 2014, 12:39 p.m. UTC | #5
On 02/03/14 21:18, Hannes Reinecke wrote:
> On 02/03/2014 01:08 PM, Junichi Nomura wrote:
>> On 02/03/14 17:18, Hannes Reinecke wrote:
>>> We only need to take care to add a small delay when calling
>>> __pg_init_all_paths() to move processing off to a workqueue;
>>> pg_init_done() is run from an interrupt context and needs to
>>> complete as fast as possible.
>> ...
>>> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>>>  
>>>  	if (!m->pg_init_required)
>>>  		m->queue_io = 0;
>>> -
>>> -	m->pg_init_delay_retry = delay_retry;
>>> -	queue_work(kmultipathd, &m->process_queued_ios);
>>> +	else {
>>> +		m->pg_init_delay_retry = delay_retry;
>>> +		__pg_init_all_paths(m, 50/HZ);
>>> +		goto out;
>>> +	}
>>>  
>>
>> I forgot to comment on this.
>> Adding delay to queue_work() doesn't make it fast.
>> So I couldn't see why this "50/HZ" delay has to be added
>> and where this value comes.
>>
> Well, it wasn't probably the best choice of words.
> 
> Thing is, without a delay the workqueue item will be executed
> directly (cf __queue_delayed_work()).
> But pg_init_done() is run from an interrupt context, and as such any
> memory allocations have to be atomic.
> So if we were to call queue_delayed_work() without any delay
> we will end up calling scsi_dh_activate from an interrupt context,
> too, but there we most definitely do _not_ have only atomic allocations.
> Hence the delay.

Work is executed in the worker context (in this case by kmpath_handlerd).
Isn't it?
Hannes Reinecke Feb. 3, 2014, 12:57 p.m. UTC | #6
On 02/03/2014 01:39 PM, Junichi Nomura wrote:
> On 02/03/14 21:18, Hannes Reinecke wrote:
>> On 02/03/2014 01:08 PM, Junichi Nomura wrote:
>>> On 02/03/14 17:18, Hannes Reinecke wrote:
>>>> We only need to take care to add a small delay when calling
>>>> __pg_init_all_paths() to move processing off to a workqueue;
>>>> pg_init_done() is run from an interrupt context and needs to
>>>> complete as fast as possible.
>>> ...
>>>> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>>>>  
>>>>  	if (!m->pg_init_required)
>>>>  		m->queue_io = 0;
>>>> -
>>>> -	m->pg_init_delay_retry = delay_retry;
>>>> -	queue_work(kmultipathd, &m->process_queued_ios);
>>>> +	else {
>>>> +		m->pg_init_delay_retry = delay_retry;
>>>> +		__pg_init_all_paths(m, 50/HZ);
>>>> +		goto out;
>>>> +	}
>>>>  
>>>
>>> I forgot to comment on this.
>>> Adding delay to queue_work() doesn't make it fast.
>>> So I couldn't see why this "50/HZ" delay has to be added
>>> and where this value comes.
>>>
>> Well, it wasn't probably the best choice of words.
>>
>> Thing is, without a delay the workqueue item will be executed
>> directly (cf __queue_delayed_work()).
>> But pg_init_done() is run from an interrupt context, and as such any
>> memory allocations have to be atomic.
>> So if we were to call queue_delayed_work() without any delay
>> we will end up calling scsi_dh_activate from an interrupt context,
>> too, but there we most definitely do _not_ have only atomic allocations.
>> Hence the delay.
> 
> Work is executed in the worker context (in this case by kmpath_handlerd).
> Isn't it?
> 
Yes, but without the delay we'd be scheduling during pg_init_done(),
ie within an interrupt context.

Cheers,

Hannes
Junichi Nomura Feb. 4, 2014, 3:21 a.m. UTC | #7
On 02/03/14 21:57, Hannes Reinecke wrote:
> On 02/03/2014 01:39 PM, Junichi Nomura wrote:
>> On 02/03/14 21:18, Hannes Reinecke wrote:
>>> On 02/03/2014 01:08 PM, Junichi Nomura wrote:
>>>> On 02/03/14 17:18, Hannes Reinecke wrote:
>>>>> We only need to take care to add a small delay when calling
>>>>> __pg_init_all_paths() to move processing off to a workqueue;
>>>>> pg_init_done() is run from an interrupt context and needs to
>>>>> complete as fast as possible.
>>>> ...
>>>>> @@ -1217,9 +1185,11 @@ static void pg_init_done(void *data, int errors)
>>>>>  
>>>>>  	if (!m->pg_init_required)
>>>>>  		m->queue_io = 0;
>>>>> -
>>>>> -	m->pg_init_delay_retry = delay_retry;
>>>>> -	queue_work(kmultipathd, &m->process_queued_ios);
>>>>> +	else {
>>>>> +		m->pg_init_delay_retry = delay_retry;
>>>>> +		__pg_init_all_paths(m, 50/HZ);
>>>>> +		goto out;
>>>>> +	}
>>>>>  
>>>>
>>>> I forgot to comment on this.
>>>> Adding delay to queue_work() doesn't make it fast.
>>>> So I couldn't see why this "50/HZ" delay has to be added
>>>> and where this value comes.
>>>>
>>> Well, it wasn't probably the best choice of words.
>>>
>>> Thing is, without a delay the workqueue item will be executed
>>> directly (cf __queue_delayed_work()).
>>> But pg_init_done() is run from an interrupt context, and as such any
>>> memory allocations have to be atomic.
>>> So if we were to call queue_delayed_work() without any delay
>>> we will end up calling scsi_dh_activate from an interrupt context,
>>> too, but there we most definitely do _not_ have only atomic allocations.
>>> Hence the delay.
>>
>> Work is executed in the worker context (in this case by kmpath_handlerd).
>> Isn't it?
>>
> Yes, but without the delay we'd be scheduling during pg_init_done(),
> ie within an interrupt context.

Could you elaborate on the problem you are going to solve?
If scheduling happens in interrupt context, it's a bug.
And if such a bug exists, it should be there even without this series
of your patch.

Besides, 50/HZ is 0 unless your HZ is extremely low.
So the code won't work as you intended anyway...
diff mbox

Patch

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5373ca9..b11e3b3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -93,8 +93,6 @@  struct multipath {
 	unsigned pg_init_count;		/* Number of times pg_init called */
 	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */
 
-	struct work_struct process_queued_ios;
-
 	struct work_struct trigger_event;
 
 	/*
@@ -119,7 +117,6 @@  typedef int (*action_fn) (struct pgpath *pgpath);
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
-static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
 static int __pgpath_busy(struct pgpath *pgpath);
@@ -197,7 +194,6 @@  static struct multipath *alloc_multipath(struct dm_target *ti)
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
 		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
-		INIT_WORK(&m->process_queued_ios, process_queued_ios);
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
@@ -254,10 +250,10 @@  static void clear_mapinfo(struct multipath *m, union map_info *info)
  * Path selection
  *-----------------------------------------------*/
 
-static void __pg_init_all_paths(struct multipath *m)
+static void __pg_init_all_paths(struct multipath *m, unsigned long min_delay)
 {
 	struct pgpath *pgpath;
-	unsigned long pg_init_delay = 0;
+	unsigned long pg_init_delay = min_delay;
 
 	if (m->pg_init_in_progress || m->pg_init_disabled)
 		return;
@@ -406,7 +402,7 @@  static int map_io(struct multipath *m, struct request *clone,
 							      &pgpath->path,
 							      nr_bytes);
 		} else {
-			__pg_init_all_paths(m);
+			__pg_init_all_paths(m, 0);
 			r = DM_MAPIO_REQUEUE;
 		}
 	} else {
@@ -438,41 +434,13 @@  static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
 		m->saved_queue_if_no_path = queue_if_no_path;
 	m->queue_if_no_path = queue_if_no_path;
 	if (!m->queue_if_no_path)
-		queue_work(kmultipathd, &m->process_queued_ios);
+		dm_table_run_md_queue_async(m->ti->table);
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
 	return 0;
 }
 
-static void process_queued_ios(struct work_struct *work)
-{
-	struct multipath *m =
-		container_of(work, struct multipath, process_queued_ios);
-	struct pgpath *pgpath = NULL;
-	unsigned must_queue = 1;
-	unsigned long flags;
-
-	spin_lock_irqsave(&m->lock, flags);
-
-	if (!m->current_pgpath)
-		__choose_pgpath(m, 0);
-
-	pgpath = m->current_pgpath;
-
-	if ((pgpath && !m->queue_io) ||
-	    (!pgpath && !m->queue_if_no_path))
-		must_queue = 0;
-
-	if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
-	    !m->pg_init_disabled)
-		__pg_init_all_paths(m);
-
-	spin_unlock_irqrestore(&m->lock, flags);
-	if (!must_queue)
-		dm_table_run_md_queue_async(m->ti->table);
-}
-
 /*
  * An event is triggered whenever a path is taken out of use.
  * Includes path failure and PG bypass.
@@ -1019,7 +987,7 @@  static int reinstate_path(struct pgpath *pgpath)
 
 	if (!m->nr_valid_paths++) {
 		m->current_pgpath = NULL;
-		queue_work(kmultipathd, &m->process_queued_ios);
+		dm_table_run_md_queue_async(m->ti->table);
 	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
 		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
 			m->pg_init_in_progress++;
@@ -1217,9 +1185,11 @@  static void pg_init_done(void *data, int errors)
 
 	if (!m->pg_init_required)
 		m->queue_io = 0;
-
-	m->pg_init_delay_retry = delay_retry;
-	queue_work(kmultipathd, &m->process_queued_ios);
+	else {
+		m->pg_init_delay_retry = delay_retry;
+		__pg_init_all_paths(m, 50/HZ);
+		goto out;
+	}
 
 	/*
 	 * Wake up any thread waiting to suspend.
@@ -1593,8 +1563,13 @@  static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
 		r = scsi_verify_blk_ioctl(NULL, cmd);
 
-	if (r == -ENOTCONN && !fatal_signal_pending(current))
-		queue_work(kmultipathd, &m->process_queued_ios);
+	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+		spin_lock_irqsave(&m->lock, flags);
+		if (m->current_pgpath && m->pg_init_required)
+			__pg_init_all_paths(m, 0);
+		spin_unlock_irqrestore(&m->lock, flags);
+		dm_table_run_md_queue_async(m->ti->table);
+	}
 
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }