
blktrace: Fix potential deadlock between delete & sysfs ops

Message ID 1502384553-14442-1-git-send-email-longman@redhat.com (mailing list archive)
State New, archived

Commit Message

Waiman Long Aug. 10, 2017, 5:02 p.m. UTC
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete
a partition in a block device and another task (CPU0) is accessing
a tracing sysfs file in that partition.

To avoid that, accessing a tracing sysfs file will now use a mutex
trylock loop, and the operation will fail if a delete operation is
in progress.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 block/ioctl.c           |  3 +++
 include/linux/fs.h      |  1 +
 kernel/trace/blktrace.c | 24 ++++++++++++++++++++++--
 3 files changed, 26 insertions(+), 2 deletions(-)
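
The locking pattern the patch proposes can be modeled outside the
kernel. Below is a minimal, compilable userspace analogue using
pthreads (build with cc -pthread); all names in it (dev_lock,
deleting, delete_path, sysfs_path) are illustrative, not part of
the patch:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int deleting;             /* analogue of bdev->bd_deleting */

/* Analogue of the delete path: set the flag, then take the lock. */
static void delete_path(void)
{
        atomic_store(&deleting, 1);
        pthread_mutex_lock(&dev_lock);
        /* ... tear down the partition and its sysfs files ... */
        pthread_mutex_unlock(&dev_lock);
        atomic_store(&deleting, 0);
}

/* Analogue of the sysfs show/store path: a trylock loop that bails
   out instead of blocking when a delete is in flight. */
static int sysfs_path(void)
{
        while (pthread_mutex_trylock(&dev_lock) != 0) {
                if (atomic_load(&deleting))
                        return -1;      /* analogue of goto out_bdput */
                sched_yield();          /* analogue of schedule() */
        }
        /* ... read or write the attribute under the lock ... */
        pthread_mutex_unlock(&dev_lock);
        return 0;
}

int main(void)
{
        delete_path();
        return sysfs_path();
}

Note that C11 atomic_store()/atomic_load() are sequentially consistent
by default, so this analogue sidesteps the memory-ordering questions
raised in the review below; the kernel code has to handle those
explicitly.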

Comments

Steven Rostedt Aug. 15, 2017, 11:11 p.m. UTC | #1
On Thu, 10 Aug 2017 13:02:33 -0400
Waiman Long <longman@redhat.com> wrote:

> The lockdep code had reported the following unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(s_active#228);
>                                lock(&bdev->bd_mutex/1);
>                                lock(s_active#228);
>   lock(&bdev->bd_mutex);
> 
>  *** DEADLOCK ***
> 
> The deadlock may happen when one task (CPU1) is trying to delete
> a partition in a block device and another task (CPU0) is accessing
> a tracing sysfs file in that partition.
> 
> To avoid that, accessing a tracing sysfs file will now use a mutex
> trylock loop, and the operation will fail if a delete operation is
> in progress.
> 

This is wrong at a lot of levels.

> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  block/ioctl.c           |  3 +++
>  include/linux/fs.h      |  1 +
>  kernel/trace/blktrace.c | 24 ++++++++++++++++++++++--
>  3 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 0de02ee..7211608 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -86,12 +86,15 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
>  				return -EBUSY;
>  			}
>  			/* all seems OK */
> +			bdev->bd_deleting = 1;

Note, one would need a memory barrier here. But I'm not sure how much
of a fast path this location is.

			/*
			 * Make sure bd_deleting is set before taking
			 * mutex.
			 */
			smp_mb();

>  			fsync_bdev(bdevp);
>  			invalidate_bdev(bdevp);
>  
>  			mutex_lock_nested(&bdev->bd_mutex, 1);
>  			delete_partition(disk, partno);
>  			mutex_unlock(&bdev->bd_mutex);
> +			bdev->bd_deleting = 0;
> +
>  			mutex_unlock(&bdevp->bd_mutex);
>  			bdput(bdevp);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7b5d681..5d4ae9d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -427,6 +427,7 @@ struct block_device {
>  #endif
>  	struct block_device *	bd_contains;
>  	unsigned		bd_block_size;
> +	int			bd_deleting;
>  	struct hd_struct *	bd_part;
>  	/* number of times partitions within this device have been opened. */
>  	unsigned		bd_part_count;
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index bc364f8..715e77e 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -1605,6 +1605,18 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
>  	return bdev_get_queue(bdev);
>  }
>  
> +/*
> + * Read/write to the tracing sysfs file requires taking references to the
> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
> + * requires acquiring the bd_mutex and then waiting for all the sysfs
> + * references to be gone. This can lead to deadlock if both operations
> + * happen simultaneously. To avoid this problem, read/write to the
> + * tracing sysfs files can now fail if the bd_mutex cannot be
> + * acquired while a deletion operation is in progress.
> + *
> + * A mutex trylock loop is used assuming that tracing sysfs operations
> + * aren't frequent enough to cause any contention problem.
> + */
>  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>  					 struct device_attribute *attr,
>  					 char *buf)
> @@ -1622,7 +1634,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>  	if (q == NULL)
>  		goto out_bdput;
>  
> -	mutex_lock(&bdev->bd_mutex);
> +	while (!mutex_trylock(&bdev->bd_mutex)) {

		/* Make sure trylock happens before reading bd_deleting */
		smp_mb();

Since we don't take the lock, there's nothing that prevents the CPU
from fetching bd_deleting early, and thus going into an infinite spin
even though bd_deleting is clear (without the memory barriers).
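
In other words, the intent is a pair of full barriers, one on each side
of the flag, so that the store side and the load side order their
accesses consistently. A sketch combining the posted patch with the
suggestion above (illustration only, not the patch itself):

        /* delete path (block/ioctl.c) */
        bdev->bd_deleting = 1;
        /* Pairs with the smp_mb() in the trylock loop below. */
        smp_mb();
        mutex_lock_nested(&bdev->bd_mutex, 1);

        /* sysfs path (kernel/trace/blktrace.c) */
        while (!mutex_trylock(&bdev->bd_mutex)) {
                /* Pairs with the smp_mb() after setting bd_deleting. */
                smp_mb();
                if (bdev->bd_deleting)
                        goto out_bdput;
                schedule();
        }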

> +		if (bdev->bd_deleting)
> +			goto out_bdput;

You also just turned the mutex into a spinlock. What happens if we just
preempted the owner of bdev->bd_mutex and are an RT task with higher
priority? This will turn into a live lock.

> +		schedule();
> +	}
>  
>  	if (attr == &dev_attr_enable) {
>  		ret = sprintf(buf, "%u\n", !!q->blk_trace);
> @@ -1683,7 +1699,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
>  	if (q == NULL)
>  		goto out_bdput;
>  
> -	mutex_lock(&bdev->bd_mutex);
> +	while (!mutex_trylock(&bdev->bd_mutex)) {

		/* Make sure trylock happens before reading bd_deleting */
		smp_mb();

> +		if (bdev->bd_deleting)
> +			goto out_bdput;

Same here.

-- Steve

> +		schedule();
> +	}
>  
>  	if (attr == &dev_attr_enable) {
>  		if (value)
Waiman Long Aug. 16, 2017, 6:14 p.m. UTC | #2
On 08/15/2017 07:11 PM, Steven Rostedt wrote:
> On Thu, 10 Aug 2017 13:02:33 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>> The lockdep code had reported the following unsafe locking scenario:
>>
>>        CPU0                    CPU1
>>        ----                    ----
>>   lock(s_active#228);
>>                                lock(&bdev->bd_mutex/1);
>>                                lock(s_active#228);
>>   lock(&bdev->bd_mutex);
>>
>>  *** DEADLOCK ***
>>
>> The deadlock may happen when one task (CPU1) is trying to delete
>> a partition in a block device and another task (CPU0) is accessing
>> a tracing sysfs file in that partition.
>>
>> To avoid that, accessing a tracing sysfs file will now use a mutex
>> trylock loop, and the operation will fail if a delete operation is
>> in progress.
>>
> This is wrong at a lot of levels.
>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>  block/ioctl.c           |  3 +++
>>  include/linux/fs.h      |  1 +
>>  kernel/trace/blktrace.c | 24 ++++++++++++++++++++++--
>>  3 files changed, 26 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/ioctl.c b/block/ioctl.c
>> index 0de02ee..7211608 100644
>> --- a/block/ioctl.c
>> +++ b/block/ioctl.c
>> @@ -86,12 +86,15 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
>>  				return -EBUSY;
>>  			}
>>  			/* all seems OK */
>> +			bdev->bd_deleting = 1;
> Note, one would need a memory barrier here. But I'm not sure how much
> of a fast path this location is.
>
> 			/*
> 			 * Make sure bd_deleting is set before taking
> 			 * mutex.
> 			 */
> 			smp_mb();
>

You are right. Some sort of memory barrier is needed to make sure that
the setting of bd_deleting won't get reordered into the
mutex_lock/mutex_unlock critical section.
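
Concretely, mutex_lock() only has acquire semantics, which do not stop
a preceding store from sinking into the critical section. A sketch of
the reordering being guarded against (illustrative only):

        /* as written */
        bdev->bd_deleting = 1;
        mutex_lock_nested(&bdev->bd_mutex, 1);

        /* what the CPU may effectively execute without smp_mb() */
        mutex_lock_nested(&bdev->bd_mutex, 1);
        bdev->bd_deleting = 1;  /* store sank past the lock acquisition */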

>>  			fsync_bdev(bdevp);
>>  			invalidate_bdev(bdevp);
>>  
>>  			mutex_lock_nested(&bdev->bd_mutex, 1);
>>  			delete_partition(disk, partno);
>>  			mutex_unlock(&bdev->bd_mutex);
>> +			bdev->bd_deleting = 0;
>> +
>>  			mutex_unlock(&bdevp->bd_mutex);
>>  			bdput(bdevp);
>>  
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 7b5d681..5d4ae9d 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -427,6 +427,7 @@ struct block_device {
>>  #endif
>>  	struct block_device *	bd_contains;
>>  	unsigned		bd_block_size;
>> +	int			bd_deleting;
>>  	struct hd_struct *	bd_part;
>>  	/* number of times partitions within this device have been opened. */
>>  	unsigned		bd_part_count;
>> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
>> index bc364f8..715e77e 100644
>> --- a/kernel/trace/blktrace.c
>> +++ b/kernel/trace/blktrace.c
>> @@ -1605,6 +1605,18 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
>>  	return bdev_get_queue(bdev);
>>  }
>>  
>> +/*
>> + * Read/write to the tracing sysfs file requires taking references to the
>> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
>> + * requires acquiring the bd_mutex and then waiting for all the sysfs
>> + * references to be gone. This can lead to deadlock if both operations
>> + * happen simultaneously. To avoid this problem, read/write to the
>> + * tracing sysfs files can now fail if the bd_mutex cannot be
>> + * acquired while a deletion operation is in progress.
>> + *
>> + * A mutex trylock loop is used assuming that tracing sysfs operations
>> + * aren't frequent enough to cause any contention problem.
>> + */
>>  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>>  					 struct device_attribute *attr,
>>  					 char *buf)
>> @@ -1622,7 +1634,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>>  	if (q == NULL)
>>  		goto out_bdput;
>>  
>> -	mutex_lock(&bdev->bd_mutex);
>> +	while (!mutex_trylock(&bdev->bd_mutex)) {
> 		/* Make sure trylock happens before reading bd_deleting */
> 		smp_mb();
>
> Since we don't take the lock, there's nothing that prevents the CPU
> from fetching bd_deleting early, and thus going into an infinite spin
> even though bd_deleting is clear (without the memory barriers).
>
>> +		if (bdev->bd_deleting)
>> +			goto out_bdput;

We don't need a memory barrier here. A READ_ONCE() is enough to make
sure that the compiler won't optimize the read out of the
mutex_trylock() loop.
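
A sketch of that variant (this reflects the position that compiler
ordering alone suffices here, which the thread does not settle):

        while (!mutex_trylock(&bdev->bd_mutex)) {
                /* Keep the compiler from hoisting the load out of the loop. */
                if (READ_ONCE(bdev->bd_deleting))
                        goto out_bdput;
                schedule();
        }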

> You also just turned the mutex into a spinlock. What happens if we just
> preempted the owner of bdev->bd_mutex and are an RT task with higher
> priority? This will turn into a live lock.
>
>> +		schedule();
>> +	}
>>  

That is OK because I used schedule() instead of cpu_relax() to
insert a delay.

Thanks for the comment. I will send out an updated patch later today.

Cheers,
Longman
Steven Rostedt Aug. 16, 2017, 6:17 p.m. UTC | #3
On Wed, 16 Aug 2017 14:14:36 -0400
Waiman Long <longman@redhat.com> wrote:

> > You also just turned the mutex into a spinlock. What happens if we just
> > preempted the owner of bdev->bd_mutex and are an RT task with higher
> > priority? This will turn into a live lock.
> >  
> >> +		schedule();
> >> +	}
> >>    
> 
> That is OK because I used schedule() instead of cpu_relax() to
> insert a delay.

Please explain to me how that is OK? schedule() is a nop if the current
task is the highest-priority task running and it has preempted the owner
of the lock. Nothing will actually schedule.

-- Steve
Waiman Long Aug. 16, 2017, 6:46 p.m. UTC | #4
On 08/16/2017 02:17 PM, Steven Rostedt wrote:
> On Wed, 16 Aug 2017 14:14:36 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>>> You also just turned the mutex into a spinlock. What happens if we just
>>> preempted the owner of bdev->bd_mutex and are an RT task with higher
>>> priority? This will turn into a live lock.
>>>  
>>>> +		schedule();
>>>> +	}
>>>>    
>> That is OK because I used schedule() instead of cpu_relax() to
>> insert a delay.
> Please explain to me how that is OK? schedule() is a nop if the current
> task is the highest-priority task running and it has preempted the owner
> of the lock. Nothing will actually schedule.
>
> -- Steve

I haven't been thinking about RT tasks. You are right that it can be a
problem in this case. I think I will have to fall back to using
mutex_lock() if an RT task is running. Though in this case, the lock
inversion problem will still be there. However, it is highly unlikely
that an RT task will need to read/write the block trace sysfs files.

Thanks for the input.

Cheers,
Longman
Steven Rostedt Aug. 16, 2017, 6:59 p.m. UTC | #5
On Wed, 16 Aug 2017 14:46:42 -0400
Waiman Long <longman@redhat.com> wrote:


> I haven't been thinking about RT tasks. You are right that it can be a
> problem in this case. I think I will have to fall back to using
> mutex_lock() if an RT task is running. Though in this case, the lock
> inversion problem will still be there. However, it is highly unlikely
> that an RT task will need to read/write the block trace sysfs files.

And it is highly unlikely that the lock inversion will happen. But
let's not swap one bug for another. And with PREEMPT_RT coming, which
can boost tasks into being RT, it becomes more likely that normally
non-RT tasks will be running as RT tasks.

-- Steve
Waiman Long Aug. 16, 2017, 7:07 p.m. UTC | #6
On 08/16/2017 02:59 PM, Steven Rostedt wrote:
> On Wed, 16 Aug 2017 14:46:42 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>
>> I haven't been thinking about RT tasks. You are right that it can be a
>> problem in this case. I think I will have to fall back to using
>> mutex_lock() if an RT task is running. Though in this case, the lock
>> inversion problem will still be there. However, it is highly unlikely
>> that an RT task will need to read/write the block trace sysfs files.
> And it is highly unlikely that the lock inversion will happen. But

That is true too:-)

> let's not swap one bug for another. And with PREEMPT_RT coming, which
> can boost tasks into being RT, it becomes more likely that normally
> non-RT tasks will be running as RT tasks.
>
> -- Steve

I am thinking about maybe letting an RT task sleep a tiny amount of
time instead of calling schedule(). Hopefully, that will allow a
lower-priority task to make forward progress.

Cheers,
Longman
Steven Rostedt Aug. 16, 2017, 7:14 p.m. UTC | #7
On Wed, 16 Aug 2017 15:07:25 -0400
Waiman Long <longman@redhat.com> wrote:


> I am thinking about maybe letting an RT task sleep a tiny amount of
> time instead of calling schedule(). Hopefully, that will allow a
> lower-priority task to make forward progress.

That is actually one of the ugly hacks we have in the PREEMPT_RT patch
(and one of the things we need to fix before getting it to mainline).

A simple msleep(1).

-- Steve
Waiman Long Aug. 16, 2017, 7:26 p.m. UTC | #8
On 08/16/2017 03:14 PM, Steven Rostedt wrote:
> On Wed, 16 Aug 2017 15:07:25 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>
>> I am thinking about maybe letting an RT task sleep a tiny amount of
>> time instead of calling schedule(). Hopefully, that will allow a
>> lower-priority task to make forward progress.
> That is actually one of the ugly hacks we have in the PREEMPT_RT patch
> (and one of the things we need to fix before getting it to mainline).
>
> A simple msleep(1).
>
> -- Steve


I am planning to use usleep_range(). That will allow a short sleep in
the us range instead of ms. The documentation suggests it for sleeps in
the 10us-20ms range.

Cheers,
Longman
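
Applied to the trylock loop, the direction discussed here would look
roughly as follows (a sketch, not a posted patch; the sleep bounds and
the READ_ONCE() are assumptions carried over from earlier in the
thread):

        while (!mutex_trylock(&bdev->bd_mutex)) {
                if (READ_ONCE(bdev->bd_deleting))
                        goto out_bdput;
                /*
                 * Sleep briefly so that a preempted lock holder can run
                 * even when the current task is an RT task.
                 */
                usleep_range(10, 20);
        }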

Patch

diff --git a/block/ioctl.c b/block/ioctl.c
index 0de02ee..7211608 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -86,12 +86,15 @@  static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 				return -EBUSY;
 			}
 			/* all seems OK */
+			bdev->bd_deleting = 1;
 			fsync_bdev(bdevp);
 			invalidate_bdev(bdevp);
 
 			mutex_lock_nested(&bdev->bd_mutex, 1);
 			delete_partition(disk, partno);
 			mutex_unlock(&bdev->bd_mutex);
+			bdev->bd_deleting = 0;
+
 			mutex_unlock(&bdevp->bd_mutex);
 			bdput(bdevp);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7b5d681..5d4ae9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -427,6 +427,7 @@  struct block_device {
 #endif
 	struct block_device *	bd_contains;
 	unsigned		bd_block_size;
+	int			bd_deleting;
 	struct hd_struct *	bd_part;
 	/* number of times partitions within this device have been opened. */
 	unsigned		bd_part_count;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..715e77e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1605,6 +1605,18 @@  static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
 	return bdev_get_queue(bdev);
 }
 
+/*
+ * Read/write to the tracing sysfs file requires taking references to the
+ * sysfs file and then acquiring the bd_mutex. Deleting a block device
+ * requires acquiring the bd_mutex and then waiting for all the sysfs
+ * references to be gone. This can lead to deadlock if both operations
+ * happen simultaneously. To avoid this problem, read/write to the
+ * tracing sysfs files can now fail if the bd_mutex cannot be
+ * acquired while a deletion operation is in progress.
+ *
+ * A mutex trylock loop is used assuming that tracing sysfs operations
+ * aren't frequent enough to cause any contention problem.
+ */
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1622,7 +1634,11 @@  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	while (!mutex_trylock(&bdev->bd_mutex)) {
+		if (bdev->bd_deleting)
+			goto out_bdput;
+		schedule();
+	}
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1683,7 +1699,11 @@  static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	while (!mutex_trylock(&bdev->bd_mutex)) {
+		if (bdev->bd_deleting)
+			goto out_bdput;
+		schedule();
+	}
 
 	if (attr == &dev_attr_enable) {
 		if (value)