diff mbox series

[RFC,v2,5/5] block: revert back to synchronous request_queue removal

Message ID 20200409214530.2413-6-mcgrof@kernel.org (mailing list archive)
State New, archived
Headers show
Series blktrace: fix use after free | expand

Commit Message

Luis Chamberlain April 9, 2020, 9:45 p.m. UTC
Commit dc9edc44de6c ("block: Fix a blk_exit_rl() regression") merged on
v4.12 moved the work behind blk_release_queue() into a workqueue after a
splat floated around which indicated some work on blk_release_queue()
could sleep in blk_exit_rl(). This splat would be possible when a driver
called blk_put_queue() or blk_cleanup_queue() (which calls blk_put_queue()
as its final call) from an atomic context.

blk_put_queue() puts decrements the refcount for the request_queue
kobject, and upon reaching 0 blk_release_queue() is called. Although
blk_exit_rl() is now removed through commit db6d9952356 ("block: remove
request_list code"), we reserve the right to be able to sleep within
blk_release_queue() context. There should be little reason to
defer removal from atomic context these days, as you can always just
increase your block device's reference count even in atomic context and
leave the removal for the request_queue to the upper layers later.
However if you really need to defer removal of the request_queue, you can
set the queue flag QUEUE_FLAG_DEFER_REMOVAL now.

Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: yu kuai <yukuai3@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 block/blk-sysfs.c      | 40 ++++++++++++++++++++++++++++++++--------
 include/linux/blkdev.h |  3 +++
 2 files changed, 35 insertions(+), 8 deletions(-)

Comments

Bart Van Assche April 10, 2020, 3:12 a.m. UTC | #1
On 2020-04-09 14:45, Luis Chamberlain wrote:
> blk_put_queue() puts decrements the refcount for the request_queue
                  ^^^^
        can this word be left out?

> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8b1cab52cef9..46fee1ef92e3 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -614,6 +614,7 @@ struct request_queue {
>  #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
>  #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
> +#define QUEUE_FLAG_DEFER_REMOVAL 28	/* defer queue removal */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>  				 (1 << QUEUE_FLAG_SAME_COMP))
> @@ -648,6 +649,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>  #else
>  #define blk_queue_rq_alloc_time(q)	false
>  #endif
> +#define blk_queue_defer_removal(q) \
> +	test_bit(QUEUE_FLAG_DEFER_REMOVAL, &(q)->queue_flags)

Since blk_queue_defer_removal() has no callers the code that depends on
QUEUE_FLAG_DEFER_REMOVAL to be set will be subject to bitrot. It would
make me happy if the QUEUE_FLAG_DEFER_REMOVAL flag and the code that
depends on that flag would be removed.

Please add a might_sleep() call in blk_put_queue() since with this patch
applied it is no longer allowed to call blk_put_queue() from atomic context.

Thanks,

Bart.
Luis Chamberlain April 10, 2020, 2:34 p.m. UTC | #2
On Thu, Apr 09, 2020 at 08:12:21PM -0700, Bart Van Assche wrote:
> On 2020-04-09 14:45, Luis Chamberlain wrote:
> > blk_put_queue() puts decrements the refcount for the request_queue
>                   ^^^^
>         can this word be left out?

Sure.

> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 8b1cab52cef9..46fee1ef92e3 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -614,6 +614,7 @@ struct request_queue {
> >  #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
> >  #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
> >  #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
> > +#define QUEUE_FLAG_DEFER_REMOVAL 28	/* defer queue removal */
> >  
> >  #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
> >  				 (1 << QUEUE_FLAG_SAME_COMP))
> > @@ -648,6 +649,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
> >  #else
> >  #define blk_queue_rq_alloc_time(q)	false
> >  #endif
> > +#define blk_queue_defer_removal(q) \
> > +	test_bit(QUEUE_FLAG_DEFER_REMOVAL, &(q)->queue_flags)
> 
> Since blk_queue_defer_removal() has no callers the code that depends on
> QUEUE_FLAG_DEFER_REMOVAL to be set will be subject to bitrot. It would
> make me happy if the QUEUE_FLAG_DEFER_REMOVAL flag and the code that
> depends on that flag would be removed.

Sure thing.

Feedback on the cover letter thread patch 0/5 about whether or not to
consider userspace impact changes on these changes should be detailed on
the commit log would be useful.

> Please add a might_sleep() call in blk_put_queue() since with this patch
> applied it is no longer allowed to call blk_put_queue() from atomic context.

Sure thing.

  Luis
Luis Chamberlain April 10, 2020, 8:50 p.m. UTC | #3
On Fri, Apr 10, 2020 at 8:34 AM Luis Chamberlain <mcgrof@kernel.org> wrote:
> On Thu, Apr 09, 2020 at 08:12:21PM -0700, Bart Van Assche wrote:
> > Please add a might_sleep() call in blk_put_queue() since with this patch
> > applied it is no longer allowed to call blk_put_queue() from atomic context.
>
> Sure thing.

On second thought, I don't think blk_put_queue() would be the right
place for might_sleep(), given we really only care about the *last*
refcount decrement to 0. So I'll move it to blk_release_queue().
Granted, at that point we are too late, and we'd get a splat about
this issue *iff* we really sleep. So yeah, I do suppose that forcing
this check there still makes sense.

  Luis
Luis Chamberlain April 10, 2020, 9:27 p.m. UTC | #4
On Fri, Apr 10, 2020 at 2:50 PM Luis Chamberlain <mcgrof@kernel.org> wrote:
>
> On Fri, Apr 10, 2020 at 8:34 AM Luis Chamberlain <mcgrof@kernel.org> wrote:
> > On Thu, Apr 09, 2020 at 08:12:21PM -0700, Bart Van Assche wrote:
> > > Please add a might_sleep() call in blk_put_queue() since with this patch
> > > applied it is no longer allowed to call blk_put_queue() from atomic context.
> >
> > Sure thing.
>
> On second thought, I don't think blk_put_queue() would be the right
> place for might_sleep(), given we really only care about the *last*
> refcount decrement to 0. So I'll move it to blk_release_queue().
> Granted, at that point we are too late, and we'd get a splat about
> this issue *iff* we really sleep. So yeah, I do suppose that forcing
> this check there still makes sense.

I'll add might_sleep() to both blk_release_queue() *and* blk_cleanup_queue().

  Luis
Bart Van Assche April 11, 2020, 11:21 p.m. UTC | #5
On 2020-04-10 14:27, Luis Chamberlain wrote:
> On Fri, Apr 10, 2020 at 2:50 PM Luis Chamberlain <mcgrof@kernel.org> wrote:
>>
>> On Fri, Apr 10, 2020 at 8:34 AM Luis Chamberlain <mcgrof@kernel.org> wrote:
>>> On Thu, Apr 09, 2020 at 08:12:21PM -0700, Bart Van Assche wrote:
>>>> Please add a might_sleep() call in blk_put_queue() since with this patch
>>>> applied it is no longer allowed to call blk_put_queue() from atomic context.
>>>
>>> Sure thing.
>>
>> On second thought, I don't think blk_put_queue() would be the right
>> place for might_sleep(), given we really only care about the *last*
>> refcount decrement to 0. So I'll move it to blk_release_queue().
>> Granted, at that point we are too late, and we'd get a splat about
>> this issue *iff* we really sleep. So yeah, I do suppose that forcing
>> this check there still makes sense.
> 
> I'll add might_sleep() to both blk_release_queue() *and* blk_cleanup_queue().

Since there is already an unconditional mutex_lock() call in
blk_cleanup_queue(), do we really need to add a might_sleep() call in
blk_cleanup_queue()?

Thanks,

Bart.
Luis Chamberlain April 14, 2020, 3:38 a.m. UTC | #6
On Sat, Apr 11, 2020 at 04:21:17PM -0700, Bart Van Assche wrote:
> On 2020-04-10 14:27, Luis Chamberlain wrote:
> > On Fri, Apr 10, 2020 at 2:50 PM Luis Chamberlain <mcgrof@kernel.org> wrote:
> >>
> >> On Fri, Apr 10, 2020 at 8:34 AM Luis Chamberlain <mcgrof@kernel.org> wrote:
> >>> On Thu, Apr 09, 2020 at 08:12:21PM -0700, Bart Van Assche wrote:
> >>>> Please add a might_sleep() call in blk_put_queue() since with this patch
> >>>> applied it is no longer allowed to call blk_put_queue() from atomic context.
> >>>
> >>> Sure thing.
> >>
> >> On second thought, I don't think blk_put_queue() would be the right
> >> place for might_sleep(), given we really only care about the *last*
> >> refcount decrement to 0. So I'll move it to blk_release_queue().
> >> Granted, at that point we are too late, and we'd get a splat about
> >> this issue *iff* we really sleep. So yeah, I do suppose that forcing
> >> this check there still makes sense.
> > 
> > I'll add might_sleep() to both blk_release_queue() *and* blk_cleanup_queue().
> 
> Since there is already an unconditional mutex_lock() call in
> blk_cleanup_queue(), do we really need to add a might_sleep() call in
> blk_cleanup_queue()?

You are right, mutex_lock() already has a might_sleep() sprinkled on it.

  Luis
diff mbox series

Patch

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 20f20b0fa0b9..2ae8c39c88ef 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -860,10 +860,9 @@  static void blk_exit_queue(struct request_queue *q)
 	bdi_put(q->backing_dev_info);
 }
 
-
 /**
- * __blk_release_queue - release a request queue
- * @work: pointer to the release_work member of the request queue to be released
+ * blk_release_queue_sync - release a request queue
+ * @q: pointer to the request queue to be released
  *
  * Description:
  *     This function is called when a block device is being unregistered. The
@@ -872,11 +871,27 @@  static void blk_exit_queue(struct request_queue *q)
  *     the reference counter of the request queue. Once the reference counter
  *     of the request queue reaches zero, blk_release_queue is called to release
  *     all allocated resources of the request queue.
+ *
+ *     There are two approaches to releasing the request queue, by default
+ *     we reserve the right to sleep on release and so release is synchronous.
+ *     If you know the path under which blk_cleanup_queue() or your last
+ *     blk_put_queue() is called can be called in atomic context you want to
+ *     ensure to defer the removal by setting the QUEUE_FLAG_DEFER_REMOVAL
+ *     flag as follows upon initialization:
+ *
+ *     blk_queue_flag_set(QUEUE_FLAG_DEFER_REMOVAL, q)
+ *
+ *     Note that deferring removal may have implications for userspace. An
+ *     example is if you are using an ioctl to allow removal of a block device,
+ *     and the kernel returns immediately even though the device may only
+ *     disappear after the full removal is completed.
+ *
+ *     You should also be able to work around this by just increasing the
+ *     refcount for the block device instead during your atomic operation,
+ *     and so QUEUE_FLAG_DEFER_REMOVAL should almost never be required.
  */
-static void __blk_release_queue(struct work_struct *work)
+static void blk_release_queue_sync(struct request_queue *q)
 {
-	struct request_queue *q = container_of(work, typeof(*q), release_work);
-
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
@@ -905,13 +920,22 @@  static void __blk_release_queue(struct work_struct *work)
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
+void __blk_release_queue(struct work_struct *work)
+{
+	struct request_queue *q = container_of(work, typeof(*q), release_work);
+
+	blk_release_queue_sync(q);
+}
+
 static void blk_release_queue(struct kobject *kobj)
 {
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
-	INIT_WORK(&q->release_work, __blk_release_queue);
-	schedule_work(&q->release_work);
+	if (blk_queue_defer_removal(q))
+		schedule_work(&q->release_work);
+	else
+		blk_release_queue_sync(q);
 }
 
 static const struct sysfs_ops queue_sysfs_ops = {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8b1cab52cef9..46fee1ef92e3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -614,6 +614,7 @@  struct request_queue {
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
+#define QUEUE_FLAG_DEFER_REMOVAL 28	/* defer queue removal */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP))
@@ -648,6 +649,8 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #else
 #define blk_queue_rq_alloc_time(q)	false
 #endif
+#define blk_queue_defer_removal(q) \
+	test_bit(QUEUE_FLAG_DEFER_REMOVAL, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \