diff mbox series

block: remove per-disk debugfs files in blk_unregister_queue

Message ID 20220524083325.833981-1-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series block: remove per-disk debugfs files in blk_unregister_queue | expand

Commit Message

Christoph Hellwig May 24, 2022, 8:33 a.m. UTC
The block debugfs files are created in blk_register_queue, which is
called by add_disk and use a naming scheme based on the disk_name.
After del_gendisk returns that name can be reused and thus we must not
leave these debugfs files around, otherwise the kernel is unhappy
and spews messages like:

	Directory XXXXX with parent 'block' already present!

and the newly created devices will not have working debugfs files.

Move the unregistration to blk_unregister_queue instead (which matches
the sysfs unregistration) to make sure the debugfs life time rules match
those of the disk name.

As part of the move also make sure the whole debugfs unregistration is
inside a single debugfs_mutex critical section.

Note that this breaks blktests block/002, which checks that the debugfs
directory has not been removed while blktests is running, but that
particular check should simply be removed from the test case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-debugfs.c  |  8 ++------
 block/blk-mq-debugfs.h  |  5 -----
 block/blk-rq-qos.c      |  2 --
 block/blk-sysfs.c       | 15 +++++++--------
 kernel/trace/blktrace.c |  3 ---
 5 files changed, 9 insertions(+), 24 deletions(-)

Comments

Hannes Reinecke May 24, 2022, 9:29 a.m. UTC | #1
On 5/24/22 10:33, Christoph Hellwig wrote:
> The block debugfs files are created in blk_register_queue, which is
> called by add_disk and use a naming scheme based on the disk_name.
> After del_gendisk returns that name can be reused and thus we must not
> leave these debugfs files around, otherwise the kernel is unhappy
> and spews messages like:
> 
> 	Directory XXXXX with parent 'block' already present!
> 
> and the newly created devices will not have working debugfs files.
> 
> Move the unregistration to blk_unregister_queue instead (which matches
> the sysfs unregistration) to make sure the debugfs life time rules match
> those of the disk name.
> 
> As part of the move also make sure the whole debugfs unregistration is
> inside a single debugfs_mutex critical section.
> 
> Note that this breaks blktests block/002, which checks that the debugfs
> directory has not been removed while blktests is running, but that
> particular check should simply be removed from the test case.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/blk-mq-debugfs.c  |  8 ++------
>   block/blk-mq-debugfs.h  |  5 -----
>   block/blk-rq-qos.c      |  2 --
>   block/blk-sysfs.c       | 15 +++++++--------
>   kernel/trace/blktrace.c |  3 ---
>   5 files changed, 9 insertions(+), 24 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 7e4136a60e1cc..f7eaa5405e4b5 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -713,6 +713,8 @@ void blk_mq_debugfs_register(struct request_queue *q)
>   
>   void blk_mq_debugfs_unregister(struct request_queue *q)
>   {
> +	/* all entries were removed by the caller */
> +	q->rqos_debugfs_dir = NULL;
>   	q->sched_debugfs_dir = NULL;
>   }
>   
> @@ -833,12 +835,6 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
>   	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
>   }
>   
> -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
> -{
> -	debugfs_remove_recursive(q->rqos_debugfs_dir);
> -	q->rqos_debugfs_dir = NULL;
> -}
> -
>   void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
>   					struct blk_mq_hw_ctx *hctx)
>   {
> diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
> index 69918f4170d69..401e9e9da640b 100644
> --- a/block/blk-mq-debugfs.h
> +++ b/block/blk-mq-debugfs.h
> @@ -36,7 +36,6 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
>   
>   void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
>   void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
> -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
>   #else
>   static inline void blk_mq_debugfs_register(struct request_queue *q)
>   {
> @@ -87,10 +86,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
>   static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
>   {
>   }
> -
> -static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
> -{
> -}
>   #endif
>   
>   #ifdef CONFIG_BLK_DEBUG_FS_ZONED
> diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
> index e83af7bc75919..d3a75693adbf4 100644
> --- a/block/blk-rq-qos.c
> +++ b/block/blk-rq-qos.c
> @@ -294,8 +294,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
>   
>   void rq_qos_exit(struct request_queue *q)
>   {
> -	blk_mq_debugfs_unregister_queue_rqos(q);
> -
>   	while (q->rq_qos) {
>   		struct rq_qos *rqos = q->rq_qos;
>   		q->rq_qos = rqos->next;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 88bd41d4cb593..c02fec8e9a3f6 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -779,14 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
>   	if (queue_is_mq(q))
>   		blk_mq_release(q);
>   
> -	blk_trace_shutdown(q);
> -	mutex_lock(&q->debugfs_mutex);
> -	debugfs_remove_recursive(q->debugfs_dir);
> -	mutex_unlock(&q->debugfs_mutex);
> -
> -	if (queue_is_mq(q))
> -		blk_mq_debugfs_unregister(q);
> -
>   	bioset_exit(&q->bio_split);
>   
>   	if (blk_queue_has_srcu(q))
> @@ -951,5 +943,12 @@ void blk_unregister_queue(struct gendisk *disk)
>   
>   	mutex_unlock(&q->sysfs_dir_lock);
>   
> +	mutex_lock(&q->debugfs_mutex);
> +	blk_trace_shutdown(q);
> +	debugfs_remove_recursive(q->debugfs_dir);
> +	if (queue_is_mq(q))
> +		blk_mq_debugfs_unregister(q);
> +	mutex_unlock(&q->debugfs_mutex);
> +
Odd.
Calling debugfs_remove_recursive(q->debugfs_dir) for all queue types, 
but setting q->debugfs_dir to NULL only for mq?
Care to explain?

Cheers,

Hannes
Ming Lei May 24, 2022, 11:47 a.m. UTC | #2
On Tue, May 24, 2022 at 10:33:25AM +0200, Christoph Hellwig wrote:
> The block debugfs files are created in blk_register_queue, which is
> called by add_disk and use a naming scheme based on the disk_name.
> After del_gendisk returns that name can be reused and thus we must not
> leave these debugfs files around, otherwise the kernel is unhappy
> and spews messages like:
> 
> 	Directory XXXXX with parent 'block' already present!
> 
> and the newly created devices will not have working debugfs files.
> 
> Move the unregistration to blk_unregister_queue instead (which matches
> the sysfs unregistration) to make sure the debugfs life time rules match
> those of the disk name.

Not look into details yet, but as one blk-mq debugfs user, I don't like
the idea, since I often see io hang issue when calling
blk_mq_freeze_queue_wait(), and blk-mq debugfs is very helpful for
investigating this kind of issue.

But now blk-mq debugfs is gone with this patch __before__ draining IO in
del_gendisk, and it becomes not useful as before.


Thanks,
Ming
Christoph Hellwig May 24, 2022, noon UTC | #3
On Tue, May 24, 2022 at 11:29:05AM +0200, Hannes Reinecke wrote:
> Calling debugfs_remove_recursive(q->debugfs_dir) for all queue types, but 
> setting q->debugfs_dir to NULL only for mq?
> Care to explain?

->debugfs_dir is never set to NULL, only q->sched_debugfs_dir and
q->rqos_debugfs_dir.  Both of those are only used for blk-mq (and
don't really need the reset either, but this is all existing code
and will be dealt with later instead of a minimalistic bug fix.
Christoph Hellwig May 24, 2022, 12:01 p.m. UTC | #4
On Tue, May 24, 2022 at 07:47:03PM +0800, Ming Lei wrote:
> Not look into details yet, but as one blk-mq debugfs user, I don't like
> the idea, since I often see io hang issue when calling
> blk_mq_freeze_queue_wait(), and blk-mq debugfs is very helpful for
> investigating this kind of issue.
> 
> But now blk-mq debugfs is gone with this patch __before__ draining IO in
> del_gendisk, and it becomes not useful as before.

This is the way hot it is set up, so in doubt I want it to be torn down
synchronously.  There might be a way to move the teardown later, but
that will require a very careful audit.
Ming Lei May 24, 2022, 12:14 p.m. UTC | #5
On Tue, May 24, 2022 at 02:01:34PM +0200, Christoph Hellwig wrote:
> On Tue, May 24, 2022 at 07:47:03PM +0800, Ming Lei wrote:
> > Not look into details yet, but as one blk-mq debugfs user, I don't like
> > the idea, since I often see io hang issue when calling
> > blk_mq_freeze_queue_wait(), and blk-mq debugfs is very helpful for
> > investigating this kind of issue.
> > 
> > But now blk-mq debugfs is gone with this patch __before__ draining IO in
> > del_gendisk, and it becomes not useful as before.
> 
> This is the way hot it is set up, so in doubt I want it to be torn down
> synchronously.  There might be a way to move the teardown later, but
> that will require a very careful audit.

Then please move it to disk release at least, when blk-mq debug becomes
less useful.

In theory, blk_cleanup_queue() may have similar issue, but most of
drivers release disk after blk_cleanup_queue() except for scsi disk.

So from user viewpoint, the best place to tear down blk-mq debugfs is
still in queue release handler.

Thanks,
Ming
Christoph Hellwig May 24, 2022, 1:06 p.m. UTC | #6
On Tue, May 24, 2022 at 08:14:18PM +0800, Ming Lei wrote:
> Then please move it to disk release at least, when blk-mq debug becomes
> less useful.

No. del_gendisk is where we need stop all new external access to the
queue so that we can sanely unwind it without chasing one little thing
after another.  This is where we stop allowing opening it, stop allowing
sysfs access, and stop requests from being sent to the driver.  What
we could do is something like the patch below.  It passes basic removal
testing, but I'm a lot less confident in it than the debugfs changes
themselves:

---
From 5d553acccfc6bfbda48f13d252f1411ef65a3d0d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 May 2022 14:33:09 +0200
Subject: block: freeze the queue earlier in del_gendisk

Ming mentioned that being able to observer request in debugfs might
be useful while the queue is being frozen in del_gendisk.  Move the
free wait before blk_unregister_queue to make that possible.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/genhd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 36532b9318419..8ff5b187791af 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -621,6 +621,7 @@ void del_gendisk(struct gendisk *disk)
 	 * Prevent new I/O from crossing bio_queue_enter().
 	 */
 	blk_queue_start_drain(q);
+	blk_mq_freeze_queue_wait(q);
 
 	if (!(disk->flags & GENHD_FL_HIDDEN)) {
 		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -644,8 +645,6 @@ void del_gendisk(struct gendisk *disk)
 	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 	device_del(disk_to_dev(disk));
 
-	blk_mq_freeze_queue_wait(q);
-
 	blk_throtl_cancel_bios(disk->queue);
 
 	blk_sync_queue(q);
Ming Lei May 24, 2022, 2:44 p.m. UTC | #7
On Tue, May 24, 2022 at 10:33:25AM +0200, Christoph Hellwig wrote:
> The block debugfs files are created in blk_register_queue, which is
> called by add_disk and use a naming scheme based on the disk_name.
> After del_gendisk returns that name can be reused and thus we must not
> leave these debugfs files around, otherwise the kernel is unhappy
> and spews messages like:
> 
> 	Directory XXXXX with parent 'block' already present!
> 
> and the newly created devices will not have working debugfs files.
> 
> Move the unregistration to blk_unregister_queue instead (which matches
> the sysfs unregistration) to make sure the debugfs life time rules match
> those of the disk name.
> 
> As part of the move also make sure the whole debugfs unregistration is
> inside a single debugfs_mutex critical section.
> 
> Note that this breaks blktests block/002, which checks that the debugfs
> directory has not been removed while blktests is running, but that
> particular check should simply be removed from the test case.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-mq-debugfs.c  |  8 ++------
>  block/blk-mq-debugfs.h  |  5 -----
>  block/blk-rq-qos.c      |  2 --
>  block/blk-sysfs.c       | 15 +++++++--------
>  kernel/trace/blktrace.c |  3 ---
>  5 files changed, 9 insertions(+), 24 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 7e4136a60e1cc..f7eaa5405e4b5 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -713,6 +713,8 @@ void blk_mq_debugfs_register(struct request_queue *q)
>  
>  void blk_mq_debugfs_unregister(struct request_queue *q)
>  {
> +	/* all entries were removed by the caller */
> +	q->rqos_debugfs_dir = NULL;
>  	q->sched_debugfs_dir = NULL;
>  }
>  
> @@ -833,12 +835,6 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
>  	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
>  }
>  
> -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
> -{
> -	debugfs_remove_recursive(q->rqos_debugfs_dir);
> -	q->rqos_debugfs_dir = NULL;
> -}
> -
>  void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
>  					struct blk_mq_hw_ctx *hctx)
>  {
> diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
> index 69918f4170d69..401e9e9da640b 100644
> --- a/block/blk-mq-debugfs.h
> +++ b/block/blk-mq-debugfs.h
> @@ -36,7 +36,6 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
>  
>  void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
>  void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
> -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
>  #else
>  static inline void blk_mq_debugfs_register(struct request_queue *q)
>  {
> @@ -87,10 +86,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
>  static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
>  {
>  }
> -
> -static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
> -{
> -}
>  #endif
>  
>  #ifdef CONFIG_BLK_DEBUG_FS_ZONED
> diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
> index e83af7bc75919..d3a75693adbf4 100644
> --- a/block/blk-rq-qos.c
> +++ b/block/blk-rq-qos.c
> @@ -294,8 +294,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
>  
>  void rq_qos_exit(struct request_queue *q)
>  {
> -	blk_mq_debugfs_unregister_queue_rqos(q);
> -
>  	while (q->rq_qos) {
>  		struct rq_qos *rqos = q->rq_qos;
>  		q->rq_qos = rqos->next;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 88bd41d4cb593..c02fec8e9a3f6 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -779,14 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
>  	if (queue_is_mq(q))
>  		blk_mq_release(q);
>  
> -	blk_trace_shutdown(q);
> -	mutex_lock(&q->debugfs_mutex);
> -	debugfs_remove_recursive(q->debugfs_dir);
> -	mutex_unlock(&q->debugfs_mutex);
> -
> -	if (queue_is_mq(q))
> -		blk_mq_debugfs_unregister(q);
> -
>  	bioset_exit(&q->bio_split);
>  
>  	if (blk_queue_has_srcu(q))
> @@ -951,5 +943,12 @@ void blk_unregister_queue(struct gendisk *disk)
>  
>  	mutex_unlock(&q->sysfs_dir_lock);
>  
> +	mutex_lock(&q->debugfs_mutex);
> +	blk_trace_shutdown(q);
> +	debugfs_remove_recursive(q->debugfs_dir);

The above line code may cause kernel panic in the following code paths:

1) blk_mq_debugfs_unregister_hctxs() called from __blk_mq_update_nr_hw_queues()

2) blk_mq_debugfs_unregister_sched_hctx() called from blk_mq_exit_sched()


Thanks, 
Ming
Christoph Hellwig May 27, 2022, 7:15 a.m. UTC | #8
On Tue, May 24, 2022 at 10:44:44PM +0800, Ming Lei wrote:
> The above line code may cause kernel panic in the following code paths:
> 
> 1) blk_mq_debugfs_unregister_hctxs() called from __blk_mq_update_nr_hw_queues()
> 
> 2) blk_mq_debugfs_unregister_sched_hctx() called from blk_mq_exit_sched()

Yes.  Then again we already have the same sort of race window in reverse
at startup time, where nothing synchronizes between 
blk_mq_debugfs_register_hctxs from __blk_mq_update_nr_hw_queues and
add_disk adding debugfs files.

All this probably needs serializing using debugfs_mutex and a check of
QUEUE_FLAG_REGISTERED.
diff mbox series

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7e4136a60e1cc..f7eaa5405e4b5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -713,6 +713,8 @@  void blk_mq_debugfs_register(struct request_queue *q)
 
 void blk_mq_debugfs_unregister(struct request_queue *q)
 {
+	/* all entries were removed by the caller */
+	q->rqos_debugfs_dir = NULL;
 	q->sched_debugfs_dir = NULL;
 }
 
@@ -833,12 +835,6 @@  void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
 }
 
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
-	debugfs_remove_recursive(q->rqos_debugfs_dir);
-	q->rqos_debugfs_dir = NULL;
-}
-
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 					struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 69918f4170d69..401e9e9da640b 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -36,7 +36,6 @@  void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
@@ -87,10 +86,6 @@  static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 }
-
-static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
-}
 #endif
 
 #ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index e83af7bc75919..d3a75693adbf4 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -294,8 +294,6 @@  void rq_qos_wait(struct rq_wait *rqw, void *private_data,
 
 void rq_qos_exit(struct request_queue *q)
 {
-	blk_mq_debugfs_unregister_queue_rqos(q);
-
 	while (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
 		q->rq_qos = rqos->next;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb593..c02fec8e9a3f6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -779,14 +779,6 @@  static void blk_release_queue(struct kobject *kobj)
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
-	blk_trace_shutdown(q);
-	mutex_lock(&q->debugfs_mutex);
-	debugfs_remove_recursive(q->debugfs_dir);
-	mutex_unlock(&q->debugfs_mutex);
-
-	if (queue_is_mq(q))
-		blk_mq_debugfs_unregister(q);
-
 	bioset_exit(&q->bio_split);
 
 	if (blk_queue_has_srcu(q))
@@ -951,5 +943,12 @@  void blk_unregister_queue(struct gendisk *disk)
 
 	mutex_unlock(&q->sysfs_dir_lock);
 
+	mutex_lock(&q->debugfs_mutex);
+	blk_trace_shutdown(q);
+	debugfs_remove_recursive(q->debugfs_dir);
+	if (queue_is_mq(q))
+		blk_mq_debugfs_unregister(q);
+	mutex_unlock(&q->debugfs_mutex);
+
 	kobject_put(&disk_to_dev(disk)->kobj);
 }
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 10a32b0f2deb6..fe04c6f96ca5d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -770,14 +770,11 @@  int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
-	mutex_lock(&q->debugfs_mutex);
 	if (rcu_dereference_protected(q->blk_trace,
 				      lockdep_is_held(&q->debugfs_mutex))) {
 		__blk_trace_startstop(q, 0);
 		__blk_trace_remove(q);
 	}
-
-	mutex_unlock(&q->debugfs_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP