
[RFC,1/2] bdev: add support for CPU latency PM QoS tuning

Message ID 20240829075423.1345042-2-tero.kristo@linux.intel.com (mailing list archive)
State New
Series block: CPU latency PM QoS tuning

Commit Message

Tero Kristo Aug. 29, 2024, 7:18 a.m. UTC
Add support for limiting CPU latency while block IO is running. When a
block IO is started, a user-configurable CPU latency limit is put in
place (if one is set). The limit is removed again after a configurable
timeout.

Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
---
 block/bdev.c              | 51 +++++++++++++++++++++++++++++++++++++++
 block/bio.c               |  2 ++
 block/blk.h               |  2 ++
 include/linux/blk_types.h | 12 +++++++++
 4 files changed, 67 insertions(+)

Comments

Jens Axboe Aug. 29, 2024, 11:37 a.m. UTC | #1
On 8/29/24 1:18 AM, Tero Kristo wrote:
> diff --git a/block/bio.c b/block/bio.c
> index e9e809a63c59..6c46d75345d7 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -282,6 +282,8 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
>  	bio->bi_max_vecs = max_vecs;
>  	bio->bi_io_vec = table;
>  	bio->bi_pool = NULL;
> +
> +	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
>  }
>  EXPORT_SYMBOL(bio_init);

This is entirely the wrong place to do this, presumably it should be
done at IO dispatch time, not when something initializes a bio.

And also feels like entirely the wrong way to go about this, adding
overhead to potentially each IO dispatch, of which there can be millions
per second.
Tero Kristo Aug. 30, 2024, 11:55 a.m. UTC | #2
On Thu, 2024-08-29 at 05:37 -0600, Jens Axboe wrote:
> On 8/29/24 1:18 AM, Tero Kristo wrote:
> > diff --git a/block/bio.c b/block/bio.c
> > index e9e809a63c59..6c46d75345d7 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -282,6 +282,8 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
> >  	bio->bi_max_vecs = max_vecs;
> >  	bio->bi_io_vec = table;
> >  	bio->bi_pool = NULL;
> > +
> > +	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
> >  }
> >  EXPORT_SYMBOL(bio_init);
> 
> This is entirely the wrong place to do this, presumably it should be
> done at IO dispatch time, not when something initializes a bio.
> 
> And also feels like entirely the wrong way to go about this, adding
> overhead to potentially each IO dispatch, of which there can be
> millions per second.

Any thoughts on where it could/should be added?

I moved the bdev_* callback from bio_init to the location below, and it
also seems to work:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..d97a3a4252de 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2706,6 +2706,7 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
 {
        if (blk_queue_quiesced(q))
                return;
+       bdev_update_cpu_latency_pm_qos(q->disk->part0);
        q->mq_ops->queue_rqs(&plug->mq_list);
 }
Ming Lei Aug. 30, 2024, 2:26 p.m. UTC | #3
On Fri, Aug 30, 2024 at 02:55:56PM +0300, Tero Kristo wrote:
> On Thu, 2024-08-29 at 05:37 -0600, Jens Axboe wrote:
> > On 8/29/24 1:18 AM, Tero Kristo wrote:
> > > diff --git a/block/bio.c b/block/bio.c
> > > index e9e809a63c59..6c46d75345d7 100644
> > > --- a/block/bio.c
> > > +++ b/block/bio.c
> > > @@ -282,6 +282,8 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
> > >  	bio->bi_max_vecs = max_vecs;
> > >  	bio->bi_io_vec = table;
> > >  	bio->bi_pool = NULL;
> > > +
> > > +	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
> > >  }
> > >  EXPORT_SYMBOL(bio_init);
> > 
> > This is entirely the wrong place to do this, presumably it should be
> > done at IO dispatch time, not when something initializes a bio.
> > 
> > And also feels like entirely the wrong way to go about this, adding
> > overhead to potentially each IO dispatch, of which there can be
> > millions per second.
> 
> Any thoughts on where it could/should be added?
> 
> I moved the bdev_* callback from bio_init to the location below, and it
> also seems to work:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 3b4df8e5ac9e..d97a3a4252de 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2706,6 +2706,7 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
>  {
>         if (blk_queue_quiesced(q))
>                 return;
> +       bdev_update_cpu_latency_pm_qos(q->disk->part0);
>         q->mq_ops->queue_rqs(&plug->mq_list);

The IO submission CPU may not be the same as the completion CPU, so this
approach looks wrong.

What you are trying to do is prevent the IO completion CPU from entering
deep idle while there are inflight block IOs.

Only fast devices care about this CPU latency; maybe you just need to call
some generic helper in the driver (NVMe), and you may have to figure out
the exact IO completion CPU for each hardware queue with inflight IOs.

Thanks,
Ming
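
A rough sketch of the direction Ming suggests, for illustration only: a
driver-side helper that caps resume latency on the CPUs that can run a
hardware queue's completion handler, i.e. the CPUs in hctx->cpumask. The
helper name and the per-CPU request array are hypothetical, not code from
the posted series, and the timed removal from the posted patch would still
be needed on top of it:

#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/pm_qos.h>

/*
 * Hypothetical driver-side helper: keep every CPU that may service
 * completions for this hardware queue out of deep idle. 'reqs' is a
 * driver-owned array with one request slot per possible CPU.
 */
static void nvme_update_cpu_latency_qos(struct blk_mq_hw_ctx *hctx,
					struct dev_pm_qos_request *reqs,
					s32 limit_us)
{
	int cpu;

	for_each_cpu(cpu, hctx->cpumask) {
		if (!dev_pm_qos_request_active(&reqs[cpu]))
			dev_pm_qos_add_request(get_cpu_device(cpu),
					       &reqs[cpu],
					       DEV_PM_QOS_RESUME_LATENCY,
					       limit_us);
	}
}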
Tero Kristo Sept. 4, 2024, 11:37 a.m. UTC | #4
On Fri, 2024-08-30 at 22:26 +0800, Ming Lei wrote:
> On Fri, Aug 30, 2024 at 02:55:56PM +0300, Tero Kristo wrote:
> > On Thu, 2024-08-29 at 05:37 -0600, Jens Axboe wrote:
> > > On 8/29/24 1:18 AM, Tero Kristo wrote:
> > > > diff --git a/block/bio.c b/block/bio.c
> > > > index e9e809a63c59..6c46d75345d7 100644
> > > > --- a/block/bio.c
> > > > +++ b/block/bio.c
> > > > @@ -282,6 +282,8 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
> > > >  	bio->bi_max_vecs = max_vecs;
> > > >  	bio->bi_io_vec = table;
> > > >  	bio->bi_pool = NULL;
> > > > +
> > > > +	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
> > > >  }
> > > >  EXPORT_SYMBOL(bio_init);
> > > 
> > > This is entirely the wrong place to do this, presumably it should be
> > > done at IO dispatch time, not when something initializes a bio.
> > > 
> > > And also feels like entirely the wrong way to go about this, adding
> > > overhead to potentially each IO dispatch, of which there can be
> > > millions per second.
> > 
> > Any thoughts on where it could/should be added?
> > 
> > I moved the bdev_* callback from bio_init to the location below, and
> > it also seems to work:
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 3b4df8e5ac9e..d97a3a4252de 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2706,6 +2706,7 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
> >  {
> >         if (blk_queue_quiesced(q))
> >                 return;
> > +       bdev_update_cpu_latency_pm_qos(q->disk->part0);
> >         q->mq_ops->queue_rqs(&plug->mq_list);
> 
> The IO submission CPU may not be the same as the completion CPU, so this
> approach looks wrong.
> 
> What you are trying to do is prevent the IO completion CPU from entering
> deep idle while there are inflight block IOs.
> 
> Only fast devices care about this CPU latency; maybe you just need to call
> some generic helper in the driver (NVMe), and you may have to figure out
> the exact IO completion CPU for each hardware queue with inflight IOs.
> 
> Thanks,
> Ming
> 

Thanks for the feedback. I've updated my patch to work in the NVMe driver
instead, taking the queue CPU affinity into account. I will send a
separate RFC for that to the corresponding list soonish.

-Tero
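
For reference, the timed-removal half of such a driver-side variant would
mirror bdev_pm_qos_work() from the posted patch, just owned by the driver
rather than the bdev. All names below are illustrative and not taken from
the upcoming RFC:

#include <linux/pm_qos.h>
#include <linux/workqueue.h>

struct nvme_cpu_latency_qos {
	struct dev_pm_qos_request	req;
	struct delayed_work		work;
};

/*
 * Runs once the configured timeout expires without new IO re-arming it:
 * drop the QoS request so the CPU may enter deep idle again.
 */
static void nvme_pm_qos_work(struct work_struct *work)
{
	struct nvme_cpu_latency_qos *qos =
		container_of(work, struct nvme_cpu_latency_qos, work.work);

	dev_pm_qos_remove_request(&qos->req);
}

Each submission would then re-arm the removal with
mod_delayed_work(system_wq, &qos->work, msecs_to_jiffies(timeout_ms)),
exactly as the posted patch already does for the bdev case.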

Patch

diff --git a/block/bdev.c b/block/bdev.c
index 353677ac49b3..8f20681a4ea6 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -405,10 +405,18 @@  void __init bdev_cache_init(void)
 	blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
 }
 
+static void bdev_pm_qos_work(struct work_struct *work)
+{
+	struct bdev_cpu_latency_qos *qos =
+		container_of(work, struct bdev_cpu_latency_qos, work.work);
+	dev_pm_qos_remove_request(&qos->req);
+}
+
 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
 	struct block_device *bdev;
 	struct inode *inode;
+	int cpu;
 
 	inode = new_inode(blockdev_superblock);
 	if (!inode)
@@ -433,6 +441,16 @@  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+	bdev->bd_pm_qos = alloc_percpu(struct bdev_cpu_latency_qos);
+	if (!bdev->bd_pm_qos) {
+		free_percpu(bdev->bd_stats);
+		iput(inode);
+		return NULL;
+	}
+	for_each_possible_cpu(cpu)
+		INIT_DELAYED_WORK(per_cpu_ptr(&bdev->bd_pm_qos->work, cpu),
+				  bdev_pm_qos_work);
+	bdev->cpu_lat_limit = -1;
 	return bdev;
 }
 
@@ -462,6 +480,19 @@  void bdev_unhash(struct block_device *bdev)
 
 void bdev_drop(struct block_device *bdev)
 {
+	int cpu;
+	struct bdev_cpu_latency_qos *qos;
+
+	for_each_possible_cpu(cpu) {
+		qos = per_cpu_ptr(bdev->bd_pm_qos, cpu);
+		if (dev_pm_qos_request_active(&qos->req)) {
+			cancel_delayed_work(&qos->work);
+			dev_pm_qos_remove_request(&qos->req);
+		}
+	}
+
+	free_percpu(bdev->bd_pm_qos);
+
 	iput(BD_INODE(bdev));
 }
 
@@ -1281,6 +1312,26 @@  void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 	blkdev_put_no_open(bdev);
 }
 
+void bdev_update_cpu_latency_pm_qos(struct block_device *bdev)
+{
+	int cpu;
+	struct bdev_cpu_latency_qos *qos;
+
+	if (!bdev || bdev->cpu_lat_limit < 0)
+		return;
+
+	cpu = raw_smp_processor_id();
+	qos = per_cpu_ptr(bdev->bd_pm_qos, cpu);
+
+	if (!dev_pm_qos_request_active(&qos->req))
+		dev_pm_qos_add_request(get_cpu_device(cpu), &qos->req,
+				       DEV_PM_QOS_RESUME_LATENCY,
+				       bdev->cpu_lat_limit);
+
+	mod_delayed_work(system_wq, &qos->work,
+			 msecs_to_jiffies(bdev->cpu_lat_timeout));
+}
+
 bool disk_live(struct gendisk *disk)
 {
 	return !inode_unhashed(BD_INODE(disk->part0));
diff --git a/block/bio.c b/block/bio.c
index e9e809a63c59..6c46d75345d7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -282,6 +282,8 @@  void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	bio->bi_max_vecs = max_vecs;
 	bio->bi_io_vec = table;
 	bio->bi_pool = NULL;
+
+	bdev_update_cpu_latency_pm_qos(bio->bi_bdev);
 }
 EXPORT_SYMBOL(bio_init);
 
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..dda2a188984b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -516,6 +516,8 @@  void drop_partition(struct block_device *part);
 
 void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
 
+void bdev_update_cpu_latency_pm_qos(struct block_device *bdev);
+
 struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 		struct lock_class_key *lkclass);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..0ed29603eaa9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@ 
 #include <linux/device.h>
 #include <linux/ktime.h>
 #include <linux/rw_hint.h>
+#include <linux/pm_qos.h>
 
 struct bio_set;
 struct bio;
@@ -38,6 +39,11 @@  struct bio_crypt_ctx;
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
+struct bdev_cpu_latency_qos {
+	struct dev_pm_qos_request	req;
+	struct delayed_work		work;
+};
+
 struct block_device {
 	sector_t		bd_start_sect;
 	sector_t		bd_nr_sectors;
@@ -71,6 +77,12 @@  struct block_device {
 
 	struct partition_meta_info *bd_meta_info;
 	int			bd_writers;
+
+	/* For preventing deep idle during block I/O */
+	struct bdev_cpu_latency_qos __percpu	*bd_pm_qos;
+	int cpu_lat_timeout;
+	int cpu_lat_limit;
+
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
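
For reference, the semantics of the two new block_device fields as
implemented by bdev_update_cpu_latency_pm_qos() above. Setting them
directly as in this sketch is illustrative only; the user-facing knobs
are presumably the sysfs files added by patch 2/2 of the series:

/*
 * Hypothetical example: apply a 10 us resume-latency cap that is dropped
 * again after 3 ms without new IO on this bdev. A negative cpu_lat_limit
 * (the default set in bdev_alloc()) disables the mechanism entirely.
 */
static void example_enable_cpu_latency_cap(struct block_device *bdev)
{
	bdev->cpu_lat_limit = 10;	/* DEV_PM_QOS_RESUME_LATENCY is in us */
	bdev->cpu_lat_timeout = 3;	/* passed to msecs_to_jiffies(), so ms */
}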