diff mbox series

[1/2] block: track per-node I/O latency

Message ID 20240326153529.75989-2-hare@kernel.org (mailing list archive)
State New, archived
Headers show
Series block,nvme: latency-based I/O scheduler | expand

Commit Message

Hannes Reinecke March 26, 2024, 3:35 p.m. UTC
Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
This can be used by an I/O scheduler to determine the 'best' queue
to send I/O to.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 block/Kconfig          |   7 +
 block/Makefile         |   1 +
 block/blk-mq-debugfs.c |   2 +
 block/blk-nodelat.c    | 368 +++++++++++++++++++++++++++++++++++++++++
 block/blk-rq-qos.h     |   6 +
 include/linux/blk-mq.h |  11 ++
 6 files changed, 395 insertions(+)
 create mode 100644 block/blk-nodelat.c

Comments

kernel test robot March 27, 2024, 6:03 p.m. UTC | #1
Hi Hannes,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.9-rc1 next-20240327]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240326-234521
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20240326153529.75989-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: openrisc-allnoconfig (https://download.01.org/0day-ci/archive/20240328/202403280137.o1GjQ6cI-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240328/202403280137.o1GjQ6cI-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202403280137.o1GjQ6cI-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/blk-integrity.h:5,
                    from block/bdev.c:15:
>> include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^~
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^~~~~~~~~~~~~~~~~~~
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^~


vim +/in +1240 include/linux/blk-mq.h

  1233	
  1234	#ifdef CONFIG_BLK_NODE_LATENCY
  1235	int blk_nodelat_enable(struct request_queue *q);
  1236	void blk_nodelat_disable(struct request_queue *q);
  1237	u64 blk_nodelat_latency(struct request_queue *q, int node);
  1238	int blk_nodelat_init(struct gendisk *disk);
  1239	#else
> 1240	static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
  1241	static inline void blk_nodelat_disable(struct request_queue *q) {}
> 1242	u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
kernel test robot March 27, 2024, 8:59 p.m. UTC | #2
Hi Hannes,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.9-rc1 next-20240327]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Hannes-Reinecke/block-track-per-node-I-O-latency/20240326-234521
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20240326153529.75989-2-hare%40kernel.org
patch subject: [PATCH 1/2] block: track per-node I/O latency
config: arm-randconfig-001-20240327 (https://download.01.org/0day-ci/archive/20240328/202403280412.Ojp0tGKt-lkp@intel.com/config)
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 23de3862dce582ce91c1aa914467d982cb1a73b4)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240328/202403280412.Ojp0tGKt-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202403280412.Ojp0tGKt-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/scsi/aic7xxx/aic79xx_pci.c:44:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/arm/include/asm/cacheflush.h:10:
   In file included from include/linux/mm.h:2208:
   include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     522 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from drivers/scsi/aic7xxx/aic79xx_pci.c:44:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
   In file included from include/scsi/scsi_cmnd.h:7:
   In file included from include/linux/t10-pi.h:6:
   include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^
   include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         | ^
         | static 
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^
   2 warnings and 2 errors generated.
--
   In file included from drivers/scsi/aic7xxx/aic79xx_core.c:43:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/arm/include/asm/cacheflush.h:10:
   In file included from include/linux/mm.h:2208:
   include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     522 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from drivers/scsi/aic7xxx/aic79xx_core.c:43:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
   In file included from include/scsi/scsi_cmnd.h:7:
   In file included from include/linux/t10-pi.h:6:
   include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^
   include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         | ^
         | static 
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^
   drivers/scsi/aic7xxx/aic79xx_core.c:5694:13: warning: variable 'data_addr' set but not used [-Wunused-but-set-variable]
    5694 |                         uint64_t data_addr;
         |                                  ^
   3 warnings and 2 errors generated.
--
   In file included from drivers/scsi/aic7xxx/aic7xxx_core.c:43:
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:63:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/arm/include/asm/cacheflush.h:10:
   In file included from include/linux/mm.h:2208:
   include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     522 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from drivers/scsi/aic7xxx/aic7xxx_core.c:43:
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:74:
   In file included from include/scsi/scsi_cmnd.h:7:
   In file included from include/linux/t10-pi.h:6:
   include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^
   include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         | ^
         | static 
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^
   drivers/scsi/aic7xxx/aic7xxx_core.c:4171:13: warning: variable 'data_addr' set but not used [-Wunused-but-set-variable]
    4171 |                         uint32_t data_addr;
         |                                  ^
   3 warnings and 2 errors generated.
--
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.c:123:
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:63:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/arm/include/asm/cacheflush.h:10:
   In file included from include/linux/mm.h:2208:
   include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     522 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.c:123:
   In file included from drivers/scsi/aic7xxx/aic7xxx_osm.h:74:
   In file included from include/scsi/scsi_cmnd.h:7:
   In file included from include/linux/t10-pi.h:6:
   include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^
   include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         | ^
         | static 
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^
   drivers/scsi/aic7xxx/aic7xxx_osm.c:1435:24: warning: bitwise operation between different enumeration types ('ahc_feature' and 'ahc_flag') [-Wenum-enum-conversion]
    1435 |             && (ahc->features & AHC_SCB_BTT) == 0) {
         |                 ~~~~~~~~~~~~~ ^ ~~~~~~~~~~~
   3 warnings and 2 errors generated.
--
   In file included from drivers/scsi/aic7xxx/aic79xx_osm_pci.c:42:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:46:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:8:
   In file included from include/linux/cacheflush.h:5:
   In file included from arch/arm/include/asm/cacheflush.h:10:
   In file included from include/linux/mm.h:2208:
   include/linux/vmstat.h:522:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     522 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from drivers/scsi/aic7xxx/aic79xx_osm_pci.c:42:
   In file included from drivers/scsi/aic7xxx/aic79xx_osm.h:57:
   In file included from include/scsi/scsi_cmnd.h:7:
   In file included from include/linux/t10-pi.h:6:
   include/linux/blk-mq.h:1240:15: error: unknown type name 'in'
    1240 | static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
         |               ^
>> include/linux/blk-mq.h:1242:5: warning: no previous prototype for function 'blk_nodelat_latency' [-Wmissing-prototypes]
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         |     ^
   include/linux/blk-mq.h:1242:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1242 | u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
         | ^
         | static 
   include/linux/blk-mq.h:1243:15: error: unknown type name 'in'
    1243 | static inline in blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
         |               ^
   drivers/scsi/aic7xxx/aic79xx_osm_pci.c:177:25: warning: shift count >= width of type [-Wshift-count-overflow]
     177 |                     dma_set_mask(dev, DMA_BIT_MASK(64)) == 0)
         |                                       ^~~~~~~~~~~~~~~~
   include/linux/dma-mapping.h:77:54: note: expanded from macro 'DMA_BIT_MASK'
      77 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
         |                                                      ^ ~~~
   3 warnings and 2 errors generated.


vim +/blk_nodelat_latency +1242 include/linux/blk-mq.h

  1233	
  1234	#ifdef CONFIG_BLK_NODE_LATENCY
  1235	int blk_nodelat_enable(struct request_queue *q);
  1236	void blk_nodelat_disable(struct request_queue *q);
  1237	u64 blk_nodelat_latency(struct request_queue *q, int node);
  1238	int blk_nodelat_init(struct gendisk *disk);
  1239	#else
> 1240	static inline in blk_nodelat_enable(struct request_queue *q) { return 0; }
  1241	static inline void blk_nodelat_disable(struct request_queue *q) {}
> 1242	u64 blk_nodelat_latency(struct request_queue *q, int node) { return 0; }
diff mbox series

Patch

diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..7ce60becfb1d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,13 @@  config BLK_CGROUP_IOPRIO
 	scheduler and block devices process requests. Only some I/O schedulers
 	and some block devices support I/O priorities.
 
+config BLK_NODE_LATENCY
+       bool "Track per-node I/O latency"
+       help
+       Enable the .nlat interface for tracking per-node I/O latency.
+       This can be used by I/O schedulers to determine the queue with the
+       least latency.
+
 config BLK_DEBUG_FS
 	bool "Block layer debugging information in debugfs"
 	default y
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..e2683f55d15f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@  obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_BLK_CGROUP_IOPRIO)	+= blk-ioprio.o
 obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_BLK_CGROUP_IOCOST)	+= blk-iocost.o
+obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nodelat.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..cb38228b95d8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -762,6 +762,8 @@  static const char *rq_qos_id_to_name(enum rq_qos_id id)
 		return "latency";
 	case RQ_QOS_COST:
 		return "cost";
+	case RQ_QOS_NLAT:
+		return "node-latency";
 	}
 	return "unknown";
 }
diff --git a/block/blk-nodelat.c b/block/blk-nodelat.c
new file mode 100644
index 000000000000..45d7e622b147
--- /dev/null
+++ b/block/blk-nodelat.c
@@ -0,0 +1,368 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-node request latency tracking.
+ *
+ * Copyright (C) 2023 Hannes Reinecke
+ *
+ * A simple per-node latency tracker for use
+ * by an I/O scheduler.
+ * Latencies are measured over 'win_usec' microseconds
+ * and stored per node.
+ * If the number of measurements falls below 'lowat'
+ * the measurement is assumed to be unreliable and
+ * will become 'stale'.
+ * These 'stale' latencies can be 'decayed', where
+ * during each measurement interval the 'stale'
+ * latency value is decreased by 'decay' percent.
+ * Once the 'stale' latency reaches zero it
+ * will be updated by the measured latency.
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+
+#include "blk-stat.h"
+#include "blk-rq-qos.h"
+#include "blk.h"
+
+#define NLAT_DEFAULT_LOWAT 2
+#define NLAT_DEFAULT_DECAY 50
+
+/*
+ * Node latency tracker state, linked into the rq_qos machinery through
+ * the embedded @rqos. The @latency and @samples arrays each hold @num
+ * entries.
+ */
+struct rq_nlat {
+	struct rq_qos rqos;
+
+	u64 win_usec;		/* latency measurement window */
+	unsigned int lowat;	/* Low Watermark below which latency measurement is deemed unreliable */
+	unsigned int decay;	/* Percentage for 'decaying' latencies */
+	bool enabled;		/* set by blk_nodelat_enable(), cleared by blk_nodelat_disable() */
+
+	struct blk_stat_callback *cb;	/* stats callback driving nlat_timer_fn() */
+
+	unsigned int num;	/* number of entries in @latency and @samples */
+	u64 *latency;		/* per-bucket latency, refreshed by nlat_timer_fn() */
+	unsigned int *samples;	/* per-bucket sample count of the last window */
+};
+
+/* Map an embedded rq_qos back to the rq_nlat that contains it. */
+static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
+{
+	struct rq_nlat *nlat = container_of(rqos, struct rq_nlat, rqos);
+
+	return nlat;
+}
+
+/*
+ * Default measurement window in microseconds: 2000 (2msec) when the
+ * queue is flagged non-rotational, 75000 (75msec) otherwise.
+ */
+static u64 nlat_default_latency_usec(struct request_queue *q)
+{
+	return blk_queue_nonrot(q) ? 2000ULL : 75000ULL;
+}
+
+/*
+ * Stats timer callback: fold the per-bucket statistics of the window
+ * that just ended into nlat->latency[] and nlat->samples[], then
+ * re-arm the callback for another window while tracking is enabled.
+ */
+static void nlat_timer_fn(struct blk_stat_callback *cb)
+{
+	struct rq_nlat *nlat = cb->data;
+	int n;
+
+	for (n = 0; n < cb->buckets; n++) {
+		/* Too few samples and a non-zero stored value: treat it as stale */
+		if (cb->stat[n].nr_samples < nlat->lowat && nlat->latency[n]) {
+			/*
+			 * 'decay' the latency by the specified
+			 * percentage to ensure the nodes are
+			 * being tested to balance out temporary
+			 * latency spikes.
+			 *
+			 * NOTE(review): this scales the value *to* 'decay'
+			 * percent, whereas the file header says it is
+			 * decreased *by* 'decay' percent; the two only agree
+			 * for a decay of 50. A decay of 0 leaves the stale
+			 * value untouched.
+			 */
+			if (nlat->decay)
+				nlat->latency[n] =
+					div64_u64(nlat->latency[n] * nlat->decay, 100);
+		} else
+			/* Enough samples, or nothing stored yet: take the window mean */
+			nlat->latency[n] = cb->stat[n].mean;
+		nlat->samples[n] = cb->stat[n].nr_samples;
+	}
+	/* Start the next measurement window */
+	if (nlat->enabled)
+		blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+}
+
+/*
+ * Bucket selector handed to blk_stat_alloc_callback(): returns the node
+ * of the CPU reported by blk_mq_rq_cpu() for @rq, or -1 if @rq has no
+ * mq_ctx.
+ */
+static int nlat_node(const struct request *rq)
+{
+	unsigned int cpu;
+
+	if (!rq->mq_ctx)
+		return -1;
+
+	/* blk_mq_rq_cpu() wants a non-const request, hence the cast */
+	cpu = blk_mq_rq_cpu((struct request *)rq);
+	return cpu_to_node(cpu);
+}
+
+/*
+ * rq_qos ->exit handler: detach and free the stats callback, then
+ * release the per-node tables and the tracker itself.
+ */
+static void nlat_exit(struct rq_qos *rqos)
+{
+	struct rq_nlat *nlat = RQNLAT(rqos);
+
+	/* Remove the callback from the queue before freeing it */
+	blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
+	blk_stat_free_callback(nlat->cb);
+	kfree(nlat->samples);
+	kfree(nlat->latency);
+	kfree(nlat);
+}
+
+/**
+ * blk_nodelat_latency - look up the tracked latency for a node
+ * @q: request queue to query
+ * @node: index into the per-node latency table
+ *
+ * Return: the stored latency for @node divided by 1000, or 0 if no node
+ * latency tracker is attached to @q or @node is out of range.
+ */
+u64 blk_nodelat_latency(struct request_queue *q, int node)
+{
+	struct rq_qos *rqos;
+	struct rq_nlat *nlat;
+
+	rqos = nlat_rq_qos(q);
+	if (!rqos)
+		return 0;
+	nlat = RQNLAT(rqos);
+	/* latency[] has nlat->num entries, so valid indices are 0..num-1 */
+	if (node < 0 || (unsigned int)node >= nlat->num)
+		return 0;
+
+	return div64_u64(nlat->latency[node], 1000);
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_latency);
+
+/**
+ * blk_nodelat_enable - start node latency tracking on a queue
+ * @q: request queue
+ *
+ * Return: 0 on success, if tracking is already enabled, or if @q is not
+ * a blk-mq queue (in which case nothing is started); -EINVAL if no node
+ * latency tracker is attached to @q; -EAGAIN if @q is not registered.
+ */
+int blk_nodelat_enable(struct request_queue *q)
+{
+	struct rq_qos *rqos = nlat_rq_qos(q);
+	struct rq_nlat *nlat;
+
+	/* No tracker attached to this queue */
+	if (!rqos)
+		return -EINVAL;
+
+	nlat = RQNLAT(rqos);
+	if (nlat->enabled)
+		return 0;
+
+	/* Queue not registered? Maybe shutting down... */
+	if (!blk_queue_registered(q))
+		return -EAGAIN;
+
+	if (!queue_is_mq(q))
+		return 0;
+
+	nlat->enabled = true;
+	blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_enable);
+
+/**
+ * blk_nodelat_disable - stop node latency tracking on a queue
+ * @q: request queue
+ *
+ * Does nothing if no tracker is attached to @q or tracking is not
+ * currently enabled.
+ */
+void blk_nodelat_disable(struct request_queue *q)
+{
+	struct rq_qos *rqos = nlat_rq_qos(q);
+	struct rq_nlat *nlat;
+
+	if (!rqos)
+		return;
+
+	nlat = RQNLAT(rqos);
+	if (!nlat->enabled)
+		return;
+
+	blk_stat_deactivate(nlat->cb);
+	nlat->enabled = false;
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_disable);
+
+#ifdef CONFIG_BLK_DEBUG_FS
+/* debugfs: print the measurement window (win_usec). */
+static int nlat_win_usec_show(void *data, struct seq_file *m)
+{
+	struct rq_nlat *nlat = RQNLAT(data);
+
+	seq_printf(m, "%llu\n", nlat->win_usec);
+	return 0;
+}
+
+/*
+ * debugfs: set the measurement window (win_usec) from a decimal string.
+ *
+ * Returns @count on success, -ENOENT if the queue is dying, -EINVAL if
+ * the input does not fit the local buffer, -EFAULT on a failed user
+ * copy, or the kstrtoull() error for unparsable input.
+ */
+static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct rq_qos *rqos = data;
+	struct rq_nlat *nlat = RQNLAT(rqos);
+	char val[16] = { };
+	u64 usec;
+	int err;
+
+	if (blk_queue_dying(nlat->rqos.disk->queue))
+		return -ENOENT;
+
+	/* Leave room for the terminating NUL already present in val[] */
+	if (count >= sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(val, buf, count))
+		return -EFAULT;
+
+	err = kstrtoull(val, 10, &usec);
+	if (err)
+		return err;
+	blk_stat_deactivate(nlat->cb);
+	nlat->win_usec = usec;
+	/*
+	 * Only re-arm the callback while tracking is enabled; otherwise a
+	 * write here would restart collection behind the back of
+	 * blk_nodelat_disable().
+	 */
+	if (nlat->enabled)
+		blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+	return count;
+}
+
+/* debugfs: print the low watermark (lowat). */
+static int nlat_lowat_show(void *data, struct seq_file *m)
+{
+	struct rq_nlat *nlat = RQNLAT(data);
+
+	seq_printf(m, "%u\n", nlat->lowat);
+	return 0;
+}
+
+/*
+ * debugfs: set the low watermark (lowat) from a decimal string.
+ *
+ * Returns @count on success, -ENOENT if the queue is dying, -EINVAL if
+ * the input does not fit the local buffer, -EFAULT on a failed user
+ * copy, or the kstrtouint() error for unparsable input.
+ */
+static ssize_t nlat_lowat_write(void *data, const char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct rq_qos *rqos = data;
+	struct rq_nlat *nlat = RQNLAT(rqos);
+	char val[16] = { };
+	unsigned int lowat;
+	int err;
+
+	if (blk_queue_dying(nlat->rqos.disk->queue))
+		return -ENOENT;
+
+	/* Leave room for the terminating NUL already present in val[] */
+	if (count >= sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(val, buf, count))
+		return -EFAULT;
+
+	err = kstrtouint(val, 10, &lowat);
+	if (err)
+		return err;
+	blk_stat_deactivate(nlat->cb);
+	nlat->lowat = lowat;
+	/*
+	 * Only re-arm the callback while tracking is enabled; otherwise a
+	 * write here would restart collection behind the back of
+	 * blk_nodelat_disable().
+	 */
+	if (nlat->enabled)
+		blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+	return count;
+}
+
+/* debugfs: print the decay percentage. */
+static int nlat_decay_show(void *data, struct seq_file *m)
+{
+	struct rq_nlat *nlat = RQNLAT(data);
+
+	seq_printf(m, "%u\n", nlat->decay);
+	return 0;
+}
+
+/*
+ * debugfs: set the decay percentage (0..100) from a decimal string.
+ *
+ * Returns @count on success, -ENOENT if the queue is dying, -EINVAL if
+ * the input does not fit the local buffer or the value exceeds 100,
+ * -EFAULT on a failed user copy, or the kstrtouint() error for
+ * unparsable input.
+ */
+static ssize_t nlat_decay_write(void *data, const char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct rq_qos *rqos = data;
+	struct rq_nlat *nlat = RQNLAT(rqos);
+	char val[16] = { };
+	unsigned int decay;
+	int err;
+
+	if (blk_queue_dying(nlat->rqos.disk->queue))
+		return -ENOENT;
+
+	/* Leave room for the terminating NUL already present in val[] */
+	if (count >= sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(val, buf, count))
+		return -EFAULT;
+
+	err = kstrtouint(val, 10, &decay);
+	if (err)
+		return err;
+	if (decay > 100)
+		return -EINVAL;
+	blk_stat_deactivate(nlat->cb);
+	nlat->decay = decay;
+	/*
+	 * Only re-arm the callback while tracking is enabled; otherwise a
+	 * write here would restart collection behind the back of
+	 * blk_nodelat_disable().
+	 */
+	if (nlat->enabled)
+		blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+	return count;
+}
+
+/* debugfs: print whether tracking is currently enabled. */
+static int nlat_enabled_show(void *data, struct seq_file *m)
+{
+	struct rq_nlat *nlat = RQNLAT(data);
+
+	seq_printf(m, "%d\n", nlat->enabled);
+	return 0;
+}
+
+/* debugfs: print the rq_qos id of this tracker. */
+static int nlat_id_show(void *data, struct seq_file *m)
+{
+	const struct rq_qos *qos = data;
+
+	seq_printf(m, "%u\n", qos->id);
+	return 0;
+}
+
+/*
+ * debugfs: print one "<latency> <samples> " pair per tracked slot, all
+ * on a single line.
+ */
+static int nlat_latency_show(void *data, struct seq_file *m)
+{
+	struct rq_nlat *nlat = RQNLAT(data);
+	unsigned int idx = 0;
+
+	while (idx < nlat->num) {
+		seq_printf(m, "%llu %u ", nlat->latency[idx], nlat->samples[idx]);
+		idx++;
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+/*
+ * debugfs attributes of the tracker: win_usec, lowat and decay have
+ * both show and write handlers (mode 0600); enabled, id and latency
+ * are show-only (mode 0400). The empty entry terminates the table.
+ */
+static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
+	{"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
+	{"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
+	{"decay", 0600, nlat_decay_show, nlat_decay_write},
+	{"enabled", 0400, nlat_enabled_show},
+	{"id", 0400, nlat_id_show},
+	{"latency", 0400, nlat_latency_show},
+	{},
+};
+#endif
+
+/*
+ * rq_qos operations for the tracker: only teardown and (when debugfs
+ * support is built in) the debugfs attribute table are provided.
+ */
+static const struct rq_qos_ops nlat_rqos_ops = {
+	.exit = nlat_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+	.debugfs_attrs = nlat_debugfs_attrs,
+#endif
+};
+
+/**
+ * blk_nodelat_init - attach a node latency tracker to a disk's queue
+ * @disk: disk whose request queue gets the tracker
+ *
+ * Allocates the tracker with one latency/sample slot per possible node
+ * id and registers it as the RQ_QOS_NLAT policy. Tracking itself is not
+ * started here; see blk_nodelat_enable().
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, or the error
+ * returned by rq_qos_add().
+ */
+int blk_nodelat_init(struct gendisk *disk)
+{
+	struct rq_nlat *nlat;
+	int ret = -ENOMEM;
+
+	nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
+	if (!nlat)
+		return -ENOMEM;
+
+	/*
+	 * nlat_node() selects the bucket by node id. Node ids can be
+	 * sparse, so size the tables by nr_node_ids (highest possible
+	 * id + 1) rather than by the number of possible nodes.
+	 */
+	nlat->num = nr_node_ids;
+	nlat->lowat = NLAT_DEFAULT_LOWAT;
+	nlat->decay = NLAT_DEFAULT_DECAY;
+	nlat->latency = kcalloc(nlat->num, sizeof(*nlat->latency), GFP_KERNEL);
+	if (!nlat->latency)
+		goto err_free;
+	nlat->samples = kcalloc(nlat->num, sizeof(*nlat->samples), GFP_KERNEL);
+	if (!nlat->samples)
+		goto err_free;
+	nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_node,
+					   nlat->num, nlat);
+	if (!nlat->cb)
+		goto err_free;
+
+	nlat->win_usec = nlat_default_latency_usec(disk->queue);
+
+	/*
+	 * Register the rq_qos policy and add the stats callback.
+	 */
+	mutex_lock(&disk->queue->rq_qos_mutex);
+	ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
+	mutex_unlock(&disk->queue->rq_qos_mutex);
+	if (ret)
+		goto err_free_cb;
+
+	blk_stat_add_callback(disk->queue, nlat->cb);
+
+	return 0;
+
+err_free_cb:
+	blk_stat_free_callback(nlat->cb);
+err_free:
+	/* kfree(NULL) is a no-op, so partially set up state is fine here */
+	kfree(nlat->samples);
+	kfree(nlat->latency);
+	kfree(nlat);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_nodelat_init);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..2fc11ced0c00 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@  enum rq_qos_id {
 	RQ_QOS_WBT,
 	RQ_QOS_LATENCY,
 	RQ_QOS_COST,
+	RQ_QOS_NLAT,
 };
 
 struct rq_wait {
@@ -79,6 +80,11 @@  static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
 	return rq_qos_id(q, RQ_QOS_LATENCY);
 }
 
+/* Look up the RQ_QOS_NLAT policy on @q via rq_qos_id(). */
+static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
+{
+	return rq_qos_id(q, RQ_QOS_NLAT);
+}
+
 static inline void rq_wait_init(struct rq_wait *rq_wait)
 {
 	atomic_set(&rq_wait->inflight, 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 390d35fa0032..daeb837b9bc6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1229,4 +1229,15 @@  static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+#ifdef CONFIG_BLK_NODE_LATENCY
+int blk_nodelat_enable(struct request_queue *q);
+void blk_nodelat_disable(struct request_queue *q);
+u64 blk_nodelat_latency(struct request_queue *q, int node);
+int blk_nodelat_init(struct gendisk *disk);
+#else
+/* Stubs so that callers build without CONFIG_BLK_NODE_LATENCY */
+static inline int blk_nodelat_enable(struct request_queue *q) { return 0; }
+static inline void blk_nodelat_disable(struct request_queue *q) {}
+static inline u64 blk_nodelat_latency(struct request_queue *q, int node)
+{
+	return 0;
+}
+static inline int blk_nodelat_init(struct gendisk *disk) { return -ENOTSUPP; }
+#endif /* CONFIG_BLK_NODE_LATENCY */
 #endif /* BLK_MQ_H */