From patchwork Mon Sep 27 22:03:25 2021
X-Patchwork-Submitter: Bart Van Assche
X-Patchwork-Id: 12521029
From: Bart Van Assche
To: Jens Axboe
Cc: linux-block@vger.kernel.org, Christoph Hellwig, Jaegeuk Kim,
 Bart Van Assche, Damien Le Moal, Niklas Cassel, Hannes Reinecke
Subject: [PATCH v2 1/4] block/mq-deadline: Improve request accounting further
Date: Mon, 27 Sep 2021 15:03:25 -0700
Message-Id: <20210927220328.1410161-2-bvanassche@acm.org>
In-Reply-To: <20210927220328.1410161-1-bvanassche@acm.org>
References: <20210927220328.1410161-1-bvanassche@acm.org>
X-Mailing-List: linux-block@vger.kernel.org

The scheduler .insert_requests() callback is called when a request is queued
for the first time and also when it is requeued. Only count a request the
first time it is queued. Additionally, since the mq-deadline scheduler only
performs zone locking for requests that have been inserted, skip the zone
unlock code for requests that have not been inserted into the mq-deadline
scheduler.
Fixes: 38ba64d12d4c ("block/mq-deadline: Track I/O statistics")
Reviewed-by: Damien Le Moal
Reviewed-by: Niklas Cassel
Cc: Hannes Reinecke
Signed-off-by: Bart Van Assche
Reviewed-by: Hannes Reinecke
---
 block/mq-deadline.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 47f042fa6a68..c27b4347ca91 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -677,8 +677,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		blk_req_zone_write_unlock(rq);

 	prio = ioprio_class_to_prio[ioprio_class];
-	dd_count(dd, inserted, prio);
-	rq->elv.priv[0] = (void *)(uintptr_t)1;
+	if (!rq->elv.priv[0]) {
+		dd_count(dd, inserted, prio);
+		rq->elv.priv[0] = (void *)(uintptr_t)1;
+	}

 	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
 		blk_mq_free_requests(&free);
@@ -759,12 +761,13 @@ static void dd_finish_request(struct request *rq)

 	/*
 	 * The block layer core may call dd_finish_request() without having
-	 * called dd_insert_requests(). Hence only update statistics for
-	 * requests for which dd_insert_requests() has been called. See also
-	 * blk_mq_request_bypass_insert().
+	 * called dd_insert_requests(). Skip requests that bypassed I/O
+	 * scheduling. See also blk_mq_request_bypass_insert().
 	 */
-	if (rq->elv.priv[0])
-		dd_count(dd, completed, prio);
+	if (!rq->elv.priv[0])
+		return;
+
+	dd_count(dd, completed, prio);

 	if (blk_queue_is_zoned(q)) {
 		unsigned long flags;

From patchwork Mon Sep 27 22:03:26 2021
X-Patchwork-Submitter: Bart Van Assche
X-Patchwork-Id: 12521037
From: Bart Van Assche
To: Jens Axboe
Cc: linux-block@vger.kernel.org, Christoph Hellwig, Jaegeuk Kim,
 Bart Van Assche, Damien Le Moal, Niklas Cassel, Hannes Reinecke
Subject: [PATCH v2 2/4] block/mq-deadline: Add an invariant check
Date: Mon, 27 Sep 2021 15:03:26 -0700
Message-Id: <20210927220328.1410161-3-bvanassche@acm.org>
In-Reply-To: <20210927220328.1410161-1-bvanassche@acm.org>
References: <20210927220328.1410161-1-bvanassche@acm.org>
X-Mailing-List: linux-block@vger.kernel.org

Check a statistics invariant at module unload time. When running blktests,
the invariant is verified every time a request queue is removed and hence is
verified at least once per test.

Reviewed-by: Damien Le Moal
Reviewed-by: Niklas Cassel
Cc: Hannes Reinecke
Signed-off-by: Bart Van Assche
Reviewed-by: Hannes Reinecke
---
 block/mq-deadline.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index c27b4347ca91..2586b3f8c7e9 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -270,6 +270,12 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 	deadline_remove_request(rq->q, per_prio, rq);
 }

+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+	return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
+}
+
 /*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@@ -539,6 +545,12 @@ static void dd_exit_sched(struct elevator_queue *e)

 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
+		WARN_ONCE(dd_queued(dd, prio) != 0,
+			  "statistics for priority %d: i %u m %u d %u c %u\n",
+			  prio, dd_sum(dd, inserted, prio),
+			  dd_sum(dd, merged, prio),
+			  dd_sum(dd, dispatched, prio),
+			  dd_sum(dd, completed, prio));
 	}

 	free_percpu(dd->stats);
@@ -950,12 +962,6 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
 	return 0;
 }

-/* Number of requests queued for a given priority level. */
-static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
-{
-	return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
-}
-
 static int dd_queued_show(void *data, struct seq_file *m)
 {
 	struct request_queue *q = data;

From patchwork Mon Sep 27 22:03:27 2021
X-Patchwork-Submitter: Bart Van Assche
X-Patchwork-Id: 12521039
From: Bart Van Assche
To: Jens Axboe
Cc: linux-block@vger.kernel.org, Christoph Hellwig, Jaegeuk Kim,
 Bart Van Assche, Damien Le Moal, Niklas Cassel, Hannes Reinecke
Subject: [PATCH v2 3/4] block/mq-deadline: Stop using per-CPU counters
Date: Mon, 27 Sep 2021 15:03:27 -0700
Message-Id: <20210927220328.1410161-4-bvanassche@acm.org>
In-Reply-To: <20210927220328.1410161-1-bvanassche@acm.org>
References: <20210927220328.1410161-1-bvanassche@acm.org>
X-Mailing-List: linux-block@vger.kernel.org

Calculating the sum over all CPUs of per-CPU counters frequently is
inefficient. Hence switch from per-CPU to individual counters. Three
counters are protected by the mq-deadline spinlock since these are only
accessed from contexts that already hold that spinlock. The fourth counter
is atomic because protecting it with the mq-deadline spinlock would trigger
lock contention.

Reviewed-by: Damien Le Moal
Reviewed-by: Niklas Cassel
Cc: Hannes Reinecke
Signed-off-by: Bart Van Assche
---
 block/mq-deadline.c | 124 ++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 68 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 2586b3f8c7e9..b262f40f32c0 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -51,17 +51,16 @@ enum dd_prio {

 enum { DD_PRIO_COUNT = 3 };

-/* I/O statistics per I/O priority. */
+/*
+ * I/O statistics per I/O priority. It is fine if these counters overflow.
+ * What matters is that these counters are at least as wide as
+ * log2(max_outstanding_requests).
+ */
 struct io_stats_per_prio {
-	local_t inserted;
-	local_t merged;
-	local_t dispatched;
-	local_t completed;
-};
-
-/* I/O statistics for all I/O priorities (enum dd_prio). */
-struct io_stats {
-	struct io_stats_per_prio stats[DD_PRIO_COUNT];
+	uint32_t inserted;
+	uint32_t merged;
+	uint32_t dispatched;
+	atomic_t completed;
 };

 /*
@@ -74,6 +73,7 @@ struct dd_per_prio {
 	struct list_head fifo_list[DD_DIR_COUNT];
 	/* Next request in FIFO order. Read, write or both are NULL. */
 	struct request *next_rq[DD_DIR_COUNT];
+	struct io_stats_per_prio stats;
 };

 struct deadline_data {
@@ -88,8 +88,6 @@ struct deadline_data {
 	unsigned int batching;		/* number of sequential requests made */
 	unsigned int starved;		/* times reads have starved writes */

-	struct io_stats __percpu *stats;
-
 	/*
 	 * settings that change how the i/o scheduler behaves
 	 */
@@ -103,33 +101,6 @@ struct deadline_data {
 	spinlock_t zone_lock;
 };

-/* Count one event of type 'event_type' and with I/O priority 'prio' */
-#define dd_count(dd, event_type, prio) do {				\
-	struct io_stats *io_stats = get_cpu_ptr((dd)->stats);		\
-									\
-	BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));	\
-	BUILD_BUG_ON(!__same_type((prio), enum dd_prio));		\
-	local_inc(&io_stats->stats[(prio)].event_type);			\
-	put_cpu_ptr(io_stats);						\
-} while (0)
-
-/*
- * Returns the total number of dd_count(dd, event_type, prio) calls across all
- * CPUs. No locking or barriers since it is fine if the returned sum is slightly
- * outdated.
- */
-#define dd_sum(dd, event_type, prio) ({					\
-	unsigned int cpu;						\
-	u32 sum = 0;							\
-									\
-	BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));	\
-	BUILD_BUG_ON(!__same_type((prio), enum dd_prio));		\
-	for_each_present_cpu(cpu)					\
-		sum += local_read(&per_cpu_ptr((dd)->stats, cpu)->	\
-				  stats[(prio)].event_type);		\
-	sum;								\
-})
-
 /* Maps an I/O priority class to a deadline scheduler priority. */
 static const enum dd_prio ioprio_class_to_prio[] = {
 	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
@@ -233,7 +204,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
 	const u8 ioprio_class = dd_rq_ioclass(next);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

-	dd_count(dd, merged, prio);
+	lockdep_assert_held(&dd->lock);
+
+	dd->per_prio[prio].stats.merged++;

 	/*
 	 * if next expires before rq, assign its expire time to rq
@@ -273,7 +246,11 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 /* Number of requests queued for a given priority level. */
 static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
 {
-	return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
+	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+	lockdep_assert_held(&dd->lock);
+
+	return stats->inserted - atomic_read(&stats->completed);
 }

 /*
@@ -463,7 +440,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 done:
 	ioprio_class = dd_rq_ioclass(rq);
 	prio = ioprio_class_to_prio[ioprio_class];
-	dd_count(dd, dispatched, prio);
+	dd->per_prio[prio].stats.dispatched++;

 	/*
 	 * If the request needs its target zone locked, do it.
 	 */
@@ -542,19 +519,22 @@ static void dd_exit_sched(struct elevator_queue *e)

 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
+		const struct io_stats_per_prio *stats = &per_prio->stats;
+		uint32_t queued;

 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
-		WARN_ONCE(dd_queued(dd, prio) != 0,
+
+		spin_lock(&dd->lock);
+		queued = dd_queued(dd, prio);
+		spin_unlock(&dd->lock);
+
+		WARN_ONCE(queued != 0,
 			  "statistics for priority %d: i %u m %u d %u c %u\n",
-			  prio, dd_sum(dd, inserted, prio),
-			  dd_sum(dd, merged, prio),
-			  dd_sum(dd, dispatched, prio),
-			  dd_sum(dd, completed, prio));
+			  prio, stats->inserted, stats->merged,
+			  stats->dispatched, atomic_read(&stats->completed));
 	}

-	free_percpu(dd->stats);
-
 	kfree(dd);
 }

@@ -578,11 +558,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)

 	eq->elevator_data = dd;

-	dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
-				     GFP_KERNEL | __GFP_ZERO);
-	if (!dd->stats)
-		goto free_dd;
-
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];

@@ -604,9 +579,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	return 0;

-free_dd:
-	kfree(dd);
-
 put_eq:
 	kobject_put(&eq->kobj);
 	return ret;
@@ -689,8 +661,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		blk_req_zone_write_unlock(rq);

 	prio = ioprio_class_to_prio[ioprio_class];
+	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
-		dd_count(dd, inserted, prio);
+		per_prio->stats.inserted++;
 		rq->elv.priv[0] = (void *)(uintptr_t)1;
 	}

@@ -701,7 +674,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,

 	trace_block_rq_insert(rq);

-	per_prio = &dd->per_prio[prio];
 	if (at_head) {
 		list_add(&rq->queuelist, &per_prio->dispatch);
 	} else {
@@ -779,7 +751,7 @@ static void dd_finish_request(struct request *rq)
 	if (!rq->elv.priv[0])
 		return;

-	dd_count(dd, completed, prio);
+	atomic_inc(&per_prio->stats.completed);

 	if (blk_queue_is_zoned(q)) {
 		unsigned long flags;
@@ -966,28 +938,44 @@ static int dd_queued_show(void *data, struct seq_file *m)
 {
 	struct request_queue *q = data;
 	struct deadline_data *dd = q->elevator->elevator_data;
+	u32 rt, be, idle;
+
+	spin_lock(&dd->lock);
+	rt = dd_queued(dd, DD_RT_PRIO);
+	be = dd_queued(dd, DD_BE_PRIO);
+	idle = dd_queued(dd, DD_IDLE_PRIO);
+	spin_unlock(&dd->lock);
+
+	seq_printf(m, "%u %u %u\n", rt, be, idle);

-	seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
-		   dd_queued(dd, DD_BE_PRIO),
-		   dd_queued(dd, DD_IDLE_PRIO));
 	return 0;
 }

 /* Number of requests owned by the block driver for a given priority. */
 static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
 {
-	return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) -
-		dd_sum(dd, completed, prio);
+	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+	lockdep_assert_held(&dd->lock);
+
+	return stats->dispatched + stats->merged -
+		atomic_read(&stats->completed);
 }

 static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 {
 	struct request_queue *q = data;
 	struct deadline_data *dd = q->elevator->elevator_data;
+	u32 rt, be, idle;
+
+	spin_lock(&dd->lock);
+	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
+	be = dd_owned_by_driver(dd, DD_BE_PRIO);
+	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
+	spin_unlock(&dd->lock);
+
+	seq_printf(m, "%u %u %u\n", rt, be, idle);

-	seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
-		   dd_owned_by_driver(dd, DD_BE_PRIO),
-		   dd_owned_by_driver(dd, DD_IDLE_PRIO));
 	return 0;
 }

From patchwork Mon Sep 27 22:03:28 2021
X-Patchwork-Submitter: Bart Van Assche
X-Patchwork-Id: 12521041
From: Bart Van Assche
To: Jens Axboe
Cc: linux-block@vger.kernel.org, Christoph Hellwig, Jaegeuk Kim,
 Bart Van Assche, Damien Le Moal, Niklas Cassel, Hannes Reinecke
Subject: [PATCH v2 4/4] block/mq-deadline: Prioritize high-priority requests
Date: Mon, 27 Sep 2021 15:03:28 -0700
Message-Id: <20210927220328.1410161-5-bvanassche@acm.org>
In-Reply-To: <20210927220328.1410161-1-bvanassche@acm.org>
References: <20210927220328.1410161-1-bvanassche@acm.org>
X-Mailing-List: linux-block@vger.kernel.org

In addition to reverting commit 7b05bf771084 ("Revert "block/mq-deadline:
Prioritize high-priority requests""), this patch uses 'jiffies' instead of
ktime_get() in the code for aging lower priority requests.

This patch has been tested as follows:
- Measured QD=1/jobs=1 IOPS for nullb with the mq-deadline scheduler.
  Result without and with this patch: 555 K IOPS.
- Measured QD=1/jobs=8 IOPS for nullb with the mq-deadline scheduler.
  Result without and with this patch: about 380 K IOPS.
- Ran the following script:

    set -e
    scriptdir=$(dirname "$0")
    if [ -e /sys/module/scsi_debug ]; then modprobe -r scsi_debug; fi
    modprobe scsi_debug ndelay=1000000 max_queue=16
    sd=''
    while [ -z "$sd" ]; do
        sd=$(basename /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/*)
    done
    echo $((100*1000)) > "/sys/block/$sd/queue/iosched/prio_aging_expire"
    if [ -e /sys/fs/cgroup/io.prio.class ]; then
        cd /sys/fs/cgroup
        echo restrict-to-be >io.prio.class
        echo +io > cgroup.subtree_control
    else
        cd /sys/fs/cgroup/blkio/
        echo restrict-to-be >blkio.prio.class
    fi
    echo $$ >cgroup.procs
    mkdir -p hipri
    cd hipri
    if [ -e io.prio.class ]; then
        echo none-to-rt >io.prio.class
    else
        echo none-to-rt >blkio.prio.class
    fi
    { "${scriptdir}/max-iops" -a1 -d32 -j1 -e mq-deadline "/dev/$sd" >& ~/low-pri.txt & }
    echo $$ >cgroup.procs
    "${scriptdir}/max-iops" -a1 -d32 -j1 -e mq-deadline "/dev/$sd" >& ~/hi-pri.txt

  Result:
  * 11000 IOPS for the high-priority job
  * 40 IOPS for the low-priority job

  If the prio aging expiry time is changed from 100s into 0, the IOPS results
  change into 6712 and 6796 IOPS.
The max-iops script runs fio with the following arguments:

    --bs=4K --gtod_reduce=1 --ioengine=libaio --ioscheduler=${arg_e}
    --runtime=60 --norandommap --rw=read --thread --buffered=0
    --numjobs=${arg_j} --iodepth=${arg_d} --iodepth_batch_submit=${arg_a}
    --iodepth_batch_complete=$((arg_d / 2)) --name=${positional_argument_1}
    --filename=${positional_argument_1}

Cc: Damien Le Moal
Cc: Niklas Cassel
Cc: Hannes Reinecke
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Reviewed-by: Hannes Reinecke
Reviewed-by: Niklas Cassel
---
 block/mq-deadline.c | 77 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b262f40f32c0..bb723478baf1 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -31,6 +31,11 @@
  */
 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int prio_aging_expire = 10 * HZ;
 static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
 					   by the above parameters. For throughput. */
@@ -96,6 +101,7 @@ struct deadline_data {
 	int writes_starved;
 	int front_merges;
 	u32 async_depth;
+	int prio_aging_expire;

 	spinlock_t lock;
 	spinlock_t zone_lock;
@@ -338,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 	return rq;
 }

+/*
+ * Returns true if and only if @rq started after @latest_start where
+ * @latest_start is in jiffies.
+ */
+static bool started_after(struct deadline_data *dd, struct request *rq,
+			  unsigned long latest_start)
+{
+	unsigned long start_time = (unsigned long)rq->fifo_time;
+
+	start_time -= dd->fifo_expire[rq_data_dir(rq)];
+
+	return time_after(start_time, latest_start);
+}
+
 /*
  * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
+ * read/write expire, fifo_batch, etc and with a start time <= @latest.
  */
 static struct request *__dd_dispatch_request(struct deadline_data *dd,
-					     struct dd_per_prio *per_prio)
+					     struct dd_per_prio *per_prio,
+					     unsigned long latest_start)
 {
 	struct request *rq, *next_rq;
 	enum dd_data_dir data_dir;
@@ -355,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	if (!list_empty(&per_prio->dispatch)) {
 		rq = list_first_entry(&per_prio->dispatch, struct request,
 				      queuelist);
+		if (started_after(dd, rq, latest_start))
+			return NULL;
 		list_del_init(&rq->queuelist);
 		goto done;
 	}
@@ -432,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching = 0;

 dispatch_request:
+	if (started_after(dd, rq, latest_start))
+		return NULL;
+
 	/*
 	 * rq is the selected appropriate request.
 	 */
@@ -449,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	return rq;
 }

+/*
+ * Check whether there are any requests with priority other than DD_RT_PRIO
+ * that were inserted more than prio_aging_expire jiffies ago.
+ */
+static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
+						      unsigned long now)
+{
+	struct request *rq;
+	enum dd_prio prio;
+	int prio_cnt;
+
+	lockdep_assert_held(&dd->lock);
+
+	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
+		   !!dd_queued(dd, DD_IDLE_PRIO);
+	if (prio_cnt < 2)
+		return NULL;
+
+	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
+		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+					   now - dd->prio_aging_expire);
+		if (rq)
+			return rq;
+	}
+
+	return NULL;
+}
+
 /*
  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
  *
@@ -460,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	const unsigned long now = jiffies;
 	struct request *rq;
 	enum dd_prio prio;

 	spin_lock(&dd->lock);
+	rq = dd_dispatch_prio_aged_requests(dd, now);
+	if (rq)
+		goto unlock;
+
+	/*
+	 * Next, dispatch requests in priority order. Ignore lower priority
+	 * requests if any higher priority requests are pending.
+	 */
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
-		rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
-		if (rq)
+		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
+		if (rq || dd_queued(dd, prio))
 			break;
 	}
+
+unlock:
 	spin_unlock(&dd->lock);

 	return rq;
@@ -573,6 +638,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->front_merges = 1;
 	dd->last_dir = DD_WRITE;
 	dd->fifo_batch = fifo_batch;
+	dd->prio_aging_expire = prio_aging_expire;
 	spin_lock_init(&dd->lock);
 	spin_lock_init(&dd->zone_lock);

@@ -796,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)
 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 SHOW_INT(deadline_front_merges_show, dd->front_merges);
 SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -825,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
 	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -843,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(front_merges),
 	DD_ATTR(async_depth),
 	DD_ATTR(fifo_batch),
+	DD_ATTR(prio_aging_expire),
 	__ATTR_NULL
 };