@@ -536,8 +536,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (blkcg_init_queue(q))
goto fail_ref;
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++)
+ for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
q->poll_info[bucket].sleep_ns = 0;
+ atomic_set(&q->poll_info[bucket].nr_misses, 0);
+ }
return q;
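For reference, the bucket index used above comes from blk_mq_poll_stats_bkt(), which spreads requests over the BLK_MQ_POLL_STATS_BKTS slots by data direction and request size. The sketch below is reconstructed from memory and the exact form varies between kernel versions, so treat it as approximate rather than a quote of the in-tree helper:

static int poll_stats_bkt_sketch(const struct request *rq)
{
	int ddir = rq_data_dir(rq);	/* 0 = read, 1 = write */
	/* bucket by direction and log2 of the size in sectors */
	int bucket = ddir + 2 * (ilog2(blk_rq_bytes(rq)) - 9);

	if (bucket < 0)
		return -1;
	if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
	return bucket;
}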
@@ -528,6 +528,34 @@ void blk_mq_free_request(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
+static inline void blk_mq_record_stats(struct request *rq, u64 now)
+{
+ int bucket = blk_mq_poll_stats_bkt(rq);
+
+ if (bucket >= 0 && !(rq->rq_flags & RQF_MQ_POLLED)) {
+ struct poll_info *pi;
+ u64 threshold;
+
+ pi = &rq->q->poll_info[bucket];
+ /*
+ * Even if the sleep time for hybrid polling was predicted well,
+ * a completion may still be overslept because of timer lag. Try
+ * to detect such outliers and skip counting them as misses.
+ */
+ threshold = pi->stat.mean;
+
+ /*
+ * Ideally the miss count stays close to 0, so this should
+ * not trigger often.
+ */
+ if (blk_rq_io_time(rq, now) < threshold)
+ atomic_inc(&pi->nr_misses);
+ }
+
+ blk_mq_poll_stats_start(rq->q);
+ blk_stat_add(rq, now);
+}
+
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
u64 now = 0;
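To make the miss condition in blk_mq_record_stats() concrete, here is a restatement of the predicate as a standalone helper; poll_overslept() is a hypothetical name used only for illustration, and the numbers in the comment are made up:

/*
 * A request counts as a miss when hybrid polling slept past its
 * completion: RQF_MQ_POLLED is still clear (it completed, e.g. via
 * IRQ, before ->poll() ever saw it) and its own I/O time is below the
 * bucket mean, so the oversleep cannot be blamed on an unusually slow
 * I/O or on timer lag. Example: with a bucket mean of 80us the hybrid
 * sleep is (80 + 1) / 2 = 40us; a request completing after 30us
 * without ever being polled is a miss, while one completing after
 * 120us is skipped as an outlier.
 */
static inline bool poll_overslept(struct request *rq, u64 now, u64 mean)
{
	return !(rq->rq_flags & RQF_MQ_POLLED) &&
	       blk_rq_io_time(rq, now) < mean;
}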
@@ -574,10 +602,8 @@ static void __blk_mq_complete_request(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
- if (rq->rq_flags & RQF_STATS) {
- blk_mq_poll_stats_start(rq->q);
- blk_stat_add(rq, ktime_get_ns());
- }
+ if (rq->rq_flags & RQF_STATS)
+ blk_mq_record_stats(rq, ktime_get_ns());
/*
* Most of single queue controllers, there is only one irq vector
* for handling IO completion, and the only irq's affinity is set
@@ -3316,14 +3342,25 @@ static void blk_mq_update_poll_info(struct poll_info *pi,
struct blk_rq_stat *stat)
{
u64 sleep_ns;
+ u32 nr_misses, nr_samples;
+
+ nr_samples = stat->nr_samples;
+ nr_misses = atomic_read(&pi->nr_misses);
+ if (nr_misses > nr_samples)
+ nr_misses = nr_samples;
- if (!stat->nr_samples)
+ if (!nr_samples)
sleep_ns = 0;
else
sleep_ns = (stat->mean + 1) / 2;
+ /*
+ * Use the miss ratio (nr_misses / nr_samples) here to adjust the
+ * sleep time; an illustrative adjustment is sketched after this hunk.
+ */
+
pi->stat = *stat;
pi->sleep_ns = sleep_ns;
+ atomic_set(&pi->nr_misses, 0);
}
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
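The hunk above computes and clamps nr_misses but leaves its actual use as a placeholder. A minimal sketch of one possible adjustment, assuming the intent is to shorten the sleep when it keeps overshooting completions; this is illustrative only, not necessarily what the series ends up doing:

	/*
	 * If more than a quarter of the sampled requests completed
	 * before they were ever polled, assume the estimate oversleeps
	 * and back the sleep time off for the next window.
	 */
	if (nr_samples && nr_misses * 4 > nr_samples)
		sleep_ns -= sleep_ns / 4;

The snippet would slot in where the placeholder comment sits, before pi->stat and pi->sleep_ns are updated.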
@@ -3389,10 +3426,6 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
rq->rq_flags |= RQF_MQ_POLL_SLEPT;
- /*
- * This will be replaced with the stats tracking code, using
- * 'avg_completion_time / 2' as the pre-sleep target.
- */
kt = nsecs;
mode = HRTIMER_MODE_REL;
@@ -3417,30 +3450,34 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
}
static bool blk_mq_poll_hybrid(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
{
- struct request *rq;
-
if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
return false;
- if (!blk_qc_t_is_internal(cookie))
- rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
- else {
- rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
- /*
- * With scheduling, if the request has completed, we'll
- * get a NULL return here, as we clear the sched tag when
- * that happens. The request still remains valid, like always,
- * so we should be safe with just the NULL check.
- */
- if (!rq)
- return false;
- }
+ /*
+ * With scheduling, if the request has completed, we'll
+ * get a NULL request here, as we clear the sched tag when
+ * that happens. The request still remains valid, like always,
+ * so we should be safe with just the NULL check.
+ */
+ if (!rq)
+ return false;
return blk_mq_poll_hybrid_sleep(q, hctx, rq);
}
+static inline struct request *qc_t_to_request(struct blk_mq_hw_ctx *hctx,
+ blk_qc_t cookie)
+{
+ struct blk_mq_tags *tags;
+
+ tags = blk_qc_t_is_internal(cookie) ? hctx->sched_tags : hctx->tags;
+
+ return blk_mq_tag_to_rq(tags, blk_qc_t_to_tag(cookie));
+}
+
/**
* blk_poll - poll for IO completions
* @q: the queue
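For context on qc_t_to_request(): the cookie handed back at submission encodes the hardware queue number, the tag, and a flag saying whether the tag is a scheduler (internal) tag, which is why the helper picks hctx->sched_tags for internal cookies. The accessors below are quoted from memory from include/linux/blk_types.h of this era, so treat the exact constants as approximate:

#define BLK_QC_T_NONE		-1U
#define BLK_QC_T_SHIFT		16
#define BLK_QC_T_INTERNAL	(1U << 31)

static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
{
	return (cookie & BLK_QC_T_INTERNAL) != 0;
}

static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
	return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
}

static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}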
@@ -3456,6 +3493,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
{
struct blk_mq_hw_ctx *hctx;
+ struct request *rq;
long state;
if (!blk_qc_t_valid(cookie) ||
@@ -3466,6 +3504,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
blk_flush_plug_list(current->plug, false);
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+ rq = qc_t_to_request(hctx, cookie);
/*
* If we sleep, have the caller restart the poll loop to reset
@@ -3474,7 +3513,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
* the IO isn't complete, we'll get called again and will go
* straight to the busy poll loop.
*/
- if (blk_mq_poll_hybrid(q, hctx, cookie))
+ if (blk_mq_poll_hybrid(q, hctx, rq))
return 1;
hctx->poll_considered++;
@@ -3486,6 +3525,9 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
hctx->poll_invoked++;
ret = q->mq_ops->poll(hctx);
+ if (rq)
+ rq->rq_flags |= RQF_MQ_POLLED;
+
if (ret > 0) {
hctx->poll_success++;
__set_current_state(TASK_RUNNING);
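To see why returning 1 from the hybrid sleep is enough, and where RQF_MQ_POLLED ends up being set, here is a sketch of the usual caller pattern, modeled loosely on the __blkdev_direct_IO_simple() loop of this era (reconstructed from memory, so details are approximate). The submitter re-checks completion after every blk_poll() call; a second call skips the sleep because RQF_MQ_POLL_SLEPT is already set and goes straight to busy polling, which is when the request gets marked RQF_MQ_POLLED:

	qc = submit_bio(&bio);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio.bi_private))	/* completion handler cleared it */
			break;
		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(bdev), qc, true))
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);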
@@ -94,7 +94,7 @@ void blk_stat_add(struct request *rq, u64 now)
int bucket;
u64 value;
- value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+ value = blk_rq_io_time(rq, now);
blk_throtl_stat_add(rq, value);
@@ -109,6 +109,9 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
+/* Request has been polled at least once */
+#define RQF_MQ_POLLED ((__force req_flags_t)(1 << 22))
+
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
@@ -389,6 +392,7 @@ struct poll_info
{
struct blk_rq_stat stat;
u64 sleep_ns;
+ atomic_t nr_misses;
};
struct request_queue {
@@ -924,6 +928,11 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
}
#endif /* CONFIG_BLK_DEV_ZONED */
+static inline u64 blk_rq_io_time(struct request *rq, u64 now)
+{
+ return (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+}
+
/*
* Some commands like WRITE SAME have a payload or data transfer size which
* is different from the size of the request. Any driver that supports such