@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_QD] = "queue-depth",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ iopolicy = NVME_IOPOLICY_QD;
else
return -EINVAL;
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
- "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+ "Default multipath I/O policy; 'numa' (default) , 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
+ atomic_inc(&ns->ctrl->nr_active);
nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
jiffies);
@@ -142,6 +146,8 @@ void nvme_mpath_end_request(struct request *rq)
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
+
+ atomic_dec(&ns->ctrl->nr_active);
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
blk_rq_bytes(rq) >> SECTOR_SHIFT,
nvme_req(rq)->start_time);
@@ -330,6 +336,40 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
return found;
}
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ depth = atomic_read(&ns->ctrl->nr_active);
+
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = ns;
+ }
+ break;
+
+ case NVME_ANA_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = ns;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
@@ -338,15 +378,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
- int node = numa_node_id();
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
+ int node;
struct nvme_ns *ns;
+ /*
+ * queue-depth iopolicy does not need to reference ->current_path
+ * but round-robin needs the last path used to advance to the
+ * next one, and numa will continue to use the last path unless
+ * it is or has become not optimized
+ */
+ if (iopolicy == NVME_IOPOLICY_QD)
+ return nvme_queue_depth_path(head);
+
+ node = numa_node_id();
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ if (iopolicy == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
+
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
@@ -905,6 +957,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
mutex_init(&ctrl->ana_lock);
timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+ atomic_set(&ctrl->nr_active, 0);
}
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
@@ -354,6 +354,7 @@ struct nvme_ctrl {
size_t ana_log_size;
struct timer_list anatt_timer;
struct work_struct ana_work;
+ atomic_t nr_active;
#endif
#ifdef CONFIG_NVME_HOST_AUTH
@@ -402,6 +403,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
+ NVME_IOPOLICY_QD,
};
struct nvme_subsystem {