@@ -27,6 +27,9 @@ Parameters
* - ``max_macs``
- driverinit
- The range is between 1 and 2^31. Only power of 2 values are supported.
+ * - ``cpu_affinity``
+ - driverinit | runtime
+ - An empty affinity (0) means the kernel assigns the affinity.
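A usage sketch (assuming matching ``devlink`` userspace support; the PCI address and the hexadecimal mask value below are illustrative placeholders, since the value encoding for this bitmap parameter is not confirmed here)::

  $ devlink dev param show pci/0000:06:00.0 name cpu_affinity
  $ devlink dev param set pci/0000:06:00.0 name cpu_affinity value 0xf cmode driverinit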
The ``mlx5`` driver also implements the following driver-specific
parameters.
@@ -10,6 +10,7 @@
#include "esw/qos.h"
#include "sf/dev/dev.h"
#include "sf/sf.h"
+#include "mlx5_irq.h"
static int mlx5_devlink_flash_update(struct devlink *devlink,
struct devlink_flash_update_params *params,
@@ -833,6 +834,121 @@ mlx5_devlink_max_uc_list_param_unregister(struct devlink *devlink)
devlink_param_unregister(devlink, &max_uc_list_param);
}
+static int mlx5_devlink_cpu_affinity_validate(struct devlink *devlink, u32 id,
+ union devlink_param_value val,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ cpumask_var_t tmp;
+ int max_eqs;
+ int ret = 0;
+ int last;
+
+ /* Check whether the mask is a valid CPU mask */
+ last = find_last_bit(val.vbitmap, MLX5_CPU_AFFINITY_MAX_LEN);
+ if (last == MLX5_CPU_AFFINITY_MAX_LEN)
+ /* Affinity is empty, the default policy will be used */
+ return 0;
+ if (last >= num_present_cpus()) {
+ NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't present");
+ return -ERANGE;
+ }
+
+ if (!zalloc_cpumask_var(&tmp, GFP_KERNEL))
+ return -ENOMEM;
+
+ bitmap_copy(cpumask_bits(tmp), val.vbitmap, nr_cpu_ids);
+ if (!cpumask_subset(tmp, cpu_online_mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't online");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Check whether the PF/VF/SF has enough IRQs. SFs perform the
+ * IRQ->CPU check at load time.
+ */
+ if (mlx5_core_is_sf(dev))
+ max_eqs = min_t(int, MLX5_COMP_EQS_PER_SF,
+ mlx5_irq_table_get_sfs_vec(mlx5_irq_table_get(dev)));
+ else
+ max_eqs = mlx5_irq_table_get_num_comp(mlx5_irq_table_get(dev));
+ if (cpumask_weight(tmp) > max_eqs) {
+ NL_SET_ERR_MSG_MOD(extack, "PCI Function doesn't have enough IRQs");
+ ret = -EINVAL;
+ }
+
+out:
+ free_cpumask_var(tmp);
+ return ret;
+}
+
+static int mlx5_devlink_cpu_affinity_set(struct devlink *devlink, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ /* Runtime set of cpu_affinity is not supported */
+ return -EOPNOTSUPP;
+}
+
+static int mlx5_devlink_cpu_affinity_get(struct devlink *devlink, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ cpumask_var_t dev_mask;
+
+ if (!zalloc_cpumask_var(&dev_mask, GFP_KERNEL))
+ return -ENOMEM;
+ mlx5_core_affinity_get(dev, dev_mask);
+ bitmap_copy(ctx->val.vbitmap, cpumask_bits(dev_mask), nr_cpu_ids);
+ free_cpumask_var(dev_mask);
+ return 0;
+}
+
+static const struct devlink_param cpu_affinity_param =
+ DEVLINK_PARAM_DYNAMIC_GENERIC(CPU_AFFINITY, BIT(DEVLINK_PARAM_CMODE_RUNTIME) |
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+ mlx5_devlink_cpu_affinity_get,
+ mlx5_devlink_cpu_affinity_set,
+ mlx5_devlink_cpu_affinity_validate,
+ MLX5_CPU_AFFINITY_MAX_LEN);
+
+static int mlx5_devlink_cpu_affinity_param_register(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ union devlink_param_value value;
+ cpumask_var_t dev_mask;
+ int ret = 0;
+
+ if (mlx5_core_is_sf(dev) &&
+ !mlx5_irq_table_have_dedicated_sfs_irqs(mlx5_irq_table_get(dev)))
+ return 0;
+
+ if (!zalloc_cpumask_var(&dev_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = devlink_param_register(devlink, &cpu_affinity_param);
+ if (ret)
+ goto out;
+
+ value.vbitmap = cpumask_bits(dev_mask);
+ devlink_param_driverinit_value_set(devlink,
+ DEVLINK_PARAM_GENERIC_ID_CPU_AFFINITY,
+ value);
+out:
+ free_cpumask_var(dev_mask);
+ return ret;
+}
+
+static void mlx5_devlink_cpu_affinity_param_unregister(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+
+ if (mlx5_core_is_sf(dev) &&
+ !mlx5_irq_table_have_dedicated_sfs_irqs(mlx5_irq_table_get(dev)))
+ return;
+
+ devlink_param_unregister(devlink, &cpu_affinity_param);
+}
+
#define MLX5_TRAP_DROP(_id, _group_id) \
DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \
DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \
@@ -896,6 +1012,10 @@ int mlx5_devlink_register(struct devlink *devlink)
if (err)
goto max_uc_list_err;
+ err = mlx5_devlink_cpu_affinity_param_register(devlink);
+ if (err)
+ goto cpu_affinity_err;
+
err = mlx5_devlink_traps_register(devlink);
if (err)
goto traps_reg_err;
@@ -906,6 +1026,8 @@ int mlx5_devlink_register(struct devlink *devlink)
return 0;
traps_reg_err:
+ mlx5_devlink_cpu_affinity_param_unregister(devlink);
+cpu_affinity_err:
mlx5_devlink_max_uc_list_param_unregister(devlink);
max_uc_list_err:
mlx5_devlink_auxdev_params_unregister(devlink);
@@ -918,6 +1040,7 @@ int mlx5_devlink_register(struct devlink *devlink)
void mlx5_devlink_unregister(struct devlink *devlink)
{
mlx5_devlink_traps_unregister(devlink);
+ mlx5_devlink_cpu_affinity_param_unregister(devlink);
mlx5_devlink_max_uc_list_param_unregister(devlink);
mlx5_devlink_auxdev_params_unregister(devlink);
devlink_params_unregister(devlink, mlx5_devlink_params,
@@ -6,6 +6,8 @@
#include <net/devlink.h>
+#define MLX5_CPU_AFFINITY_MAX_LEN (NR_CPUS)
+
enum mlx5_devlink_param_id {
MLX5_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE,
@@ -794,6 +794,30 @@ void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm)
}
EXPORT_SYMBOL(mlx5_eq_update_ci);
+static int comp_irqs_request_by_cpu_affinity(struct mlx5_core_dev *dev)
+{
+ struct mlx5_eq_table *table = dev->priv.eq_table;
+ struct devlink *devlink = priv_to_devlink(dev);
+ union devlink_param_value val;
+ cpumask_var_t user_mask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&user_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ val.vbitmap = cpumask_bits(user_mask);
+ ret = devlink_param_driverinit_value_get(devlink,
+ DEVLINK_PARAM_GENERIC_ID_CPU_AFFINITY,
+ &val);
+ if (ret)
+ goto out;
+
+ ret = mlx5_irqs_request_mask(dev, table->comp_irqs, user_mask);
+out:
+ free_cpumask_var(user_mask);
+ return ret;
+}
+
static void comp_irqs_release(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -817,6 +841,11 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
if (!table->comp_irqs)
return -ENOMEM;
+
+ ret = comp_irqs_request_by_cpu_affinity(dev);
+ if (ret > 0)
+ return ret;
+ mlx5_core_dbg(dev, "cpu_affinity param not applied, using default policy\n");
if (mlx5_core_is_sf(dev)) {
ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
if (ret < 0)
@@ -987,6 +1016,16 @@ mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
}
EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask);
+void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask)
+{
+ struct mlx5_eq_table *table = dev->priv.eq_table;
+ struct mlx5_eq_comp *eq;
+
+ list_for_each_entry(eq, &table->comp_eqs_list, list)
+ cpumask_or(dev_mask, dev_mask,
+ mlx5_irq_get_affinity_mask(eq->core.irq));
+}
+
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev)
{
@@ -307,4 +307,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev);
bool mlx5_vnet_supported(struct mlx5_core_dev *dev);
bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev);
+void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask);
+
#endif /* __MLX5_CORE_H__ */
@@ -16,6 +16,7 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev);
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table);
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table);
+bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table);
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
@@ -25,10 +26,12 @@ int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev);
void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq);
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
- struct cpumask *affinity);
+ const struct cpumask *affinity);
int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
struct mlx5_irq **irqs);
void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs);
+int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+ struct cpumask *irqs_req_mask);
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq);
@@ -300,7 +300,7 @@ int mlx5_irq_get_index(struct mlx5_irq *irq)
/* requesting an irq from a given pool according to given index */
static struct mlx5_irq *
irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
- struct cpumask *affinity)
+ const struct cpumask *affinity)
{
struct mlx5_irq *irq;
@@ -420,7 +420,7 @@ struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
* This function returns a pointer to IRQ, or ERR_PTR in case of error.
*/
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
- struct cpumask *affinity)
+ const struct cpumask *affinity)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool;
@@ -481,6 +481,82 @@ int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
return i ? i : PTR_ERR(irq);
}
+static int req_mask_local_spread(unsigned int i, int node,
+ const struct cpumask *irqs_req_mask)
+{
+ int cpu;
+
+ if (node == NUMA_NO_NODE) {
+ for_each_cpu_and(cpu, cpu_online_mask, irqs_req_mask)
+ if (i-- == 0)
+ return cpu;
+ } else {
+ /* NUMA first. */
+ for_each_cpu_and(cpu, cpumask_of_node(node), irqs_req_mask)
+ if (cpu_online(cpu))
+ if (i-- == 0)
+ return cpu;
+
+ for_each_cpu_and(cpu, cpu_online_mask, irqs_req_mask) {
+ /* Skip the local node's CPUs, handled above. */
+ if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
+ continue;
+
+ if (i-- == 0)
+ return cpu;
+ }
+ }
+ WARN_ON(true);
+ return cpumask_first(cpu_online_mask);
+}
+
+/**
+ * mlx5_irqs_request_mask - request one or more IRQs for mlx5 device.
+ * @dev: mlx5 device that is requesting the IRQs.
+ * @irqs: an output array of IRQs pointers.
+ * @irqs_req_mask: cpumask requested for these IRQs.
+ *
+ * Each IRQ is bound to at most one CPU.
+ * On success, this function returns the number of IRQs requested (which might
+ * be smaller than cpumask_weight(@irqs_req_mask)); on failure, it returns a
+ * negative error code.
+ */
+int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+ struct cpumask *irqs_req_mask)
+{
+ struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+ struct mlx5_irq *irq;
+ int nirqs;
+ int cpu;
+ int i;
+
+ /* Request an IRQ for each online CPU in the given mask */
+ cpumask_and(irqs_req_mask, irqs_req_mask, cpu_online_mask);
+ nirqs = cpumask_weight(irqs_req_mask);
+ for (i = 0; i < nirqs; i++) {
+ /* Iterate over the caller-provided mask in a NUMA-aware fashion:
+ * CPUs local to the device's node are requested first, followed by
+ * non-local ones.
+ */
+ cpu = req_mask_local_spread(i, dev->priv.numa_node, irqs_req_mask);
+
+ if (mlx5_irq_pool_is_sf_pool(pool))
+ irq = mlx5_irq_affinity_request(pool, cpumask_of(cpu));
+ else
+ irq = mlx5_irq_request(dev, i, cpumask_of(cpu));
+ if (IS_ERR(irq)) {
+ if (!i)
+ return PTR_ERR(irq);
+ return i;
+ }
+ irqs[i] = irq;
+ mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
+ pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
+ cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
+ mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
+ }
+ return i;
+}
+
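A minimal usage sketch of mlx5_irqs_request_mask(), assuming an existing mlx5_core_dev pointer named dev; the chosen CPUs and the array size are arbitrary:

	cpumask_var_t mask;
	struct mlx5_irq *irqs[2];
	int nirqs;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	/* Ask for one IRQ per requested CPU (0 and 2 here). */
	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(2, mask);
	nirqs = mlx5_irqs_request_mask(dev, irqs, mask);
	free_cpumask_var(mask);
	if (nirqs < 0)
		return nirqs;
	/* ... use the IRQs; release them later with
	 * mlx5_irqs_release_vectors(irqs, nirqs).
	 */

Each granted IRQ ends up pinned to a single CPU from the mask, and the number actually returned can be lower than the mask weight if some of the requested CPUs are offline.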
static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
u32 min_threshold, u32 max_threshold)
@@ -670,6 +746,11 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
pci_free_irq_vectors(dev->pdev);
}
+bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table)
+{
+ return table->sf_comp_pool;
+}
+
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
{
if (table->sf_comp_pool)