
[v3,for-4.13,2/6] mlx5: move affinity hints assignments to generic code

Message ID 1496644560-28923-3-git-send-email-sagi@grimberg.me (mailing list archive)
State Superseded

Commit Message

Sagi Grimberg June 5, 2017, 6:35 a.m. UTC
The generic API takes care of spreading affinity similarly to what
mlx5 open-coded (and even handles asymmetric configurations better).
Ask the generic API to spread affinity for us, and feed it
pre_vectors that do not participate in affinity settings (an
improvement over what we had before).

The affinity assignments should match what mlx5 tried to do earlier,
except that we no longer set affinity on the dedicated async, cmd and
pages vectors.

Also, remove the mlx5e_get_cpu routine: we now have generic helpers
to get the cpumask and node for a given IRQ vector, so use them
directly.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 83 ++---------------------
 include/linux/mlx5/driver.h                       |  1 -
 3 files changed, 10 insertions(+), 86 deletions(-)

Comments

Saeed Mahameed June 7, 2017, 6:16 a.m. UTC | #1
On Mon, Jun 5, 2017 at 9:35 AM, Sagi Grimberg <sagi@grimberg.me> wrote:
> generic api takes care of spreading affinity similar to
> what mlx5 open coded (and even handles better asymmetric
> configurations). Ask the generic API to spread affinity
> for us, and feed him pre_vectors that do not participate
> in affinity settings (which is an improvement to what we
> had before).
>
> The affinity assignments should match what mlx5 tried to
> do earlier but now we do not set affinity to async, cmd
> and pages dedicated vectors.
>

I am not sure the new assignment will match what we tried to do before
this patch, and I would like to preserve that behavior:
before, we simply spread the comp vectors over the CPUs of the close
NUMA node first, and then over the other cores uniformly,
i.e. we preferred the first IRQs to go to the cores of the close NUMA node.

For example, if you have 2 NUMA nodes, each with 4 CPUs, and the device
is on the 2nd NUMA node:
Numa 1 cpus: 0 1 2 3
Numa 2 cpus: 4 5 6 7

this should be the affinity:

IRQ[0] -> cpu[4] (Numa 2)
IRQ[1] -> cpu[5]
IRQ[2] -> cpu[6]
IRQ[3] -> cpu[7]

IRQ[4] -> cpu[0] (Numa 1)
IRQ[5] -> cpu[1]
IRQ[6] -> cpu[2]
IRQ[7] -> cpu[3]

Looking at irq_create_affinity_masks, this does not seem to be the case!
"nodemask_t nodemsk = NODE_MASK_NONE;" does not appear to prefer any NUMA node.

I am sure that there is a way to force our mlx5 affinity strategy and
override the default one with the new API.

>
> Also, remove mlx5e_get_cpu routine as we have generic helpers
> to get cpumask and node given a irq vector, so use them
> directly.
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Acked-by: Leon Romanovsky <leonro@mellanox.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++--
>  drivers/net/ethernet/mellanox/mlx5/core/main.c    | 83 ++---------------------
>  include/linux/mlx5/driver.h                       |  1 -
>  3 files changed, 10 insertions(+), 86 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 2a3c59e55dcf..ebfda1eae6b4 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -1565,11 +1565,6 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
>         mlx5e_free_cq(cq);
>  }
>
> -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
> -{
> -       return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
> -}
>

Let's keep this abstraction; let's even consider moving it to a
helper function in the mlx5_core driver's main.c.
It is not right for mlx5_ib and the mlx5e netdev to know about internal
mdev structures and implementation details.

I suggest moving mlx5_ib_get_vector_affinity from patch #4 into
drivers/net/ethernet/../mlx5/core/main.c, renaming it to
mlx5_get_vector_affinity, and using it from both the rdma and
netdevice drivers,

and changing the above function to:

static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
{
       return cpumask_first(mlx5_get_vector_affinity(priv->mdev, ix));
}

Also, this way you don't have to touch all the lines that use
mlx5e_get_cpu in this file.
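
For reference, a minimal sketch of what such a shared helper could look
like, assuming the patch #4 helper is just a thin wrapper around
pci_irq_get_affinity (the name and placement here are my suggestion,
not the actual patch #4 code):

static inline const struct cpumask *
mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector)
{
	/* comp vectors start after the async/cmd/pages pre_vectors */
	return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector);
}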

>                              struct mlx5e_params *params,
>                              struct mlx5e_channel_param *cparam)
> @@ -1718,11 +1713,11 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
>  {
>         struct mlx5e_cq_moder icocq_moder = {0, 0};
>         struct net_device *netdev = priv->netdev;
> -       int cpu = mlx5e_get_cpu(priv, ix);
>         struct mlx5e_channel *c;
>         int err;
>
> -       c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
> +       c = kzalloc_node(sizeof(*c), GFP_KERNEL,
> +               pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix));

This might yield behavior different from what was originally intended:
we want the node of the CPU, not of the IRQ. Maybe there is no
difference, but let's keep the mlx5e_get_cpu abstraction as above.

>         if (!c)
>                 return -ENOMEM;
>
> @@ -1730,7 +1725,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
>         c->mdev     = priv->mdev;
>         c->tstamp   = &priv->tstamp;
>         c->ix       = ix;
> -       c->cpu      = cpu;
> +       c->cpu      = cpumask_first(pci_irq_get_affinity(priv->mdev->pdev,
> +                       MLX5_EQ_VEC_COMP_BASE + ix));
>         c->pdev     = &priv->mdev->pdev->dev;
>         c->netdev   = priv->netdev;
>         c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index e4431aacce9d..7b9e7301929b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -312,6 +312,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
>  {
>         struct mlx5_priv *priv = &dev->priv;
>         struct mlx5_eq_table *table = &priv->eq_table;
> +       struct irq_affinity irqdesc = {
> +               .pre_vectors = MLX5_EQ_VEC_COMP_BASE,
> +       };
>         int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
>         int nvec;
>
> @@ -325,9 +328,10 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
>         if (!priv->irq_info)
>                 goto err_free_msix;
>
> -       nvec = pci_alloc_irq_vectors(dev->pdev,
> +       nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
>                         MLX5_EQ_VEC_COMP_BASE + 1, nvec,
> -                       PCI_IRQ_MSIX);
> +                       PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
> +                       &irqdesc);
>         if (nvec < 0)
>                 return nvec;
>
> @@ -600,71 +604,6 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
>         return (u64)timer_l | (u64)timer_h1 << 32;
>  }
>
> -static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
> -{
> -       struct mlx5_priv *priv  = &mdev->priv;
> -       int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
> -       int err;
> -
> -       if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
> -               mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
> -               return -ENOMEM;
> -       }
> -
> -       cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
> -                       priv->irq_info[i].mask);
> -
> -       err = irq_set_affinity_hint(irq, priv->irq_info[i].mask);
> -       if (err) {
> -               mlx5_core_warn(mdev, "irq_set_affinity_hint failed,irq 0x%.4x",
> -                              irq);
> -               goto err_clear_mask;
> -       }
> -
> -       return 0;
> -
> -err_clear_mask:
> -       free_cpumask_var(priv->irq_info[i].mask);
> -       return err;
> -}
> -
> -static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
> -{
> -       struct mlx5_priv *priv  = &mdev->priv;
> -       int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
> -
> -       irq_set_affinity_hint(irq, NULL);
> -       free_cpumask_var(priv->irq_info[i].mask);
> -}
> -
> -static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
> -{
> -       int err;
> -       int i;
> -
> -       for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
> -               err = mlx5_irq_set_affinity_hint(mdev, i);
> -               if (err)
> -                       goto err_out;
> -       }
> -
> -       return 0;
> -
> -err_out:
> -       for (i--; i >= 0; i--)
> -               mlx5_irq_clear_affinity_hint(mdev, i);
> -
> -       return err;
> -}
> -
> -static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
> -{
> -       int i;
> -
> -       for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
> -               mlx5_irq_clear_affinity_hint(mdev, i);
> -}
> -
>  int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
>                     unsigned int *irqn)
>  {
> @@ -1116,12 +1055,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
>                 goto err_stop_eqs;
>         }
>
> -       err = mlx5_irq_set_affinity_hints(dev);
> -       if (err) {
> -               dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
> -               goto err_affinity_hints;
> -       }
> -
>         err = mlx5_init_fs(dev);
>         if (err) {
>                 dev_err(&pdev->dev, "Failed to init flow steering\n");
> @@ -1165,9 +1098,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
>         mlx5_cleanup_fs(dev);
>
>  err_fs:
> -       mlx5_irq_clear_affinity_hints(dev);
> -
> -err_affinity_hints:
>         free_comp_eqs(dev);
>
>  err_stop_eqs:
> @@ -1234,7 +1164,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
>         mlx5_eswitch_detach(dev->priv.eswitch);
>  #endif
>         mlx5_cleanup_fs(dev);
> -       mlx5_irq_clear_affinity_hints(dev);
>         free_comp_eqs(dev);
>         mlx5_stop_eqs(dev);
>         mlx5_put_uars_page(dev, priv->uar);
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 4843fab18b83..963e3d59d740 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -527,7 +527,6 @@ struct mlx5_core_sriov {
>  };
>
>  struct mlx5_irq_info {
> -       cpumask_var_t mask;
>         char name[MLX5_MAX_IRQ_NAME];
>  };
>
> --
> 2.7.4
>
Christoph Hellwig June 7, 2017, 8:31 a.m. UTC | #2
On Wed, Jun 07, 2017 at 09:16:47AM +0300, Saeed Mahameed wrote:
> On Mon, Jun 5, 2017 at 9:35 AM, Sagi Grimberg <sagi@grimberg.me> wrote:
> > generic api takes care of spreading affinity similar to
> > what mlx5 open coded (and even handles better asymmetric
> > configurations). Ask the generic API to spread affinity
> > for us, and feed him pre_vectors that do not participate
> > in affinity settings (which is an improvement to what we
> > had before).
> >
> > The affinity assignments should match what mlx5 tried to
> > do earlier but now we do not set affinity to async, cmd
> > and pages dedicated vectors.
> >
> 
> I am not sure the new assignment will match what we tried to do before
> this patch, and i would like to preserve that behavior
> from before we simply spread comp vectors to the close numa cpus first
> then to other cores uniformly.
> i.e prefer first IRQs to go to close numa cores.
> 
> for example if you have 2 numa nodes each have 4 cpus, and the device
> is on 2nd numa,
> Numa 1 cpus: 0 1 2 3
> Numa 2 cpus: 4 5 6 7
> 
> this should be the affinity:
> 
> IRQ[0] -> cpu[4] (Numa 2)
> IRQ[1] -> cpu[5]
> IRQ[2] -> cpu[6]
> IRQ[3] -> cpu[7]
> 
> IRQ[4] -> cpu[0] (Numa 1)
> IRQ[5] -> cpu[1]
> IRQ[6] -> cpu[2]
> IRQ[7] -> cpu[3]
> 
> looking at irq_create_affinity_masks, and it seems not to be the case !
> "nodemask_t nodemsk = NODE_MASK_NONE;" it doesn't seem to prefer any numa node.

nodemsk is set up by get_nodes_in_cpumask.  The mask you should
get with the new code is:

IRQ[0] -> cpu[0] (Numa 1)
IRQ[1] -> cpu[1]
IRQ[2] -> cpu[2]
IRQ[3] -> cpu[3]

IRQ[4] -> cpu[4] (Numa 2)
IRQ[5] -> cpu[5]
IRQ[6] -> cpu[6]
IRQ[7] -> cpu[7]

Is there any reason you want to start assigning vectors on the local
node?  This is doable, but would complicate the code quite a bit,
so it needs a good argument.
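
If it helps to verify the spread on a given box, here is a hypothetical
debug helper that dumps the mask the core assigned to each completion
vector (it assumes the vectors were allocated with PCI_IRQ_AFFINITY as
in this patch; it is only a sketch, not something to merge):

static void mlx5_dbg_dump_comp_affinity(struct mlx5_core_dev *dev, int ncomp)
{
	int i;

	for (i = 0; i < ncomp; i++) {
		const struct cpumask *mask =
			pci_irq_get_affinity(dev->pdev,
					     MLX5_EQ_VEC_COMP_BASE + i);

		/* e.g. "comp vector 0 -> cpus 0-3" */
		mlx5_core_dbg(dev, "comp vector %d -> cpus %*pbl\n",
			      i, cpumask_pr_args(mask));
	}
}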
 
> I am sure that there is a way to force our mlx5 affinity strategy and
> override the default one with the new API.

No, there is not.  The whole point is that we want to come up with
a common policy instead of each driver doing their own weird little
thing.

> > -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
> > -{
> > -       return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
> > -}
> >
> 
> let's keep this abstraction, even let's consider moving this to a
> helper function in the mlx5_core dirver main.c,
> it is not right when mlx5_ib and mlx5e netdev know about internal mdev
> structures and implementations of stuff.
> 
> I suggest to move mlx5_ib_get_vector_affinity from patch #4 into
> drivers/net/ethernet/../mlx5/core/main.c
> and rename it to mlx5_get_vector_affinity and use it from both rdma
> and netdevice
> 
> and change the above function to:
> 
> static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
> {
>        return cpumask_first(mlx5_get_vector_affinity(priv->mdev, ix));
> }

Take a look at my comment on Sagi's repost.  The driver never
actually cares about this weird cpu value - it cares about a node
for the vectors, and the PCI layer provides the pci_irq_get_node
helper for that.  We could wrap this in an mlx5e helper, but that's
not really the normal style in the kernel.

> >         int err;
> >
> > -       c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
> > +       c = kzalloc_node(sizeof(*c), GFP_KERNEL,
> > +               pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix));
> 
> this might yield different behavior of what originally intended we
> want to get the node of the CPU and not of the IRQ's, maybe there is
> no difference but
> let's keep mlx5e_get_cpu abstraction as above.

It's a completely bogus abstraction.
Sagi Grimberg June 7, 2017, 9:48 a.m. UTC | #3
>> I am not sure the new assignment will match what we tried to do before
>> this patch, and i would like to preserve that behavior
>> from before we simply spread comp vectors to the close numa cpus first
>> then to other cores uniformly.
>> i.e prefer first IRQs to go to close numa cores.
>>
>> for example if you have 2 numa nodes each have 4 cpus, and the device
>> is on 2nd numa,
>> Numa 1 cpus: 0 1 2 3
>> Numa 2 cpus: 4 5 6 7
>>
>> this should be the affinity:
>>
>> IRQ[0] -> cpu[4] (Numa 2)
>> IRQ[1] -> cpu[5]
>> IRQ[2] -> cpu[6]
>> IRQ[3] -> cpu[7]
>>
>> IRQ[4] -> cpu[0] (Numa 1)
>> IRQ[5] -> cpu[1]
>> IRQ[6] -> cpu[2]
>> IRQ[7] -> cpu[3]
>>
>> looking at irq_create_affinity_masks, and it seems not to be the case !
>> "nodemask_t nodemsk = NODE_MASK_NONE;" it doesn't seem to prefer any numa node.
> 
> nodemsk is set up by get_nodes_in_cpumask.  The mask you should
> get with the new code is:
> 
> IRQ[0] -> cpu[0] (Numa 1)
> IRQ[1] -> cpu[1]
> IRQ[2] -> cpu[2]
> IRQ[3] -> cpu[3]
> 
> IRQ[4] -> cpu[4] (Numa 2)
> IRQ[5] -> cpu[5]
> IRQ[6] -> cpu[6]
> IRQ[7] -> cpu[7]
> 
> is there any reason you want to start assining vectors on the local
> node?  This is doable, but would complicate the code quite a bit
> so it needs a good argument.

My interpretation is that mlx5 tried to do this for the (rather
esoteric, in my mind) case where the platform does not have enough
vectors for the driver to allocate one per CPU. In this case, the next
best thing is to stay as close to the device affinity as possible.

>> I am sure that there is a way to force our mlx5 affinity strategy and
>> override the default one with the new API.
> 
> No, there is not.  The whole point is that we want to come up with
> a common policy instead of each driver doing their own weird little
> thing.

Agreed.

>>> -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
>>> -{
>>> -       return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
>>> -}
>>>
>>
>> let's keep this abstraction, even let's consider moving this to a
>> helper function in the mlx5_core dirver main.c,
>> it is not right when mlx5_ib and mlx5e netdev know about internal mdev
>> structures and implementations of stuff.
>>
>> I suggest to move mlx5_ib_get_vector_affinity from patch #4 into
>> drivers/net/ethernet/../mlx5/core/main.c
>> and rename it to mlx5_get_vector_affinity and use it from both rdma
>> and netdevice
>>
>> and change the above function to:
>>
>> static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
>> {
>>         return cpumask_first(mlx5_get_vector_affinity(priv->mdev, ix));
>> }
> 
> Take a look at my comment to Sagi's repost.  The driver never
> actually cares about this weird cpu value - it cares about a node
> for the vectors and PCI layer provides the pci_irq_get_node helper
> for that.  We could wrap this with a mlx5e helper, but that's not
> really the normal style in the kernel.
> 
>>>          int err;
>>>
>>> -       c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
>>> +       c = kzalloc_node(sizeof(*c), GFP_KERNEL,
>>> +               pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix));
>>
>> this might yield different behavior of what originally intended we
>> want to get the node of the CPU and not of the IRQ's, maybe there is
>> no difference but

There is no difference, the node of the CPU _is_ the node of the IRQs
(it originates from irq affinity).

>> let's keep mlx5e_get_cpu abstraction as above.
> 
> It's a completely bogus abstraction.

I tend to agree, but can easily change it.
Saeed Mahameed June 8, 2017, 9:28 a.m. UTC | #4
On Wed, Jun 7, 2017 at 12:48 PM, Sagi Grimberg <sagi@grimberg.me> wrote:
>
>>> I am not sure the new assignment will match what we tried to do before
>>> this patch, and i would like to preserve that behavior
>>> from before we simply spread comp vectors to the close numa cpus first
>>> then to other cores uniformly.
>>> i.e prefer first IRQs to go to close numa cores.
>>>
>>> for example if you have 2 numa nodes each have 4 cpus, and the device
>>> is on 2nd numa,
>>> Numa 1 cpus: 0 1 2 3
>>> Numa 2 cpus: 4 5 6 7
>>>
>>> this should be the affinity:
>>>
>>> IRQ[0] -> cpu[4] (Numa 2)
>>> IRQ[1] -> cpu[5]
>>> IRQ[2] -> cpu[6]
>>> IRQ[3] -> cpu[7]
>>>
>>> IRQ[4] -> cpu[0] (Numa 1)
>>> IRQ[5] -> cpu[1]
>>> IRQ[6] -> cpu[2]
>>> IRQ[7] -> cpu[3]
>>>
>>> looking at irq_create_affinity_masks, and it seems not to be the case !
>>> "nodemask_t nodemsk = NODE_MASK_NONE;" it doesn't seem to prefer any numa
>>> node.
>>
>>
>> nodemsk is set up by get_nodes_in_cpumask.  The mask you should
>> get with the new code is:
>>
>> IRQ[0] -> cpu[0] (Numa 1)
>> IRQ[1] -> cpu[1]
>> IRQ[2] -> cpu[2]
>> IRQ[3] -> cpu[3]
>>
>> IRQ[4] -> cpu[4] (Numa 2)
>> IRQ[5] -> cpu[5]
>> IRQ[6] -> cpu[6]
>> IRQ[7] -> cpu[7]
>>
>> is there any reason you want to start assining vectors on the local
>> node?  This is doable, but would complicate the code quite a bit
>> so it needs a good argument.
>
>
> My interpretation is that mlx5 tried to do this for the (rather esoteric
> in my mind) case where the platform does not have enough vectors for the
> driver to allocate percpu. In this case, the next best thing is to stay
> as close to the device affinity as possible.
>

No, we did it because the mlx5e netdevice assumes that
IRQ[0]..IRQ[#num_numa/#cpu_per_numa]
are always bound to the NUMA node close to the device, and the mlx5e
driver chooses those IRQs to spread the RSS hash over, never using the
other IRQs/cores.

This means that with the current mlx5e code
(mlx5e_build_default_indir_rqt), there is a good chance the driver
will only use CPUs/IRQs on the far NUMA node for its RX traffic.

One way to fix this is to change mlx5e_build_default_indir_rqt to drop
this assumption and spread the RSS hash across all the IRQs.
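
A minimal sketch of that change, assuming the indirection table is
simply filled round-robin over all channels (illustrative only; the
real function's signature may differ):

void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
				   int num_channels)
{
	int i;

	/* spread the RSS hash over every channel, no NUMA assumption */
	for (i = 0; i < len; i++)
		indirection_rqt[i] = i % num_channels;
}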

But this will increase the risk of conflicting with current net-next.
Is Doug OK with this?  Did we merge-test it?

>>> I am sure that there is a way to force our mlx5 affinity strategy and
>>> override the default one with the new API.
>>
>>
>> No, there is not.  The whole point is that we want to come up with
>> a common policy instead of each driver doing their own weird little
>> thing.
>
>
> Agreed.
>
>

I can live with that, but please address the above, since it will be a
regression.

>>>> -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
>>>> -{
>>>> -       return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
>>>> -}
>>>>
>>>
>>> let's keep this abstraction, even let's consider moving this to a
>>> helper function in the mlx5_core dirver main.c,
>>> it is not right when mlx5_ib and mlx5e netdev know about internal mdev
>>> structures and implementations of stuff.
>>>
>>> I suggest to move mlx5_ib_get_vector_affinity from patch #4 into
>>> drivers/net/ethernet/../mlx5/core/main.c
>>> and rename it to mlx5_get_vector_affinity and use it from both rdma
>>> and netdevice
>>>
>>> and change the above function to:
>>>
>>> static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
>>> {
>>>         return cpumask_first(mlx5_get_vector_affinity(priv->mdev, ix));
>>> }
>>
>>
>> Take a look at my comment to Sagi's repost.  The driver never
>> actually cares about this weird cpu value - it cares about a node
>> for the vectors and PCI layer provides the pci_irq_get_node helper
>> for that.  We could wrap this with a mlx5e helper, but that's not
>> really the normal style in the kernel.
>>
>>>>          int err;
>>>>
>>>> -       c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
>>>> +       c = kzalloc_node(sizeof(*c), GFP_KERNEL,
>>>> +               pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE
>>>> + ix));
>>>
>>>
>>> this might yield different behavior of what originally intended we
>>> want to get the node of the CPU and not of the IRQ's, maybe there is
>>> no difference but
>
>
> There is no difference, the node of the CPU _is_ the node of the IRQs
> (it originates from irq affinity).
>
>>> let's keep mlx5e_get_cpu abstraction as above.
>>
>>
>> It's a completely bogus abstraction.
>
>
> I tend to agree, but can easily change it.

At least change it to mlx5e_get_node!  As I said, I don't want to
pepper the mlx5e code with stuff like
(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); just call
mlx5e_get_channel_node(priv, channel_ix);
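
Something along these lines (a sketch of the suggested wrapper; the
name and placement follow my suggestion above, not an existing patch):

static inline int mlx5e_get_channel_node(struct mlx5e_priv *priv,
					 int channel_ix)
{
	return pci_irq_get_node(priv->mdev->pdev,
				MLX5_EQ_VEC_COMP_BASE + channel_ix);
}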
Sagi Grimberg June 8, 2017, 10:16 a.m. UTC | #5
>>> is there any reason you want to start assining vectors on the local
>>> node?  This is doable, but would complicate the code quite a bit
>>> so it needs a good argument.
>>
>>
>> My interpretation is that mlx5 tried to do this for the (rather esoteric
>> in my mind) case where the platform does not have enough vectors for the
>> driver to allocate percpu. In this case, the next best thing is to stay
>> as close to the device affinity as possible.
>>
> 
> No, we did it for the reason that mlx5e netdevice assumes that
> IRQ[0]..IRQ[#num_numa/#cpu_per_numa]
> are always bound to the numa close to the device. and the mlx5e driver
> choose those IRQs to spread
> the RSS hash only into them and never uses other IRQs/Cores

OK, that explains a lot of weirdness I've seen with mlx5e.

Can you explain why you're using only a single NUMA node for your RSS
table?  What does it buy you?  You open RX rings for _all_ CPUs but
only spread over part of them?  I must be missing something here...
Saeed Mahameed June 8, 2017, 11:42 a.m. UTC | #6
On Thu, Jun 8, 2017 at 1:16 PM, Sagi Grimberg <sagi@grimberg.me> wrote:
>
>>>> is there any reason you want to start assining vectors on the local
>>>> node?  This is doable, but would complicate the code quite a bit
>>>> so it needs a good argument.
>>>
>>>
>>>
>>> My interpretation is that mlx5 tried to do this for the (rather esoteric
>>> in my mind) case where the platform does not have enough vectors for the
>>> driver to allocate percpu. In this case, the next best thing is to stay
>>> as close to the device affinity as possible.
>>>
>>
>> No, we did it for the reason that mlx5e netdevice assumes that
>> IRQ[0]..IRQ[#num_numa/#cpu_per_numa]
>> are always bound to the numa close to the device. and the mlx5e driver
>> choose those IRQs to spread
>> the RSS hash only into them and never uses other IRQs/Cores
>
>
> OK, that explains a lot of weirdness I've seen with mlx5e.
>
> Can you explain why you're using only a single numa node for your RSS
> table? What does it buy you? You open RX rings for _all_ cpus but
> only spread on part of them? I must be missing something here...

Adding Tariq,

This is also part of the weirdness :).  We do that to make sure that
any out-of-the-box test you run always gets the best performance, and
we guarantee that we always use the cores of the close NUMA node.

We open RX rings on all of the cores in case the user wants to change
the RSS table to point to the whole set on the fly with "ethtool -X".

But we are willing to change that; Tariq can provide the patch.
Without that change, mlx5e is broken.
Sagi Grimberg June 8, 2017, 12:29 p.m. UTC | #7
>>>> My interpretation is that mlx5 tried to do this for the (rather esoteric
>>>> in my mind) case where the platform does not have enough vectors for the
>>>> driver to allocate percpu. In this case, the next best thing is to stay
>>>> as close to the device affinity as possible.
>>>>
>>>
>>> No, we did it for the reason that mlx5e netdevice assumes that
>>> IRQ[0]..IRQ[#num_numa/#cpu_per_numa]
>>> are always bound to the numa close to the device. and the mlx5e driver
>>> choose those IRQs to spread
>>> the RSS hash only into them and never uses other IRQs/Cores
>>
>>
>> OK, that explains a lot of weirdness I've seen with mlx5e.
>>
>> Can you explain why you're using only a single numa node for your RSS
>> table? What does it buy you? You open RX rings for _all_ cpus but
>> only spread on part of them? I must be missing something here...
> 
> Adding Tariq,
> 
> this is also part of the weirdness :), we do that to make sure any OOB
> test you run you always get the best performance
> and we will guarantee to always use close numa cores.

Well, I wish I had known that before :( I got to a point where I
started to seriously doubt the mathematical strength of xor/Toeplitz
hashing :)

I'm sure you ran plenty of performance tests, but in my experience
application locality makes much more difference than device locality,
especially when the application needs to touch the data...

> we open RX rings on all of the cores in case if the user want to
> change the RSS table to point to the whole thing on the fly "ethtool
> -X"

That is very counter-intuitive AFAICT; is it documented anywhere?

Users might rely on the (absolutely reasonable) assumption that if a
NIC exposes X RX rings, RX hashing spreads across all of them and not
just a subset.

> But we are willing to change that, Tariq can provide the patch,
> without changing this mlx5e is broken.

What patch?  To modify the RSS spread?  What exactly is broken?

So I'm not sure how to move forward here; should we modify the
indirection table construction to not rely on the NUMA-local affinity
mappings?

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2a3c59e55dcf..ebfda1eae6b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1565,11 +1565,6 @@  static void mlx5e_close_cq(struct mlx5e_cq *cq)
 	mlx5e_free_cq(cq);
 }
 
-static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
-{
-	return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
-}
-
 static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
 			     struct mlx5e_params *params,
 			     struct mlx5e_channel_param *cparam)
@@ -1718,11 +1713,11 @@  static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 {
 	struct mlx5e_cq_moder icocq_moder = {0, 0};
 	struct net_device *netdev = priv->netdev;
-	int cpu = mlx5e_get_cpu(priv, ix);
 	struct mlx5e_channel *c;
 	int err;
 
-	c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
+	c = kzalloc_node(sizeof(*c), GFP_KERNEL,
+		pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix));
 	if (!c)
 		return -ENOMEM;
 
@@ -1730,7 +1725,8 @@  static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->mdev     = priv->mdev;
 	c->tstamp   = &priv->tstamp;
 	c->ix       = ix;
-	c->cpu      = cpu;
+	c->cpu      = cpumask_first(pci_irq_get_affinity(priv->mdev->pdev,
+			MLX5_EQ_VEC_COMP_BASE + ix));
 	c->pdev     = &priv->mdev->pdev->dev;
 	c->netdev   = priv->netdev;
 	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index e4431aacce9d..7b9e7301929b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -312,6 +312,9 @@  static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_eq_table *table = &priv->eq_table;
+	struct irq_affinity irqdesc = {
+		.pre_vectors = MLX5_EQ_VEC_COMP_BASE,
+	};
 	int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int nvec;
 
@@ -325,9 +328,10 @@  static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 	if (!priv->irq_info)
 		goto err_free_msix;
 
-	nvec = pci_alloc_irq_vectors(dev->pdev,
+	nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
 			MLX5_EQ_VEC_COMP_BASE + 1, nvec,
-			PCI_IRQ_MSIX);
+			PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
+			&irqdesc);
 	if (nvec < 0)
 		return nvec;
 
@@ -600,71 +604,6 @@  u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
 	return (u64)timer_l | (u64)timer_h1 << 32;
 }
 
-static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
-{
-	struct mlx5_priv *priv  = &mdev->priv;
-	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
-	int err;
-
-	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
-		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
-		return -ENOMEM;
-	}
-
-	cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
-			priv->irq_info[i].mask);
-
-	err = irq_set_affinity_hint(irq, priv->irq_info[i].mask);
-	if (err) {
-		mlx5_core_warn(mdev, "irq_set_affinity_hint failed,irq 0x%.4x",
-			       irq);
-		goto err_clear_mask;
-	}
-
-	return 0;
-
-err_clear_mask:
-	free_cpumask_var(priv->irq_info[i].mask);
-	return err;
-}
-
-static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
-{
-	struct mlx5_priv *priv  = &mdev->priv;
-	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
-
-	irq_set_affinity_hint(irq, NULL);
-	free_cpumask_var(priv->irq_info[i].mask);
-}
-
-static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
-{
-	int err;
-	int i;
-
-	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
-		err = mlx5_irq_set_affinity_hint(mdev, i);
-		if (err)
-			goto err_out;
-	}
-
-	return 0;
-
-err_out:
-	for (i--; i >= 0; i--)
-		mlx5_irq_clear_affinity_hint(mdev, i);
-
-	return err;
-}
-
-static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
-{
-	int i;
-
-	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
-		mlx5_irq_clear_affinity_hint(mdev, i);
-}
-
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 		    unsigned int *irqn)
 {
@@ -1116,12 +1055,6 @@  static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_stop_eqs;
 	}
 
-	err = mlx5_irq_set_affinity_hints(dev);
-	if (err) {
-		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-		goto err_affinity_hints;
-	}
-
 	err = mlx5_init_fs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to init flow steering\n");
@@ -1165,9 +1098,6 @@  static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_cleanup_fs(dev);
 
 err_fs:
-	mlx5_irq_clear_affinity_hints(dev);
-
-err_affinity_hints:
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -1234,7 +1164,6 @@  static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_eswitch_detach(dev->priv.eswitch);
 #endif
 	mlx5_cleanup_fs(dev);
-	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_put_uars_page(dev, priv->uar);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4843fab18b83..963e3d59d740 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -527,7 +527,6 @@  struct mlx5_core_sriov {
 };
 
 struct mlx5_irq_info {
-	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
 };