diff mbox series

[net,3/3] net-sysfs: move the xps cpus/rxqs retrieval in a common function

Message ID 20210106180428.722521-4-atenart@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net-sysfs: move the xps cpus/rxqs retrieval in a common function | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net
netdev/subject_prefix success Link
netdev/cc_maintainers warning 5 maintainers not CCed: andrew@lunn.ch alexanderduyck@fb.com christian.brauner@ubuntu.com edumazet@google.com abelits@marvell.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 221 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Antoine Tenart Jan. 6, 2021, 6:04 p.m. UTC
Most of the xps_cpus_show and xps_rxqs_show functions share the same
logic. Having it in two different functions does not help maintenance
and we can already see small implementation differences. This should not
be the case and this patch moves their common logic into a new function,
xps_queue_show, to improve maintenance.

While the rtnl lock could be held in the new xps_queue_show, it is still
held in xps_cpus_show and xps_rxqs_show as this is an important
information when looking at those two functions. This does not add
complexity.

Signed-off-by: Antoine Tenart <atenart@kernel.org>
---
 net/core/net-sysfs.c | 168 ++++++++++++++++++++-----------------------
 1 file changed, 79 insertions(+), 89 deletions(-)

Comments

Alexander Duyck Jan. 6, 2021, 7:54 p.m. UTC | #1
On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
>
> Most of the xps_cpus_show and xps_rxqs_show functions share the same
> logic. Having it in two different functions does not help maintenance
> and we can already see small implementation differences. This should not
> be the case and this patch moves their common logic into a new function,
> xps_queue_show, to improve maintenance.
>
> While the rtnl lock could be held in the new xps_queue_show, it is still
> held in xps_cpus_show and xps_rxqs_show as this is an important
> information when looking at those two functions. This does not add
> complexity.
>
> Signed-off-by: Antoine Tenart <atenart@kernel.org>
> ---
>  net/core/net-sysfs.c | 168 ++++++++++++++++++++-----------------------
>  1 file changed, 79 insertions(+), 89 deletions(-)
>
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 5a39e9b38e5f..6e6bc05181f6 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -1314,77 +1314,98 @@ static const struct attribute_group dql_group = {
>  #endif /* CONFIG_BQL */
>
>  #ifdef CONFIG_XPS
> -static ssize_t xps_cpus_show(struct netdev_queue *queue,
> -                            char *buf)
> +/* Should be called with the rtnl lock held. */
> +static int xps_queue_show(struct net_device *dev, unsigned long **mask,
> +                         unsigned int index, bool is_rxqs_map)

Why pass dev and index instead of just the queue which already
contains both? I think it would make more sense to just stick to
passing the queue through along with a pointer to the xps_dev_maps
value that we need to read.

>  {
> -       int cpu, len, ret, num_tc = 1, tc = 0;
> -       struct net_device *dev = queue->dev;
> +       const unsigned long *possible_mask = NULL;
> +       int j, num_tc = 0, tc = 0, ret = 0;
>         struct xps_dev_maps *dev_maps;
> -       unsigned long *mask;
> -       unsigned int index;
> -
> -       if (!netif_is_multiqueue(dev))
> -               return -ENOENT;
> -
> -       index = get_netdev_queue_index(queue);
> -
> -       if (!rtnl_trylock())
> -               return restart_syscall();
> +       unsigned int nr_ids;
>
>         if (dev->num_tc) {
>                 /* Do not allow XPS on subordinate device directly */
>                 num_tc = dev->num_tc;
> -               if (num_tc < 0) {
> -                       ret = -EINVAL;
> -                       goto err_rtnl_unlock;
> -               }
> +               if (num_tc < 0)
> +                       return -EINVAL;
>
>                 /* If queue belongs to subordinate dev use its map */
>                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
>
>                 tc = netdev_txq_to_tc(dev, index);
> -               if (tc < 0) {
> -                       ret = -EINVAL;
> -                       goto err_rtnl_unlock;
> -               }
> +               if (tc < 0)
> +                       return -EINVAL;
>         }
>

So if we store the num_tc and nr_ids in the dev_maps structure then we
could simplify this a bit by pulling the num_tc info out of the
dev_map and only asking the Tx queue for the tc in that case and
validating it against (tc <0 || num_tc <= tc) and returning an error
if either are true.

This would also allow us to address the fact that the rxqs feature
doesn't support the subordinate devices as you could pull out the bit
above related to the sb_dev and instead call that prior to calling
xps_queue_show so that you are operating on the correct device map.

> -       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> -       if (!mask) {
> -               ret = -ENOMEM;
> -               goto err_rtnl_unlock;
> +       rcu_read_lock();
> +
> +       if (is_rxqs_map) {
> +               dev_maps = rcu_dereference(dev->xps_rxqs_map);
> +               nr_ids = dev->num_rx_queues;
> +       } else {
> +               dev_maps = rcu_dereference(dev->xps_cpus_map);
> +               nr_ids = nr_cpu_ids;
> +               if (num_possible_cpus() > 1)
> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>         }

I think Jakub had mentioned earlier the idea of possibly moving some
fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
complexity of this so that certain values would be protected by the
RCU lock.

This might be a good time to look at encoding things like the number
of IDs and the number of TCs there in order to avoid a bunch of this
duplication. Then you could just pass a pointer to the map you want to
display and the code should be able to just dump the values.:

> +       if (!dev_maps)
> +               goto rcu_unlock;
>
> -       rcu_read_lock();
> -       dev_maps = rcu_dereference(dev->xps_cpus_map);
> -       if (dev_maps) {
> -               for_each_possible_cpu(cpu) {
> -                       int i, tci = cpu * num_tc + tc;
> -                       struct xps_map *map;
> -
> -                       map = rcu_dereference(dev_maps->attr_map[tci]);
> -                       if (!map)
> -                               continue;
> -
> -                       for (i = map->len; i--;) {
> -                               if (map->queues[i] == index) {
> -                                       set_bit(cpu, mask);
> -                                       break;
> -                               }
> +       for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
> +            j < nr_ids;) {
> +               int i, tci = j * num_tc + tc;
> +               struct xps_map *map;
> +
> +               map = rcu_dereference(dev_maps->attr_map[tci]);
> +               if (!map)
> +                       continue;
> +
> +               for (i = map->len; i--;) {
> +                       if (map->queues[i] == index) {
> +                               set_bit(j, *mask);
> +                               break;
>                         }
>                 }
>         }
> +
> +rcu_unlock:
>         rcu_read_unlock();
>
> +       return ret;
> +}
> +
> +static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
> +{
> +       struct net_device *dev = queue->dev;
> +       unsigned long *mask;
> +       unsigned int index;
> +       int len, ret;
> +
> +       if (!netif_is_multiqueue(dev))
> +               return -ENOENT;
> +
> +       index = get_netdev_queue_index(queue);
> +
> +       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> +       if (!mask)
> +               return -ENOMEM;
> +
> +       if (!rtnl_trylock()) {
> +               bitmap_free(mask);
> +               return restart_syscall();
> +       }
> +
> +       ret = xps_queue_show(dev, &mask, index, false);
>         rtnl_unlock();
>
> +       if (ret) {
> +               bitmap_free(mask);
> +               return ret;
> +       }
> +
>         len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids);
>         bitmap_free(mask);
>         return len < PAGE_SIZE ? len : -EINVAL;
> -
> -err_rtnl_unlock:
> -       rtnl_unlock();
> -       return ret;
>  }
>
>  static ssize_t xps_cpus_store(struct netdev_queue *queue,
> @@ -1430,65 +1451,34 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
>
>  static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
>  {
> -       int j, len, ret, num_tc = 1, tc = 0;
>         struct net_device *dev = queue->dev;
> -       struct xps_dev_maps *dev_maps;
>         unsigned long *mask;
>         unsigned int index;
> +       int len, ret;
>
>         index = get_netdev_queue_index(queue);
>
> -       if (!rtnl_trylock())
> -               return restart_syscall();
> -
> -       if (dev->num_tc) {
> -               num_tc = dev->num_tc;
> -               tc = netdev_txq_to_tc(dev, index);
> -               if (tc < 0) {
> -                       ret = -EINVAL;
> -                       goto err_rtnl_unlock;
> -               }
> -       }
>         mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
> -       if (!mask) {
> -               ret = -ENOMEM;
> -               goto err_rtnl_unlock;
> -       }
> -
> -       rcu_read_lock();
> -       dev_maps = rcu_dereference(dev->xps_rxqs_map);
> -       if (!dev_maps)
> -               goto out_no_maps;
> -
> -       for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
> -            j < dev->num_rx_queues;) {
> -               int i, tci = j * num_tc + tc;
> -               struct xps_map *map;
> -
> -               map = rcu_dereference(dev_maps->attr_map[tci]);
> -               if (!map)
> -                       continue;
> +       if (!mask)
> +               return -ENOMEM;
>
> -               for (i = map->len; i--;) {
> -                       if (map->queues[i] == index) {
> -                               set_bit(j, mask);
> -                               break;
> -                       }
> -               }
> +       if (!rtnl_trylock()) {
> +               bitmap_free(mask);
> +               return restart_syscall();
>         }
> -out_no_maps:
> -       rcu_read_unlock();
>
> +       ret = xps_queue_show(dev, &mask, index, true);
>         rtnl_unlock();
>
> +       if (ret) {
> +               bitmap_free(mask);
> +               return ret;
> +       }
> +
>         len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
>         bitmap_free(mask);
>
>         return len < PAGE_SIZE ? len : -EINVAL;
> -
> -err_rtnl_unlock:
> -       rtnl_unlock();
> -       return ret;
>  }
>
>  static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
> --
> 2.29.2
>
Antoine Tenart Jan. 7, 2021, 8:54 a.m. UTC | #2
Quoting Alexander Duyck (2021-01-06 20:54:11)
> On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > +/* Should be called with the rtnl lock held. */
> > +static int xps_queue_show(struct net_device *dev, unsigned long **mask,
> > +                         unsigned int index, bool is_rxqs_map)
> 
> Why pass dev and index instead of just the queue which already
> contains both?

Right, I can do that.

> I think it would make more sense to just stick to passing the queue
> through along with a pointer to the xps_dev_maps value that we need to
> read.

That would require to hold rcu_read_lock in the caller and I'd like to
keep it in that function.

> >         if (dev->num_tc) {
> >                 /* Do not allow XPS on subordinate device directly */
> >                 num_tc = dev->num_tc;
> > -               if (num_tc < 0) {
> > -                       ret = -EINVAL;
> > -                       goto err_rtnl_unlock;
> > -               }
> > +               if (num_tc < 0)
> > +                       return -EINVAL;
> >
> >                 /* If queue belongs to subordinate dev use its map */
> >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> >
> >                 tc = netdev_txq_to_tc(dev, index);
> > -               if (tc < 0) {
> > -                       ret = -EINVAL;
> > -                       goto err_rtnl_unlock;
> > -               }
> > +               if (tc < 0)
> > +                       return -EINVAL;
> >         }
> >
> 
> So if we store the num_tc and nr_ids in the dev_maps structure then we
> could simplify this a bit by pulling the num_tc info out of the
> dev_map and only asking the Tx queue for the tc in that case and
> validating it against (tc <0 || num_tc <= tc) and returning an error
> if either are true.
> 
> This would also allow us to address the fact that the rxqs feature
> doesn't support the subordinate devices as you could pull out the bit
> above related to the sb_dev and instead call that prior to calling
> xps_queue_show so that you are operating on the correct device map.
> 
> > -       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > -       if (!mask) {
> > -               ret = -ENOMEM;
> > -               goto err_rtnl_unlock;
> > +       rcu_read_lock();
> > +
> > +       if (is_rxqs_map) {
> > +               dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > +               nr_ids = dev->num_rx_queues;
> > +       } else {
> > +               dev_maps = rcu_dereference(dev->xps_cpus_map);
> > +               nr_ids = nr_cpu_ids;
> > +               if (num_possible_cpus() > 1)
> > +                       possible_mask = cpumask_bits(cpu_possible_mask);
> >         }
> 
> I think Jakub had mentioned earlier the idea of possibly moving some
> fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> complexity of this so that certain values would be protected by the
> RCU lock.
> 
> This might be a good time to look at encoding things like the number
> of IDs and the number of TCs there in order to avoid a bunch of this
> duplication. Then you could just pass a pointer to the map you want to
> display and the code should be able to just dump the values.:

100% agree to all the above. That would also prevent from making out of
bound accesses when dev->num_tc is increased after dev_maps is
allocated. I do have a series ready to be send storing num_tc into the
maps, and reworking code to use it instead of dev->num_tc. The series
also adds checks to ensure the map is valid when we access it (such as
making sure dev->num_tc == map->num_tc). I however did not move nr_ids
into the map yet, but I'll look into it.

The idea is to send it as a follow up series, as this one is only moving
code around to improve maintenance and readability. Even if all the
patches were in the same series that would be a prerequisite.

Thanks!
Antoine
Alexander Duyck Jan. 7, 2021, 4:38 p.m. UTC | #3
On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
>
> Quoting Alexander Duyck (2021-01-06 20:54:11)
> > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > +/* Should be called with the rtnl lock held. */
> > > +static int xps_queue_show(struct net_device *dev, unsigned long **mask,
> > > +                         unsigned int index, bool is_rxqs_map)
> >
> > Why pass dev and index instead of just the queue which already
> > contains both?
>
> Right, I can do that.

Actually I have to backtrack on that a bit. More on that below.

> > I think it would make more sense to just stick to passing the queue
> > through along with a pointer to the xps_dev_maps value that we need to
> > read.
>
> That would require to hold rcu_read_lock in the caller and I'd like to
> keep it in that function.

Actually you could probably make it work if you were to pass a pointer
to the RCU pointer.

> > >         if (dev->num_tc) {
> > >                 /* Do not allow XPS on subordinate device directly */
> > >                 num_tc = dev->num_tc;
> > > -               if (num_tc < 0) {
> > > -                       ret = -EINVAL;
> > > -                       goto err_rtnl_unlock;
> > > -               }
> > > +               if (num_tc < 0)
> > > +                       return -EINVAL;
> > >
> > >                 /* If queue belongs to subordinate dev use its map */
> > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > >
> > >                 tc = netdev_txq_to_tc(dev, index);
> > > -               if (tc < 0) {
> > > -                       ret = -EINVAL;
> > > -                       goto err_rtnl_unlock;
> > > -               }
> > > +               if (tc < 0)
> > > +                       return -EINVAL;
> > >         }
> > >
> >
> > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > could simplify this a bit by pulling the num_tc info out of the
> > dev_map and only asking the Tx queue for the tc in that case and
> > validating it against (tc <0 || num_tc <= tc) and returning an error
> > if either are true.
> >
> > This would also allow us to address the fact that the rxqs feature
> > doesn't support the subordinate devices as you could pull out the bit
> > above related to the sb_dev and instead call that prior to calling
> > xps_queue_show so that you are operating on the correct device map.

It probably would be necessary to pass dev and index if we pull the
netdev_get_tx_queue()->sb_dev bit out and performed that before we
called the xps_queue_show function. Specifically as the subordinate
device wouldn't match up with the queue device so we would be better
off pulling it out first.

> >
> > > -       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > > -       if (!mask) {
> > > -               ret = -ENOMEM;
> > > -               goto err_rtnl_unlock;
> > > +       rcu_read_lock();
> > > +
> > > +       if (is_rxqs_map) {
> > > +               dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > > +               nr_ids = dev->num_rx_queues;
> > > +       } else {
> > > +               dev_maps = rcu_dereference(dev->xps_cpus_map);
> > > +               nr_ids = nr_cpu_ids;
> > > +               if (num_possible_cpus() > 1)
> > > +                       possible_mask = cpumask_bits(cpu_possible_mask);
> > >         }
> >

I don't think we need the possible_mask check. That is mostly just an
optimization that was making use of an existing "for_each" loop macro.
If we are going to go through 0 through nr_ids then there is no need
for the possible_mask as we can just brute force our way through and
will not find CPU that aren't there since we couldn't have added them
to the map anyway.

> > I think Jakub had mentioned earlier the idea of possibly moving some
> > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > complexity of this so that certain values would be protected by the
> > RCU lock.
> >
> > This might be a good time to look at encoding things like the number
> > of IDs and the number of TCs there in order to avoid a bunch of this
> > duplication. Then you could just pass a pointer to the map you want to
> > display and the code should be able to just dump the values.:
>
> 100% agree to all the above. That would also prevent from making out of
> bound accesses when dev->num_tc is increased after dev_maps is
> allocated. I do have a series ready to be send storing num_tc into the
> maps, and reworking code to use it instead of dev->num_tc. The series
> also adds checks to ensure the map is valid when we access it (such as
> making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> into the map yet, but I'll look into it.
>
> The idea is to send it as a follow up series, as this one is only moving
> code around to improve maintenance and readability. Even if all the
> patches were in the same series that would be a prerequisite.
>
> Thanks!
> Antoine

Okay, so if we are going to do it as a follow-up that is fine I
suppose, but one of the reasons I brought it up is that it would help
this patch set in terms of readability/maintainability. An additional
change we could look at making would be to create an xps_map pointer
array instead of having individual pointers. Then you could simply be
passing an index into the array to indicate if we are accessing the
CPUs or the RXQs map.
Antoine Tenart Jan. 8, 2021, 9:07 a.m. UTC | #4
Quoting Alexander Duyck (2021-01-07 17:38:35)
> On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
> >
> > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> >
> > That would require to hold rcu_read_lock in the caller and I'd like to
> > keep it in that function.
> 
> Actually you could probably make it work if you were to pass a pointer
> to the RCU pointer.

That should work but IMHO that could be easily breakable by future
changes as it's a bit tricky.

> > > >         if (dev->num_tc) {
> > > >                 /* Do not allow XPS on subordinate device directly */
> > > >                 num_tc = dev->num_tc;
> > > > -               if (num_tc < 0) {
> > > > -                       ret = -EINVAL;
> > > > -                       goto err_rtnl_unlock;
> > > > -               }
> > > > +               if (num_tc < 0)
> > > > +                       return -EINVAL;
> > > >
> > > >                 /* If queue belongs to subordinate dev use its map */
> > > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > >
> > > >                 tc = netdev_txq_to_tc(dev, index);
> > > > -               if (tc < 0) {
> > > > -                       ret = -EINVAL;
> > > > -                       goto err_rtnl_unlock;
> > > > -               }
> > > > +               if (tc < 0)
> > > > +                       return -EINVAL;
> > > >         }
> > > >
> > >
> > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > could simplify this a bit by pulling the num_tc info out of the
> > > dev_map and only asking the Tx queue for the tc in that case and
> > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > if either are true.
> > >
> > > This would also allow us to address the fact that the rxqs feature
> > > doesn't support the subordinate devices as you could pull out the bit
> > > above related to the sb_dev and instead call that prior to calling
> > > xps_queue_show so that you are operating on the correct device map.
> 
> It probably would be necessary to pass dev and index if we pull the
> netdev_get_tx_queue()->sb_dev bit out and performed that before we
> called the xps_queue_show function. Specifically as the subordinate
> device wouldn't match up with the queue device so we would be better
> off pulling it out first.

While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
xps_queue_show seems like a good idea for consistency, I'm not sure
it'll work: dev->num_tc is not only used to retrieve the number of tc
but also as a condition on not being 0. We have things like:

  // Always done with the original dev.
  if (dev->num_tc) {

      [...]

      // Can be a subordinate dev.
      tc = netdev_txq_to_tc(dev, index);
  }

And after moving num_tc in the map, we'll have checks like:

  if (dev_maps->num_tc != dev->num_tc)
      return -EINVAL;

I don't think the subordinate dev holds the same num_tc value as dev.
What's your take on this?

> > > > -       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > > > -       if (!mask) {
> > > > -               ret = -ENOMEM;
> > > > -               goto err_rtnl_unlock;
> > > > +       rcu_read_lock();
> > > > +
> > > > +       if (is_rxqs_map) {
> > > > +               dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > > > +               nr_ids = dev->num_rx_queues;
> > > > +       } else {
> > > > +               dev_maps = rcu_dereference(dev->xps_cpus_map);
> > > > +               nr_ids = nr_cpu_ids;
> > > > +               if (num_possible_cpus() > 1)
> > > > +                       possible_mask = cpumask_bits(cpu_possible_mask);
> > > >         }
> > >
> 
> I don't think we need the possible_mask check. That is mostly just an
> optimization that was making use of an existing "for_each" loop macro.
> If we are going to go through 0 through nr_ids then there is no need
> for the possible_mask as we can just brute force our way through and
> will not find CPU that aren't there since we couldn't have added them
> to the map anyway.

I'll remove it then. __netif_set_xps_queue could also benefit from it.

> > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > complexity of this so that certain values would be protected by the
> > > RCU lock.
> > >
> > > This might be a good time to look at encoding things like the number
> > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > duplication. Then you could just pass a pointer to the map you want to
> > > display and the code should be able to just dump the values.:
> >
> > 100% agree to all the above. That would also prevent from making out of
> > bound accesses when dev->num_tc is increased after dev_maps is
> > allocated. I do have a series ready to be send storing num_tc into the
> > maps, and reworking code to use it instead of dev->num_tc. The series
> > also adds checks to ensure the map is valid when we access it (such as
> > making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> > into the map yet, but I'll look into it.
> >
> > The idea is to send it as a follow up series, as this one is only moving
> > code around to improve maintenance and readability. Even if all the
> > patches were in the same series that would be a prerequisite.
> 
> Okay, so if we are going to do it as a follow-up that is fine I
> suppose, but one of the reasons I brought it up is that it would help
> this patch set in terms of readability/maintainability. An additional
> change we could look at making would be to create an xps_map pointer
> array instead of having individual pointers. Then you could simply be
> passing an index into the array to indicate if we are accessing the
> CPUs or the RXQs map.

Merging the two maps and embedding an offset in the struct? With the
upcoming changes embedding information in the map themselves we should
have a single check to know what map to use. Using a single array with
offsets would not improve that. Also maps maintenance when num_tc
is updated would need to take care of both maps, having side effects
such as removing the old rxqs map when allocating the cpus one (or the
opposite).

Thanks,
Antoine
Alexander Duyck Jan. 8, 2021, 4:33 p.m. UTC | #5
On Fri, Jan 8, 2021 at 1:07 AM Antoine Tenart <atenart@kernel.org> wrote:
>
> Quoting Alexander Duyck (2021-01-07 17:38:35)
> > On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
> > >
> > > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > >
> > > That would require to hold rcu_read_lock in the caller and I'd like to
> > > keep it in that function.
> >
> > Actually you could probably make it work if you were to pass a pointer
> > to the RCU pointer.
>
> That should work but IMHO that could be easily breakable by future
> changes as it's a bit tricky.
>
> > > > >         if (dev->num_tc) {
> > > > >                 /* Do not allow XPS on subordinate device directly */
> > > > >                 num_tc = dev->num_tc;
> > > > > -               if (num_tc < 0) {
> > > > > -                       ret = -EINVAL;
> > > > > -                       goto err_rtnl_unlock;
> > > > > -               }
> > > > > +               if (num_tc < 0)
> > > > > +                       return -EINVAL;
> > > > >
> > > > >                 /* If queue belongs to subordinate dev use its map */
> > > > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > > >
> > > > >                 tc = netdev_txq_to_tc(dev, index);
> > > > > -               if (tc < 0) {
> > > > > -                       ret = -EINVAL;
> > > > > -                       goto err_rtnl_unlock;
> > > > > -               }
> > > > > +               if (tc < 0)
> > > > > +                       return -EINVAL;
> > > > >         }
> > > > >
> > > >
> > > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > > could simplify this a bit by pulling the num_tc info out of the
> > > > dev_map and only asking the Tx queue for the tc in that case and
> > > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > > if either are true.
> > > >
> > > > This would also allow us to address the fact that the rxqs feature
> > > > doesn't support the subordinate devices as you could pull out the bit
> > > > above related to the sb_dev and instead call that prior to calling
> > > > xps_queue_show so that you are operating on the correct device map.
> >
> > It probably would be necessary to pass dev and index if we pull the
> > netdev_get_tx_queue()->sb_dev bit out and performed that before we
> > called the xps_queue_show function. Specifically as the subordinate
> > device wouldn't match up with the queue device so we would be better
> > off pulling it out first.
>
> While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
> xps_queue_show seems like a good idea for consistency, I'm not sure
> it'll work: dev->num_tc is not only used to retrieve the number of tc
> but also as a condition on not being 0. We have things like:
>
>   // Always done with the original dev.
>   if (dev->num_tc) {
>
>       [...]
>
>       // Can be a subordinate dev.
>       tc = netdev_txq_to_tc(dev, index);
>   }
>
> And after moving num_tc in the map, we'll have checks like:
>
>   if (dev_maps->num_tc != dev->num_tc)
>       return -EINVAL;

We shouldn't. That defeats the whole point and you will never be able
to rely on the num_tc in the device to remain constant. If we are
moving the value to an RCU accessible attribute we should just be
using that value. We can only use that check if we are in an rtnl_lock
anyway and we won't need that if we are just displaying the value.

The checks should only be used to verify the tc of the queue is within
the bounds of the num_tc of the xps_map.

> I don't think the subordinate dev holds the same num_tc value as dev.
> What's your take on this?

So if I recall the num_tc for the subordinate device would be negative
since the subordinate devices start at -1 and just further decrease.
That is yet another reason why we shouldn't be looking at the num_tc
of the device.

> > > > > -       mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > > > > -       if (!mask) {
> > > > > -               ret = -ENOMEM;
> > > > > -               goto err_rtnl_unlock;
> > > > > +       rcu_read_lock();
> > > > > +
> > > > > +       if (is_rxqs_map) {
> > > > > +               dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > > > > +               nr_ids = dev->num_rx_queues;
> > > > > +       } else {
> > > > > +               dev_maps = rcu_dereference(dev->xps_cpus_map);
> > > > > +               nr_ids = nr_cpu_ids;
> > > > > +               if (num_possible_cpus() > 1)
> > > > > +                       possible_mask = cpumask_bits(cpu_possible_mask);
> > > > >         }
> > > >
> >
> > I don't think we need the possible_mask check. That is mostly just an
> > optimization that was making use of an existing "for_each" loop macro.
> > If we are going to go through 0 through nr_ids then there is no need
> > for the possible_mask as we can just brute force our way through and
> > will not find CPU that aren't there since we couldn't have added them
> > to the map anyway.
>
> I'll remove it then. __netif_set_xps_queue could also benefit from it.

Sounds good.

> > > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > > complexity of this so that certain values would be protected by the
> > > > RCU lock.
> > > >
> > > > This might be a good time to look at encoding things like the number
> > > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > > duplication. Then you could just pass a pointer to the map you want to
> > > > display and the code should be able to just dump the values.:
> > >
> > > 100% agree to all the above. That would also prevent from making out of
> > > bound accesses when dev->num_tc is increased after dev_maps is
> > > allocated. I do have a series ready to be send storing num_tc into the
> > > maps, and reworking code to use it instead of dev->num_tc. The series
> > > also adds checks to ensure the map is valid when we access it (such as
> > > making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> > > into the map yet, but I'll look into it.
> > >
> > > The idea is to send it as a follow up series, as this one is only moving
> > > code around to improve maintenance and readability. Even if all the
> > > patches were in the same series that would be a prerequisite.
> >
> > Okay, so if we are going to do it as a follow-up that is fine I
> > suppose, but one of the reasons I brought it up is that it would help
> > this patch set in terms of readability/maintainability. An additional
> > change we could look at making would be to create an xps_map pointer
> > array instead of having individual pointers. Then you could simply be
> > passing an index into the array to indicate if we are accessing the
> > CPUs or the RXQs map.
>
> Merging the two maps and embedding an offset in the struct? With the
> upcoming changes embedding information in the map themselves we should
> have a single check to know what map to use. Using a single array with
> offsets would not improve that. Also maps maintenance when num_tc
> is updated would need to take care of both maps, having side effects
> such as removing the old rxqs map when allocating the cpus one (or the
> opposite).
>
> Thanks,
> Antoine

Sorry, I didn't mean to merge the two maps. Just go from two pointers
to an array containing two pointers. Right now them sitting right next
to each other it becomes pretty easy to just convert them so that
instead of accessing them as:

dev->xps_rxqs_map
dev->xps_cpus_map

You could instead access them as:
dev->xps_map[XPS_RXQ];
dev->xps_map[XPS_CPU];

Then instead of all the if/else logic we have in the code you just are
passing the index of the xps_map you want to access and we have the
nr_ids and num_tc all encoded in the map so the code itself. Then for
displaying we are just using the fields from the map to validate.

We will still end up needing to take the rtnl_lock for the
__netif_set_xps_queue case, however that should be the only case where
we really need it as we have to re-read dev->num_tc and the
netdev_txq_to_tc and guarantee they don't change while we are
programming the map.

That reminds me we may want to add an ASSERT_RTNL to the start of
__netif_set_xps_queue and a comment indicating that we need to hold
the rtnl lock to guarantee that num_tc and the Tx queue to TC mapping
cannot change while we are programming the value into the map.
Antoine Tenart Jan. 8, 2021, 6:58 p.m. UTC | #6
Quoting Alexander Duyck (2021-01-08 17:33:01)
> On Fri, Jan 8, 2021 at 1:07 AM Antoine Tenart <atenart@kernel.org> wrote:
> >
> > Quoting Alexander Duyck (2021-01-07 17:38:35)
> > > On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > >
> > > > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > >
> > > > That would require to hold rcu_read_lock in the caller and I'd like to
> > > > keep it in that function.
> > >
> > > Actually you could probably make it work if you were to pass a pointer
> > > to the RCU pointer.
> >
> > That should work but IMHO that could be easily breakable by future
> > changes as it's a bit tricky.
> >
> > > > > >         if (dev->num_tc) {
> > > > > >                 /* Do not allow XPS on subordinate device directly */
> > > > > >                 num_tc = dev->num_tc;
> > > > > > -               if (num_tc < 0) {
> > > > > > -                       ret = -EINVAL;
> > > > > > -                       goto err_rtnl_unlock;
> > > > > > -               }
> > > > > > +               if (num_tc < 0)
> > > > > > +                       return -EINVAL;
> > > > > >
> > > > > >                 /* If queue belongs to subordinate dev use its map */
> > > > > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > > > >
> > > > > >                 tc = netdev_txq_to_tc(dev, index);
> > > > > > -               if (tc < 0) {
> > > > > > -                       ret = -EINVAL;
> > > > > > -                       goto err_rtnl_unlock;
> > > > > > -               }
> > > > > > +               if (tc < 0)
> > > > > > +                       return -EINVAL;
> > > > > >         }
> > > > > >
> > > > >
> > > > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > > > could simplify this a bit by pulling the num_tc info out of the
> > > > > dev_map and only asking the Tx queue for the tc in that case and
> > > > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > > > if either are true.
> > > > >
> > > > > This would also allow us to address the fact that the rxqs feature
> > > > > doesn't support the subordinate devices as you could pull out the bit
> > > > > above related to the sb_dev and instead call that prior to calling
> > > > > xps_queue_show so that you are operating on the correct device map.
> > >
> > > It probably would be necessary to pass dev and index if we pull the
> > > netdev_get_tx_queue()->sb_dev bit out and performed that before we
> > > called the xps_queue_show function. Specifically as the subordinate
> > > device wouldn't match up with the queue device so we would be better
> > > off pulling it out first.
> >
> > While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
> > xps_queue_show seems like a good idea for consistency, I'm not sure
> > it'll work: dev->num_tc is not only used to retrieve the number of tc
> > but also as a condition on not being 0. We have things like:
> >
> >   // Always done with the original dev.
> >   if (dev->num_tc) {
> >
> >       [...]
> >
> >       // Can be a subordinate dev.
> >       tc = netdev_txq_to_tc(dev, index);
> >   }
> >
> > And after moving num_tc in the map, we'll have checks like:
> >
> >   if (dev_maps->num_tc != dev->num_tc)
> >       return -EINVAL;
> 
> We shouldn't. That defeats the whole point and you will never be able
> to rely on the num_tc in the device to remain constant. If we are
> moving the value to an RCU accessible attribute we should just be
> using that value. We can only use that check if we are in an rtnl_lock
> anyway and we won't need that if we are just displaying the value.
> 
> The checks should only be used to verify the tc of the queue is within
> the bounds of the num_tc of the xps_map.

Right. So that would mean we have to choose between:

- Removing the rtnl lock but with the understanding that the value we
  get when reading the maps might be invalid and not make sense with
  the current dev->num_tc configuration.

- Keeping the rtnl lock (which, I mean, I'd like to remove).

My first goal for embedding num_tc in the maps was to prevent accessing
the maps out-of-bound when dev->num_tc is updated after the maps are
allocated. That's a possibility (I could produce such behaviour with
KASAN enabled) even when taking the rtnl lock in the show/store helpers.

We're now talking of also removing the rtnl lock, which is fine, I just
want to make those two different goals explicit as they're not
interdependent.

> > > > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > > > complexity of this so that certain values would be protected by the
> > > > > RCU lock.
> > > > >
> > > > > This might be a good time to look at encoding things like the number
> > > > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > > > duplication. Then you could just pass a pointer to the map you want to
> > > > > display and the code should be able to just dump the values.:
> > > >
> > > > 100% agree to all the above. That would also prevent from making out of
> > > > bound accesses when dev->num_tc is increased after dev_maps is
> > > > allocated. I do have a series ready to be send storing num_tc into the
> > > > maps, and reworking code to use it instead of dev->num_tc. The series
> > > > also adds checks to ensure the map is valid when we access it (such as
> > > > making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> > > > into the map yet, but I'll look into it.
> > > >
> > > > The idea is to send it as a follow up series, as this one is only moving
> > > > code around to improve maintenance and readability. Even if all the
> > > > patches were in the same series that would be a prerequisite.
> > >
> > > Okay, so if we are going to do it as a follow-up that is fine I
> > > suppose, but one of the reasons I brought it up is that it would help
> > > this patch set in terms of readability/maintainability. An additional
> > > change we could look at making would be to create an xps_map pointer
> > > array instead of having individual pointers. Then you could simply be
> > > passing an index into the array to indicate if we are accessing the
> > > CPUs or the RXQs map.
> >
> > Merging the two maps and embedding an offset in the struct? With the
> > upcoming changes embedding information in the map themselves we should
> > have a single check to know what map to use. Using a single array with
> > offsets would not improve that. Also maps maintenance when num_tc
> > is updated would need to take care of both maps, having side effects
> > such as removing the old rxqs map when allocating the cpus one (or the
> > opposite).
> 
> Sorry, I didn't mean to merge the two maps. Just go from two pointers
> to an array containing two pointers. Right now them sitting right next
> to each other it becomes pretty easy to just convert them so that
> instead of accessing them as:
> 
> dev->xps_rxqs_map
> dev->xps_cpus_map
> 
> You could instead access them as:
> dev->xps_map[XPS_RXQ];
> dev->xps_map[XPS_CPU];
> 
> Then instead of all the if/else logic we have in the code you just are
> passing the index of the xps_map you want to access and we have the
> nr_ids and num_tc all encoded in the map so the code itself. Then for
> displaying we are just using the fields from the map to validate.
> 
> We will still end up needing to take the rtnl_lock for the
> __netif_set_xps_queue case, however that should be the only case where
> we really need it as we have to re-read dev->num_tc and the
> netdev_txq_to_tc and guarantee they don't change while we are
> programming the map.

Thanks for the detailed explanations. That indeed would be good.

> That reminds me we may want to add an ASSERT_RTNL to the start of
> __netif_set_xps_queue and a comment indicating that we need to hold
> the rtnl lock to guarantee that num_tc and the Tx queue to TC mapping
> cannot change while we are programming the value into the map.

Good idea!

Thanks,
Antoine
Alexander Duyck Jan. 8, 2021, 10:04 p.m. UTC | #7
On Fri, Jan 8, 2021 at 10:58 AM Antoine Tenart <atenart@kernel.org> wrote:
>
> Quoting Alexander Duyck (2021-01-08 17:33:01)
> > On Fri, Jan 8, 2021 at 1:07 AM Antoine Tenart <atenart@kernel.org> wrote:
> > >
> > > Quoting Alexander Duyck (2021-01-07 17:38:35)
> > > > On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > > >
> > > > > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > > > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > > >
> > > > > That would require to hold rcu_read_lock in the caller and I'd like to
> > > > > keep it in that function.
> > > >
> > > > Actually you could probably make it work if you were to pass a pointer
> > > > to the RCU pointer.
> > >
> > > That should work but IMHO that could be easily breakable by future
> > > changes as it's a bit tricky.
> > >
> > > > > > >         if (dev->num_tc) {
> > > > > > >                 /* Do not allow XPS on subordinate device directly */
> > > > > > >                 num_tc = dev->num_tc;
> > > > > > > -               if (num_tc < 0) {
> > > > > > > -                       ret = -EINVAL;
> > > > > > > -                       goto err_rtnl_unlock;
> > > > > > > -               }
> > > > > > > +               if (num_tc < 0)
> > > > > > > +                       return -EINVAL;
> > > > > > >
> > > > > > >                 /* If queue belongs to subordinate dev use its map */
> > > > > > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > > > > >
> > > > > > >                 tc = netdev_txq_to_tc(dev, index);
> > > > > > > -               if (tc < 0) {
> > > > > > > -                       ret = -EINVAL;
> > > > > > > -                       goto err_rtnl_unlock;
> > > > > > > -               }
> > > > > > > +               if (tc < 0)
> > > > > > > +                       return -EINVAL;
> > > > > > >         }
> > > > > > >
> > > > > >
> > > > > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > > > > could simplify this a bit by pulling the num_tc info out of the
> > > > > > dev_map and only asking the Tx queue for the tc in that case and
> > > > > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > > > > if either are true.
> > > > > >
> > > > > > This would also allow us to address the fact that the rxqs feature
> > > > > > doesn't support the subordinate devices as you could pull out the bit
> > > > > > above related to the sb_dev and instead call that prior to calling
> > > > > > xps_queue_show so that you are operating on the correct device map.
> > > >
> > > > It probably would be necessary to pass dev and index if we pull the
> > > > netdev_get_tx_queue()->sb_dev bit out and performed that before we
> > > > called the xps_queue_show function. Specifically as the subordinate
> > > > device wouldn't match up with the queue device so we would be better
> > > > off pulling it out first.
> > >
> > > While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
> > > xps_queue_show seems like a good idea for consistency, I'm not sure
> > > it'll work: dev->num_tc is not only used to retrieve the number of tc
> > > but also as a condition on not being 0. We have things like:
> > >
> > >   // Always done with the original dev.
> > >   if (dev->num_tc) {
> > >
> > >       [...]
> > >
> > >       // Can be a subordinate dev.
> > >       tc = netdev_txq_to_tc(dev, index);
> > >   }
> > >
> > > And after moving num_tc in the map, we'll have checks like:
> > >
> > >   if (dev_maps->num_tc != dev->num_tc)
> > >       return -EINVAL;
> >
> > We shouldn't. That defeats the whole point and you will never be able
> > to rely on the num_tc in the device to remain constant. If we are
> > moving the value to an RCU accessible attribute we should just be
> > using that value. We can only use that check if we are in an rtnl_lock
> > anyway and we won't need that if we are just displaying the value.
> >
> > The checks should only be used to verify the tc of the queue is within
> > the bounds of the num_tc of the xps_map.
>
> Right. So that would mean we have to choose between:
>
> - Removing the rtnl lock but with the understanding that the value we
>   get when reading the maps might be invalid and not make sense with
>   the current dev->num_tc configuration.
>
> - Keeping the rtnl lock (which, I mean, I'd like to remove).
>
> My first goal for embedding num_tc in the maps was to prevent accessing
> the maps out-of-bound when dev->num_tc is updated after the maps are
> allocated. That's a possibility (I could produce such behaviour with
> KASAN enabled) even when taking the rtnl lock in the show/store helpers.
>
> We're now talking of also removing the rtnl lock, which is fine, I just
> want to make those two different goals explicit as they're not
> interdependent.

The problem is in the grand scheme of things the XPS map data is
already out of date by the time we look at it anyway regardless of the
locking mechanics. The problem is if we are doing both at the same
time we could already be looking at stale data as the num_tcs could
change after we dump the map and then it would still be invalid even
if we bothered with the RTNL lock or not.

In my mind we are better of with the RCU style behavior as we are just
using this to display the current setup anyway, and if we are updating
then we need the RTNL lock so that we can guarantee a consistent state
on the device while we are putting together the xps map.

One additional thought I had is that we may want to treat an
out-of-bounds reference the same as XPS not being enabled. So that we
would return a 0 mask instead of an error.  So instead of just
checking for "if (dev_maps)" we would check for "if (dev_maps && tc <
dev_maps->num_tc)".

> > > > > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > > > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > > > > complexity of this so that certain values would be protected by the
> > > > > > RCU lock.
> > > > > >
> > > > > > This might be a good time to look at encoding things like the number
> > > > > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > > > > duplication. Then you could just pass a pointer to the map you want to
> > > > > > display and the code should be able to just dump the values.:
> > > > >
> > > > > 100% agree to all the above. That would also prevent from making out of
> > > > > bound accesses when dev->num_tc is increased after dev_maps is
> > > > > allocated. I do have a series ready to be send storing num_tc into the
> > > > > maps, and reworking code to use it instead of dev->num_tc. The series
> > > > > also adds checks to ensure the map is valid when we access it (such as
> > > > > making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> > > > > into the map yet, but I'll look into it.
> > > > >
> > > > > The idea is to send it as a follow up series, as this one is only moving
> > > > > code around to improve maintenance and readability. Even if all the
> > > > > patches were in the same series that would be a prerequisite.
> > > >
> > > > Okay, so if we are going to do it as a follow-up that is fine I
> > > > suppose, but one of the reasons I brought it up is that it would help
> > > > this patch set in terms of readability/maintainability. An additional
> > > > change we could look at making would be to create an xps_map pointer
> > > > array instead of having individual pointers. Then you could simply be
> > > > passing an index into the array to indicate if we are accessing the
> > > > CPUs or the RXQs map.
> > >
> > > Merging the two maps and embedding an offset in the struct? With the
> > > upcoming changes embedding information in the map themselves we should
> > > have a single check to know what map to use. Using a single array with
> > > offsets would not improve that. Also maps maintenance when num_tc
> > > is updated would need to take care of both maps, having side effects
> > > such as removing the old rxqs map when allocating the cpus one (or the
> > > opposite).
> >
> > Sorry, I didn't mean to merge the two maps. Just go from two pointers
> > to an array containing two pointers. Right now them sitting right next
> > to each other it becomes pretty easy to just convert them so that
> > instead of accessing them as:
> >
> > dev->xps_rxqs_map
> > dev->xps_cpus_map
> >
> > You could instead access them as:
> > dev->xps_map[XPS_RXQ];
> > dev->xps_map[XPS_CPU];
> >
> > Then instead of all the if/else logic we have in the code you just are
> > passing the index of the xps_map you want to access and we have the
> > nr_ids and num_tc all encoded in the map so the code itself. Then for
> > displaying we are just using the fields from the map to validate.
> >
> > We will still end up needing to take the rtnl_lock for the
> > __netif_set_xps_queue case, however that should be the only case where
> > we really need it as we have to re-read dev->num_tc and the
> > netdev_txq_to_tc and guarantee they don't change while we are
> > programming the map.
>
> Thanks for the detailed explanations. That indeed would be good.

It does simplify things quite a bit, at least for the display logic
since essentially all the secondary values that tell us the size and
shape of the XPS map would essentially be stored in the map itself
now.

Odds are we could probably consolidate __netif_set_xps_queues quite a
bit with this as well. From what I can tell it looks like we could
probably default nr_ids to dev->num_rx_queues and then if we are doing
a CPU based map then we overwrite nr_ids with nr_cpu_ids and populate
the online mask.

> > That reminds me we may want to add an ASSERT_RTNL to the start of
> > __netif_set_xps_queue and a comment indicating that we need to hold
> > the rtnl lock to guarantee that num_tc and the Tx queue to TC mapping
> > cannot change while we are programming the value into the map.
>
> Good idea!
Antoine Tenart Jan. 8, 2021, 10:46 p.m. UTC | #8
Quoting Alexander Duyck (2021-01-08 23:04:57)
> On Fri, Jan 8, 2021 at 10:58 AM Antoine Tenart <atenart@kernel.org> wrote:
> >
> > Quoting Alexander Duyck (2021-01-08 17:33:01)
> > > On Fri, Jan 8, 2021 at 1:07 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > >
> > > > Quoting Alexander Duyck (2021-01-07 17:38:35)
> > > > > On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > > > >
> > > > > > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > > > > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart <atenart@kernel.org> wrote:
> > > > > > > >         if (dev->num_tc) {
> > > > > > > >                 /* Do not allow XPS on subordinate device directly */
> > > > > > > >                 num_tc = dev->num_tc;
> > > > > > > > -               if (num_tc < 0) {
> > > > > > > > -                       ret = -EINVAL;
> > > > > > > > -                       goto err_rtnl_unlock;
> > > > > > > > -               }
> > > > > > > > +               if (num_tc < 0)
> > > > > > > > +                       return -EINVAL;
> > > > > > > >
> > > > > > > >                 /* If queue belongs to subordinate dev use its map */
> > > > > > > >                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > > > > > >
> > > > > > > >                 tc = netdev_txq_to_tc(dev, index);
> > > > > > > > -               if (tc < 0) {
> > > > > > > > -                       ret = -EINVAL;
> > > > > > > > -                       goto err_rtnl_unlock;
> > > > > > > > -               }
> > > > > > > > +               if (tc < 0)
> > > > > > > > +                       return -EINVAL;
> > > > > > > >         }
> > > > > > > >
> > > > > > >
> > > > > > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > > > > > could simplify this a bit by pulling the num_tc info out of the
> > > > > > > dev_map and only asking the Tx queue for the tc in that case and
> > > > > > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > > > > > if either are true.
> > > > > > >
> > > > > > > This would also allow us to address the fact that the rxqs feature
> > > > > > > doesn't support the subordinate devices as you could pull out the bit
> > > > > > > above related to the sb_dev and instead call that prior to calling
> > > > > > > xps_queue_show so that you are operating on the correct device map.
> > > > >
> > > > > It probably would be necessary to pass dev and index if we pull the
> > > > > netdev_get_tx_queue()->sb_dev bit out and performed that before we
> > > > > called the xps_queue_show function. Specifically as the subordinate
> > > > > device wouldn't match up with the queue device so we would be better
> > > > > off pulling it out first.
> > > >
> > > > While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
> > > > xps_queue_show seems like a good idea for consistency, I'm not sure
> > > > it'll work: dev->num_tc is not only used to retrieve the number of tc
> > > > but also as a condition on not being 0. We have things like:
> > > >
> > > >   // Always done with the original dev.
> > > >   if (dev->num_tc) {
> > > >
> > > >       [...]
> > > >
> > > >       // Can be a subordinate dev.
> > > >       tc = netdev_txq_to_tc(dev, index);
> > > >   }
> > > >
> > > > And after moving num_tc in the map, we'll have checks like:
> > > >
> > > >   if (dev_maps->num_tc != dev->num_tc)
> > > >       return -EINVAL;
> > >
> > > We shouldn't. That defeats the whole point and you will never be able
> > > to rely on the num_tc in the device to remain constant. If we are
> > > moving the value to an RCU accessible attribute we should just be
> > > using that value. We can only use that check if we are in an rtnl_lock
> > > anyway and we won't need that if we are just displaying the value.
> > >
> > > The checks should only be used to verify the tc of the queue is within
> > > the bounds of the num_tc of the xps_map.
> >
> > Right. So that would mean we have to choose between:
> >
> > - Removing the rtnl lock but with the understanding that the value we
> >   get when reading the maps might be invalid and not make sense with
> >   the current dev->num_tc configuration.
> >
> > - Keeping the rtnl lock (which, I mean, I'd like to remove).
> >
> > My first goal for embedding num_tc in the maps was to prevent accessing
> > the maps out-of-bound when dev->num_tc is updated after the maps are
> > allocated. That's a possibility (I could produce such behaviour with
> > KASAN enabled) even when taking the rtnl lock in the show/store helpers.
> >
> > We're now talking of also removing the rtnl lock, which is fine, I just
> > want to make those two different goals explicit as they're not
> > interdependent.
> 
> The problem is in the grand scheme of things the XPS map data is
> already out of date by the time we look at it anyway regardless of the
> locking mechanics. The problem is if we are doing both at the same
> time we could already be looking at stale data as the num_tcs could
> change after we dump the map and then it would still be invalid even
> if we bothered with the RTNL lock or not.
> 
> In my mind we are better of with the RCU style behavior as we are just
> using this to display the current setup anyway, and if we are updating
> then we need the RTNL lock so that we can guarantee a consistent state
> on the device while we are putting together the xps map.

You're right, there's no way to ensure we're looking at the current
setup. In that case let's use the embedded map values only.

> One additional thought I had is that we may want to treat an
> out-of-bounds reference the same as XPS not being enabled. So that we
> would return a 0 mask instead of an error.  So instead of just
> checking for "if (dev_maps)" we would check for "if (dev_maps && tc <
> dev_maps->num_tc)".

Agreed, that would be good and less confusing for the user.

> > > > > > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > > > > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > > > > > complexity of this so that certain values would be protected by the
> > > > > > > RCU lock.
> > > > > > >
> > > > > > > This might be a good time to look at encoding things like the number
> > > > > > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > > > > > duplication. Then you could just pass a pointer to the map you want to
> > > > > > > display and the code should be able to just dump the values.:
> > > > > >
> > > > > > 100% agree to all the above. That would also prevent from making out of
> > > > > > bound accesses when dev->num_tc is increased after dev_maps is
> > > > > > allocated. I do have a series ready to be send storing num_tc into the
> > > > > > maps, and reworking code to use it instead of dev->num_tc. The series
> > > > > > also adds checks to ensure the map is valid when we access it (such as
> > > > > > making sure dev->num_tc == map->num_tc). I however did not move nr_ids
> > > > > > into the map yet, but I'll look into it.
> > > > > >
> > > > > > The idea is to send it as a follow up series, as this one is only moving
> > > > > > code around to improve maintenance and readability. Even if all the
> > > > > > patches were in the same series that would be a prerequisite.
> > > > >
> > > > > Okay, so if we are going to do it as a follow-up that is fine I
> > > > > suppose, but one of the reasons I brought it up is that it would help
> > > > > this patch set in terms of readability/maintainability. An additional
> > > > > change we could look at making would be to create an xps_map pointer
> > > > > array instead of having individual pointers. Then you could simply be
> > > > > passing an index into the array to indicate if we are accessing the
> > > > > CPUs or the RXQs map.
> > > >
> > > > Merging the two maps and embedding an offset in the struct? With the
> > > > upcoming changes embedding information in the map themselves we should
> > > > have a single check to know what map to use. Using a single array with
> > > > offsets would not improve that. Also maps maintenance when num_tc
> > > > is updated would need to take care of both maps, having side effects
> > > > such as removing the old rxqs map when allocating the cpus one (or the
> > > > opposite).
> > >
> > > Sorry, I didn't mean to merge the two maps. Just go from two pointers
> > > to an array containing two pointers. Right now them sitting right next
> > > to each other it becomes pretty easy to just convert them so that
> > > instead of accessing them as:
> > >
> > > dev->xps_rxqs_map
> > > dev->xps_cpus_map
> > >
> > > You could instead access them as:
> > > dev->xps_map[XPS_RXQ];
> > > dev->xps_map[XPS_CPU];
> > >
> > > Then instead of all the if/else logic we have in the code you just are
> > > passing the index of the xps_map you want to access and we have the
> > > nr_ids and num_tc all encoded in the map so the code itself. Then for
> > > displaying we are just using the fields from the map to validate.
> > >
> > > We will still end up needing to take the rtnl_lock for the
> > > __netif_set_xps_queue case, however that should be the only case where
> > > we really need it as we have to re-read dev->num_tc and the
> > > netdev_txq_to_tc and guarantee they don't change while we are
> > > programming the map.
> >
> > Thanks for the detailed explanations. That indeed would be good.
> 
> It does simplify things quite a bit, at least for the display logic
> since essentially all the secondary values that tell us the size and
> shape of the XPS map would essentially be stored in the map itself
> now.
> 
> Odds are we could probably consolidate __netif_set_xps_queues quite a
> bit with this as well. From what I can tell it looks like we could
> probably default nr_ids to dev->num_rx_queues and then if we are doing
> a CPU based map then we overwrite nr_ids with nr_cpu_ids and populate
> the online mask.

Sure.

All right, I'll cook up a v2 for this series and take the other comments
and suggestions into account for the follow-up one.

Thanks!
Antoine
diff mbox series

Patch

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 5a39e9b38e5f..6e6bc05181f6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1314,77 +1314,98 @@  static const struct attribute_group dql_group = {
 #endif /* CONFIG_BQL */
 
 #ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
-			     char *buf)
+/* Should be called with the rtnl lock held. */
+static int xps_queue_show(struct net_device *dev, unsigned long **mask,
+			  unsigned int index, bool is_rxqs_map)
 {
-	int cpu, len, ret, num_tc = 1, tc = 0;
-	struct net_device *dev = queue->dev;
+	const unsigned long *possible_mask = NULL;
+	int j, num_tc = 0, tc = 0, ret = 0;
 	struct xps_dev_maps *dev_maps;
-	unsigned long *mask;
-	unsigned int index;
-
-	if (!netif_is_multiqueue(dev))
-		return -ENOENT;
-
-	index = get_netdev_queue_index(queue);
-
-	if (!rtnl_trylock())
-		return restart_syscall();
+	unsigned int nr_ids;
 
 	if (dev->num_tc) {
 		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
-		if (num_tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
+		if (num_tc < 0)
+			return -EINVAL;
 
 		/* If queue belongs to subordinate dev use its map */
 		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 
 		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
+		if (tc < 0)
+			return -EINVAL;
 	}
 
-	mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
-	if (!mask) {
-		ret = -ENOMEM;
-		goto err_rtnl_unlock;
+	rcu_read_lock();
+
+	if (is_rxqs_map) {
+		dev_maps = rcu_dereference(dev->xps_rxqs_map);
+		nr_ids = dev->num_rx_queues;
+	} else {
+		dev_maps = rcu_dereference(dev->xps_cpus_map);
+		nr_ids = nr_cpu_ids;
+		if (num_possible_cpus() > 1)
+			possible_mask = cpumask_bits(cpu_possible_mask);
 	}
+	if (!dev_maps)
+		goto rcu_unlock;
 
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_cpus_map);
-	if (dev_maps) {
-		for_each_possible_cpu(cpu) {
-			int i, tci = cpu * num_tc + tc;
-			struct xps_map *map;
-
-			map = rcu_dereference(dev_maps->attr_map[tci]);
-			if (!map)
-				continue;
-
-			for (i = map->len; i--;) {
-				if (map->queues[i] == index) {
-					set_bit(cpu, mask);
-					break;
-				}
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		int i, tci = j * num_tc + tc;
+		struct xps_map *map;
+
+		map = rcu_dereference(dev_maps->attr_map[tci]);
+		if (!map)
+			continue;
+
+		for (i = map->len; i--;) {
+			if (map->queues[i] == index) {
+				set_bit(j, *mask);
+				break;
 			}
 		}
 	}
+
+rcu_unlock:
 	rcu_read_unlock();
 
+	return ret;
+}
+
+static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+{
+	struct net_device *dev = queue->dev;
+	unsigned long *mask;
+	unsigned int index;
+	int len, ret;
+
+	if (!netif_is_multiqueue(dev))
+		return -ENOENT;
+
+	index = get_netdev_queue_index(queue);
+
+	mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
+	if (!mask)
+		return -ENOMEM;
+
+	if (!rtnl_trylock()) {
+		bitmap_free(mask);
+		return restart_syscall();
+	}
+
+	ret = xps_queue_show(dev, &mask, index, false);
 	rtnl_unlock();
 
+	if (ret) {
+		bitmap_free(mask);
+		return ret;
+	}
+
 	len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids);
 	bitmap_free(mask);
 	return len < PAGE_SIZE ? len : -EINVAL;
-
-err_rtnl_unlock:
-	rtnl_unlock();
-	return ret;
 }
 
 static ssize_t xps_cpus_store(struct netdev_queue *queue,
@@ -1430,65 +1451,34 @@  static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
 
 static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 {
-	int j, len, ret, num_tc = 1, tc = 0;
 	struct net_device *dev = queue->dev;
-	struct xps_dev_maps *dev_maps;
 	unsigned long *mask;
 	unsigned int index;
+	int len, ret;
 
 	index = get_netdev_queue_index(queue);
 
-	if (!rtnl_trylock())
-		return restart_syscall();
-
-	if (dev->num_tc) {
-		num_tc = dev->num_tc;
-		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
-	}
 	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
-	if (!mask) {
-		ret = -ENOMEM;
-		goto err_rtnl_unlock;
-	}
-
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
-	if (!dev_maps)
-		goto out_no_maps;
-
-	for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
-	     j < dev->num_rx_queues;) {
-		int i, tci = j * num_tc + tc;
-		struct xps_map *map;
-
-		map = rcu_dereference(dev_maps->attr_map[tci]);
-		if (!map)
-			continue;
+	if (!mask)
+		return -ENOMEM;
 
-		for (i = map->len; i--;) {
-			if (map->queues[i] == index) {
-				set_bit(j, mask);
-				break;
-			}
-		}
+	if (!rtnl_trylock()) {
+		bitmap_free(mask);
+		return restart_syscall();
 	}
-out_no_maps:
-	rcu_read_unlock();
 
+	ret = xps_queue_show(dev, &mask, index, true);
 	rtnl_unlock();
 
+	if (ret) {
+		bitmap_free(mask);
+		return ret;
+	}
+
 	len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
 	bitmap_free(mask);
 
 	return len < PAGE_SIZE ? len : -EINVAL;
-
-err_rtnl_unlock:
-	rtnl_unlock();
-	return ret;
 }
 
 static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,