Message ID | 20220105132141.2648876-3-vladimir.oltean@nxp.com (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | Cleanup to main DSA structures | expand |
On 1/5/22 5:21 AM, Vladimir Oltean wrote: > struct dsa_port has 5 bool members which create quite a number of 7 byte > holes in the structure layout. By merging them all into bitfields of an > u8, and placing that u8 in the 1-byte hole after dp->mac and dp->stp_state, > we can reduce the structure size from 576 bytes to 552 bytes on arm64. > > Before: > > pahole -C dsa_port net/dsa/slave.o > struct dsa_port { > union { > struct net_device * master; /* 0 8 */ > struct net_device * slave; /* 0 8 */ > }; /* 0 8 */ > const struct dsa_device_ops * tag_ops; /* 8 8 */ > struct dsa_switch_tree * dst; /* 16 8 */ > struct sk_buff * (*rcv)(struct sk_buff *, struct net_device *); /* 24 8 */ > enum { > DSA_PORT_TYPE_UNUSED = 0, > DSA_PORT_TYPE_CPU = 1, > DSA_PORT_TYPE_DSA = 2, > DSA_PORT_TYPE_USER = 3, > } type; /* 32 4 */ > > /* XXX 4 bytes hole, try to pack */ > > struct dsa_switch * ds; /* 40 8 */ > unsigned int index; /* 48 4 */ > > /* XXX 4 bytes hole, try to pack */ > > const char * name; /* 56 8 */ > /* --- cacheline 1 boundary (64 bytes) --- */ > struct dsa_port * cpu_dp; /* 64 8 */ > u8 mac[6]; /* 72 6 */ > u8 stp_state; /* 78 1 */ > > /* XXX 1 byte hole, try to pack */ > > struct device_node * dn; /* 80 8 */ > unsigned int ageing_time; /* 88 4 */ > bool vlan_filtering; /* 92 1 */ > bool learning; /* 93 1 */ > > /* XXX 2 bytes hole, try to pack */ > > struct dsa_bridge * bridge; /* 96 8 */ > struct devlink_port devlink_port; /* 104 288 */ > /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ > bool devlink_port_setup; /* 392 1 */ > > /* XXX 7 bytes hole, try to pack */ > > struct phylink * pl; /* 400 8 */ > struct phylink_config pl_config; /* 408 40 */ > /* --- cacheline 7 boundary (448 bytes) --- */ > struct net_device * lag_dev; /* 448 8 */ > bool lag_tx_enabled; /* 456 1 */ > > /* XXX 7 bytes hole, try to pack */ > > struct net_device * hsr_dev; /* 464 8 */ > struct list_head list; /* 472 16 */ > const struct ethtool_ops * orig_ethtool_ops; /* 488 8 */ > const struct dsa_netdevice_ops * netdev_ops; /* 496 8 */ > struct mutex addr_lists_lock; /* 504 32 */ > /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */ > struct list_head fdbs; /* 536 16 */ > struct list_head mdbs; /* 552 16 */ > bool setup; /* 568 1 */ > > /* size: 576, cachelines: 9, members: 30 */ > /* sum members: 544, holes: 6, sum holes: 25 */ > /* padding: 7 */ > }; > > After: > > pahole -C dsa_port net/dsa/slave.o > struct dsa_port { > union { > struct net_device * master; /* 0 8 */ > struct net_device * slave; /* 0 8 */ > }; /* 0 8 */ > const struct dsa_device_ops * tag_ops; /* 8 8 */ > struct dsa_switch_tree * dst; /* 16 8 */ > struct sk_buff * (*rcv)(struct sk_buff *, struct net_device *); /* 24 8 */ > enum { > DSA_PORT_TYPE_UNUSED = 0, > DSA_PORT_TYPE_CPU = 1, > DSA_PORT_TYPE_DSA = 2, > DSA_PORT_TYPE_USER = 3, > } type; /* 32 4 */ > > /* XXX 4 bytes hole, try to pack */ > > struct dsa_switch * ds; /* 40 8 */ > unsigned int index; /* 48 4 */ > > /* XXX 4 bytes hole, try to pack */ > > const char * name; /* 56 8 */ > /* --- cacheline 1 boundary (64 bytes) --- */ > struct dsa_port * cpu_dp; /* 64 8 */ > u8 mac[6]; /* 72 6 */ > u8 stp_state; /* 78 1 */ > u8 vlan_filtering:1; /* 79: 0 1 */ > u8 learning:1; /* 79: 1 1 */ > u8 lag_tx_enabled:1; /* 79: 2 1 */ > u8 devlink_port_setup:1; /* 79: 3 1 */ > u8 setup:1; /* 79: 4 1 */ > > /* XXX 3 bits hole, try to pack */ > > struct device_node * dn; /* 80 8 */ > unsigned int ageing_time; /* 88 4 */ > > /* XXX 4 bytes hole, try to pack */ > > struct dsa_bridge * bridge; /* 96 8 */ > struct devlink_port devlink_port; /* 104 288 */ > /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ > struct phylink * pl; /* 392 8 */ > struct phylink_config pl_config; /* 400 40 */ > struct net_device * lag_dev; /* 440 8 */ > /* --- cacheline 7 boundary (448 bytes) --- */ > struct net_device * hsr_dev; /* 448 8 */ > struct list_head list; /* 456 16 */ > const struct ethtool_ops * orig_ethtool_ops; /* 472 8 */ > const struct dsa_netdevice_ops * netdev_ops; /* 480 8 */ > struct mutex addr_lists_lock; /* 488 32 */ > /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */ > struct list_head fdbs; /* 520 16 */ > struct list_head mdbs; /* 536 16 */ > > /* size: 552, cachelines: 9, members: 30 */ > /* sum members: 539, holes: 3, sum holes: 12 */ > /* sum bitfield members: 5 bits, bit holes: 1, sum bit holes: 3 bits */ > /* last cacheline: 40 bytes */ > }; > > Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Hi Florian,
On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Thanks a lot for the review.
I'm a bit on the fence on this patch and the other one for dsa_switch.
The thing is that bit fields are not atomic in C89, so if we update any
of the flags inside dp or ds concurrently (like dp->vlan_filtering),
we're in trouble. Right now this isn't a problem, because most of the
flags are set either during probe, or during ds->ops->setup, or are
serialized by the rtnl_mutex in ways that are there to stay (switchdev
notifiers). That's why I didn't say anything about it. But it may be a
caveat to watch out for in the future. Do you think we need to do
something about it? A lock would not be necessary, strictly speaking.
On 1/5/22 10:39 AM, Vladimir Oltean wrote: > Hi Florian, > > On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote: >> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> > > Thanks a lot for the review. > > I'm a bit on the fence on this patch and the other one for dsa_switch. > The thing is that bit fields are not atomic in C89, so if we update any > of the flags inside dp or ds concurrently (like dp->vlan_filtering), > we're in trouble. Right now this isn't a problem, because most of the > flags are set either during probe, or during ds->ops->setup, or are > serialized by the rtnl_mutex in ways that are there to stay (switchdev > notifiers). That's why I didn't say anything about it. But it may be a > caveat to watch out for in the future. Do you think we need to do > something about it? A lock would not be necessary, strictly speaking. I would probably start with a comment that describes these pitfalls, I wish we had a programmatic way to ensure that these flags would not be set dynamically and outside of the probe/setup path but that won't happen easily. Should we be switching to a bitmask and bitmap helpers to be future proof?
On Wed, Jan 05, 2022 at 10:46:44AM -0800, Florian Fainelli wrote: > On 1/5/22 10:39 AM, Vladimir Oltean wrote: > > Hi Florian, > > > > On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote: > >> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> > > > > Thanks a lot for the review. > > > > I'm a bit on the fence on this patch and the other one for dsa_switch. > > The thing is that bit fields are not atomic in C89, so if we update any > > of the flags inside dp or ds concurrently (like dp->vlan_filtering), > > we're in trouble. Right now this isn't a problem, because most of the > > flags are set either during probe, or during ds->ops->setup, or are > > serialized by the rtnl_mutex in ways that are there to stay (switchdev > > notifiers). That's why I didn't say anything about it. But it may be a > > caveat to watch out for in the future. Do you think we need to do > > something about it? A lock would not be necessary, strictly speaking. > > I would probably start with a comment that describes these pitfalls, I > wish we had a programmatic way to ensure that these flags would not be > set dynamically and outside of the probe/setup path but that won't > happen easily. A comment is probably good. > Should we be switching to a bitmask and bitmap helpers to be future proof? We could have done that, and it would certainly raise the awareness a bit more, but the reason I went with the bit fields at least in the first place was to reduce the churn in drivers. Otherwise, sure, if driver changes are on the table for this, we can even discuss about adding more arguments to dsa_register_switch(). The amount of poking that drivers do inside struct dsa_switch has grown, and sometimes it's not even immediately clear which members of that structure are supposed to be populated by whom and when. We could definitely just tell them to provide their desired behavior as arguments (vlan_filtering_is_global, needs_standalone_vlan_filtering, configure_vlan_while_not_filtering, untag_bridge_pvid, assisted_learning_on_cpu_port, ageing_time_min, ageing_time_max, phys_mii_mask, num_tx_queues, num_lag_ids, max_num_bridges) and only DSA will update struct dsa_switch, and we could then control races better that way. But the downside is that we'd have 11 extra arguments to dsa_register_switch()....
On 1/5/22 10:56 AM, Vladimir Oltean wrote: > On Wed, Jan 05, 2022 at 10:46:44AM -0800, Florian Fainelli wrote: >> On 1/5/22 10:39 AM, Vladimir Oltean wrote: >>> Hi Florian, >>> >>> On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote: >>>> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> >>> >>> Thanks a lot for the review. >>> >>> I'm a bit on the fence on this patch and the other one for dsa_switch. >>> The thing is that bit fields are not atomic in C89, so if we update any >>> of the flags inside dp or ds concurrently (like dp->vlan_filtering), >>> we're in trouble. Right now this isn't a problem, because most of the >>> flags are set either during probe, or during ds->ops->setup, or are >>> serialized by the rtnl_mutex in ways that are there to stay (switchdev >>> notifiers). That's why I didn't say anything about it. But it may be a >>> caveat to watch out for in the future. Do you think we need to do >>> something about it? A lock would not be necessary, strictly speaking. >> >> I would probably start with a comment that describes these pitfalls, I >> wish we had a programmatic way to ensure that these flags would not be >> set dynamically and outside of the probe/setup path but that won't >> happen easily. > > A comment is probably good. > >> Should we be switching to a bitmask and bitmap helpers to be future proof? > > We could have done that, and it would certainly raise the awareness a > bit more, but the reason I went with the bit fields at least in the > first place was to reduce the churn in drivers. Otherwise, sure, if > driver changes are on the table for this, we can even discuss about > adding more arguments to dsa_register_switch(). The amount of poking > that drivers do inside struct dsa_switch has grown, and sometimes it's > not even immediately clear which members of that structure are supposed > to be populated by whom and when. We could definitely just tell them to > provide their desired behavior as arguments (vlan_filtering_is_global, > needs_standalone_vlan_filtering, configure_vlan_while_not_filtering, > untag_bridge_pvid, assisted_learning_on_cpu_port, ageing_time_min, > ageing_time_max, phys_mii_mask, num_tx_queues, num_lag_ids, max_num_bridges) > and only DSA will update struct dsa_switch, and we could then control > races better that way. But the downside is that we'd have 11 extra > arguments to dsa_register_switch().... I would not mind if we introduced a dsa_switch::features or similar bitmap and require driver to set bits in there before dsa_register_switch() but that seems outside the scope of your patch set, and I am not sure what the benefits would be unless we do happen to do dynamic bit/feature toggling. So for now, I believe a comment is enough, and if we spot a driver changing that paradigm we consider a more "dynamic" solution.
On Wed, Jan 05, 2022 at 06:39:34PM +0000, Vladimir Oltean wrote: > Hi Florian, > > On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote: > > Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> > > Thanks a lot for the review. > > I'm a bit on the fence on this patch and the other one for dsa_switch. > The thing is that bit fields are not atomic in C89, so if we update any > of the flags inside dp or ds concurrently (like dp->vlan_filtering), > we're in trouble. Right now this isn't a problem, because most of the > flags are set either during probe, or during ds->ops->setup I've no idea if it ever got merged, but: https://lwn.net/Articles/724319/ Andrew
diff --git a/include/net/dsa.h b/include/net/dsa.h index 8878f9ce251b..a8f0037b58e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -261,18 +261,23 @@ struct dsa_port { u8 stp_state; + u8 vlan_filtering:1, + /* Managed by DSA on user ports and by + * drivers on CPU and DSA ports + */ + learning:1, + lag_tx_enabled:1, + devlink_port_setup:1, + setup:1; + struct device_node *dn; unsigned int ageing_time; - bool vlan_filtering; - /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ - bool learning; + struct dsa_bridge *bridge; struct devlink_port devlink_port; - bool devlink_port_setup; struct phylink *pl; struct phylink_config pl_config; struct net_device *lag_dev; - bool lag_tx_enabled; struct net_device *hsr_dev; struct list_head list; @@ -293,8 +298,6 @@ struct dsa_port { struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; - - bool setup; }; /* TODO: ideally DSA ports would have a single dp->link_dp member,
struct dsa_port has 5 bool members which create quite a number of 7 byte holes in the structure layout. By merging them all into bitfields of an u8, and placing that u8 in the 1-byte hole after dp->mac and dp->stp_state, we can reduce the structure size from 576 bytes to 552 bytes on arm64. Before: pahole -C dsa_port net/dsa/slave.o struct dsa_port { union { struct net_device * master; /* 0 8 */ struct net_device * slave; /* 0 8 */ }; /* 0 8 */ const struct dsa_device_ops * tag_ops; /* 8 8 */ struct dsa_switch_tree * dst; /* 16 8 */ struct sk_buff * (*rcv)(struct sk_buff *, struct net_device *); /* 24 8 */ enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU = 1, DSA_PORT_TYPE_DSA = 2, DSA_PORT_TYPE_USER = 3, } type; /* 32 4 */ /* XXX 4 bytes hole, try to pack */ struct dsa_switch * ds; /* 40 8 */ unsigned int index; /* 48 4 */ /* XXX 4 bytes hole, try to pack */ const char * name; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct dsa_port * cpu_dp; /* 64 8 */ u8 mac[6]; /* 72 6 */ u8 stp_state; /* 78 1 */ /* XXX 1 byte hole, try to pack */ struct device_node * dn; /* 80 8 */ unsigned int ageing_time; /* 88 4 */ bool vlan_filtering; /* 92 1 */ bool learning; /* 93 1 */ /* XXX 2 bytes hole, try to pack */ struct dsa_bridge * bridge; /* 96 8 */ struct devlink_port devlink_port; /* 104 288 */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ bool devlink_port_setup; /* 392 1 */ /* XXX 7 bytes hole, try to pack */ struct phylink * pl; /* 400 8 */ struct phylink_config pl_config; /* 408 40 */ /* --- cacheline 7 boundary (448 bytes) --- */ struct net_device * lag_dev; /* 448 8 */ bool lag_tx_enabled; /* 456 1 */ /* XXX 7 bytes hole, try to pack */ struct net_device * hsr_dev; /* 464 8 */ struct list_head list; /* 472 16 */ const struct ethtool_ops * orig_ethtool_ops; /* 488 8 */ const struct dsa_netdevice_ops * netdev_ops; /* 496 8 */ struct mutex addr_lists_lock; /* 504 32 */ /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */ struct list_head fdbs; /* 536 16 */ struct list_head mdbs; /* 552 16 */ bool setup; /* 568 1 */ /* size: 576, cachelines: 9, members: 30 */ /* sum members: 544, holes: 6, sum holes: 25 */ /* padding: 7 */ }; After: pahole -C dsa_port net/dsa/slave.o struct dsa_port { union { struct net_device * master; /* 0 8 */ struct net_device * slave; /* 0 8 */ }; /* 0 8 */ const struct dsa_device_ops * tag_ops; /* 8 8 */ struct dsa_switch_tree * dst; /* 16 8 */ struct sk_buff * (*rcv)(struct sk_buff *, struct net_device *); /* 24 8 */ enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU = 1, DSA_PORT_TYPE_DSA = 2, DSA_PORT_TYPE_USER = 3, } type; /* 32 4 */ /* XXX 4 bytes hole, try to pack */ struct dsa_switch * ds; /* 40 8 */ unsigned int index; /* 48 4 */ /* XXX 4 bytes hole, try to pack */ const char * name; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct dsa_port * cpu_dp; /* 64 8 */ u8 mac[6]; /* 72 6 */ u8 stp_state; /* 78 1 */ u8 vlan_filtering:1; /* 79: 0 1 */ u8 learning:1; /* 79: 1 1 */ u8 lag_tx_enabled:1; /* 79: 2 1 */ u8 devlink_port_setup:1; /* 79: 3 1 */ u8 setup:1; /* 79: 4 1 */ /* XXX 3 bits hole, try to pack */ struct device_node * dn; /* 80 8 */ unsigned int ageing_time; /* 88 4 */ /* XXX 4 bytes hole, try to pack */ struct dsa_bridge * bridge; /* 96 8 */ struct devlink_port devlink_port; /* 104 288 */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct phylink * pl; /* 392 8 */ struct phylink_config pl_config; /* 400 40 */ struct net_device * lag_dev; /* 440 8 */ /* --- cacheline 7 boundary (448 bytes) --- */ struct net_device * hsr_dev; /* 448 8 */ struct list_head list; /* 456 16 */ const struct ethtool_ops * orig_ethtool_ops; /* 472 8 */ const struct dsa_netdevice_ops * netdev_ops; /* 480 8 */ struct mutex addr_lists_lock; /* 488 32 */ /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */ struct list_head fdbs; /* 520 16 */ struct list_head mdbs; /* 536 16 */ /* size: 552, cachelines: 9, members: 30 */ /* sum members: 539, holes: 3, sum holes: 12 */ /* sum bitfield members: 5 bits, bit holes: 1, sum bit holes: 3 bits */ /* last cacheline: 40 bytes */ }; Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> --- include/net/dsa.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)