diff mbox series

[v2,net-next,2/7] net: dsa: merge all bools of struct dsa_port into a single u8

Message ID 20220105132141.2648876-3-vladimir.oltean@nxp.com (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Headers show
Series Cleanup to main DSA structures | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/cc_maintainers warning 2 maintainers not CCed: olteanv@gmail.com linux@armlinux.org.uk
netdev/build_clang success Errors and warnings before: 26 this patch: 26
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 36 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Vladimir Oltean Jan. 5, 2022, 1:21 p.m. UTC
struct dsa_port has 5 bool members which create quite a number of 7 byte
holes in the structure layout. By merging them all into bitfields of an
u8, and placing that u8 in the 1-byte hole after dp->mac and dp->stp_state,
we can reduce the structure size from 576 bytes to 552 bytes on arm64.

Before:

pahole -C dsa_port net/dsa/slave.o
struct dsa_port {
        union {
                struct net_device * master;              /*     0     8 */
                struct net_device * slave;               /*     0     8 */
        };                                               /*     0     8 */
        const struct dsa_device_ops  * tag_ops;          /*     8     8 */
        struct dsa_switch_tree *   dst;                  /*    16     8 */
        struct sk_buff *           (*rcv)(struct sk_buff *, struct net_device *); /*    24     8 */
        enum {
                DSA_PORT_TYPE_UNUSED = 0,
                DSA_PORT_TYPE_CPU    = 1,
                DSA_PORT_TYPE_DSA    = 2,
                DSA_PORT_TYPE_USER   = 3,
        } type;                                          /*    32     4 */

        /* XXX 4 bytes hole, try to pack */

        struct dsa_switch *        ds;                   /*    40     8 */
        unsigned int               index;                /*    48     4 */

        /* XXX 4 bytes hole, try to pack */

        const char  *              name;                 /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct dsa_port *          cpu_dp;               /*    64     8 */
        u8                         mac[6];               /*    72     6 */
        u8                         stp_state;            /*    78     1 */

        /* XXX 1 byte hole, try to pack */

        struct device_node *       dn;                   /*    80     8 */
        unsigned int               ageing_time;          /*    88     4 */
        bool                       vlan_filtering;       /*    92     1 */
        bool                       learning;             /*    93     1 */

        /* XXX 2 bytes hole, try to pack */

        struct dsa_bridge *        bridge;               /*    96     8 */
        struct devlink_port        devlink_port;         /*   104   288 */
        /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
        bool                       devlink_port_setup;   /*   392     1 */

        /* XXX 7 bytes hole, try to pack */

        struct phylink *           pl;                   /*   400     8 */
        struct phylink_config      pl_config;            /*   408    40 */
        /* --- cacheline 7 boundary (448 bytes) --- */
        struct net_device *        lag_dev;              /*   448     8 */
        bool                       lag_tx_enabled;       /*   456     1 */

        /* XXX 7 bytes hole, try to pack */

        struct net_device *        hsr_dev;              /*   464     8 */
        struct list_head           list;                 /*   472    16 */
        const struct ethtool_ops  * orig_ethtool_ops;    /*   488     8 */
        const struct dsa_netdevice_ops  * netdev_ops;    /*   496     8 */
        struct mutex               addr_lists_lock;      /*   504    32 */
        /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
        struct list_head           fdbs;                 /*   536    16 */
        struct list_head           mdbs;                 /*   552    16 */
        bool                       setup;                /*   568     1 */

        /* size: 576, cachelines: 9, members: 30 */
        /* sum members: 544, holes: 6, sum holes: 25 */
        /* padding: 7 */
};

After:

pahole -C dsa_port net/dsa/slave.o
struct dsa_port {
        union {
                struct net_device * master;              /*     0     8 */
                struct net_device * slave;               /*     0     8 */
        };                                               /*     0     8 */
        const struct dsa_device_ops  * tag_ops;          /*     8     8 */
        struct dsa_switch_tree *   dst;                  /*    16     8 */
        struct sk_buff *           (*rcv)(struct sk_buff *, struct net_device *); /*    24     8 */
        enum {
                DSA_PORT_TYPE_UNUSED = 0,
                DSA_PORT_TYPE_CPU    = 1,
                DSA_PORT_TYPE_DSA    = 2,
                DSA_PORT_TYPE_USER   = 3,
        } type;                                          /*    32     4 */

        /* XXX 4 bytes hole, try to pack */

        struct dsa_switch *        ds;                   /*    40     8 */
        unsigned int               index;                /*    48     4 */

        /* XXX 4 bytes hole, try to pack */

        const char  *              name;                 /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct dsa_port *          cpu_dp;               /*    64     8 */
        u8                         mac[6];               /*    72     6 */
        u8                         stp_state;            /*    78     1 */
        u8                         vlan_filtering:1;     /*    79: 0  1 */
        u8                         learning:1;           /*    79: 1  1 */
        u8                         lag_tx_enabled:1;     /*    79: 2  1 */
        u8                         devlink_port_setup:1; /*    79: 3  1 */
        u8                         setup:1;              /*    79: 4  1 */

        /* XXX 3 bits hole, try to pack */

        struct device_node *       dn;                   /*    80     8 */
        unsigned int               ageing_time;          /*    88     4 */

        /* XXX 4 bytes hole, try to pack */

        struct dsa_bridge *        bridge;               /*    96     8 */
        struct devlink_port        devlink_port;         /*   104   288 */
        /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
        struct phylink *           pl;                   /*   392     8 */
        struct phylink_config      pl_config;            /*   400    40 */
        struct net_device *        lag_dev;              /*   440     8 */
        /* --- cacheline 7 boundary (448 bytes) --- */
        struct net_device *        hsr_dev;              /*   448     8 */
        struct list_head           list;                 /*   456    16 */
        const struct ethtool_ops  * orig_ethtool_ops;    /*   472     8 */
        const struct dsa_netdevice_ops  * netdev_ops;    /*   480     8 */
        struct mutex               addr_lists_lock;      /*   488    32 */
        /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */
        struct list_head           fdbs;                 /*   520    16 */
        struct list_head           mdbs;                 /*   536    16 */

        /* size: 552, cachelines: 9, members: 30 */
        /* sum members: 539, holes: 3, sum holes: 12 */
        /* sum bitfield members: 5 bits, bit holes: 1, sum bit holes: 3 bits */
        /* last cacheline: 40 bytes */
};

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 include/net/dsa.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

Comments

Florian Fainelli Jan. 5, 2022, 6:30 p.m. UTC | #1
On 1/5/22 5:21 AM, Vladimir Oltean wrote:
> struct dsa_port has 5 bool members which create quite a number of 7 byte
> holes in the structure layout. By merging them all into bitfields of an
> u8, and placing that u8 in the 1-byte hole after dp->mac and dp->stp_state,
> we can reduce the structure size from 576 bytes to 552 bytes on arm64.
> 
> Before:
> 
> pahole -C dsa_port net/dsa/slave.o
> struct dsa_port {
>         union {
>                 struct net_device * master;              /*     0     8 */
>                 struct net_device * slave;               /*     0     8 */
>         };                                               /*     0     8 */
>         const struct dsa_device_ops  * tag_ops;          /*     8     8 */
>         struct dsa_switch_tree *   dst;                  /*    16     8 */
>         struct sk_buff *           (*rcv)(struct sk_buff *, struct net_device *); /*    24     8 */
>         enum {
>                 DSA_PORT_TYPE_UNUSED = 0,
>                 DSA_PORT_TYPE_CPU    = 1,
>                 DSA_PORT_TYPE_DSA    = 2,
>                 DSA_PORT_TYPE_USER   = 3,
>         } type;                                          /*    32     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct dsa_switch *        ds;                   /*    40     8 */
>         unsigned int               index;                /*    48     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         const char  *              name;                 /*    56     8 */
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         struct dsa_port *          cpu_dp;               /*    64     8 */
>         u8                         mac[6];               /*    72     6 */
>         u8                         stp_state;            /*    78     1 */
> 
>         /* XXX 1 byte hole, try to pack */
> 
>         struct device_node *       dn;                   /*    80     8 */
>         unsigned int               ageing_time;          /*    88     4 */
>         bool                       vlan_filtering;       /*    92     1 */
>         bool                       learning;             /*    93     1 */
> 
>         /* XXX 2 bytes hole, try to pack */
> 
>         struct dsa_bridge *        bridge;               /*    96     8 */
>         struct devlink_port        devlink_port;         /*   104   288 */
>         /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
>         bool                       devlink_port_setup;   /*   392     1 */
> 
>         /* XXX 7 bytes hole, try to pack */
> 
>         struct phylink *           pl;                   /*   400     8 */
>         struct phylink_config      pl_config;            /*   408    40 */
>         /* --- cacheline 7 boundary (448 bytes) --- */
>         struct net_device *        lag_dev;              /*   448     8 */
>         bool                       lag_tx_enabled;       /*   456     1 */
> 
>         /* XXX 7 bytes hole, try to pack */
> 
>         struct net_device *        hsr_dev;              /*   464     8 */
>         struct list_head           list;                 /*   472    16 */
>         const struct ethtool_ops  * orig_ethtool_ops;    /*   488     8 */
>         const struct dsa_netdevice_ops  * netdev_ops;    /*   496     8 */
>         struct mutex               addr_lists_lock;      /*   504    32 */
>         /* --- cacheline 8 boundary (512 bytes) was 24 bytes ago --- */
>         struct list_head           fdbs;                 /*   536    16 */
>         struct list_head           mdbs;                 /*   552    16 */
>         bool                       setup;                /*   568     1 */
> 
>         /* size: 576, cachelines: 9, members: 30 */
>         /* sum members: 544, holes: 6, sum holes: 25 */
>         /* padding: 7 */
> };
> 
> After:
> 
> pahole -C dsa_port net/dsa/slave.o
> struct dsa_port {
>         union {
>                 struct net_device * master;              /*     0     8 */
>                 struct net_device * slave;               /*     0     8 */
>         };                                               /*     0     8 */
>         const struct dsa_device_ops  * tag_ops;          /*     8     8 */
>         struct dsa_switch_tree *   dst;                  /*    16     8 */
>         struct sk_buff *           (*rcv)(struct sk_buff *, struct net_device *); /*    24     8 */
>         enum {
>                 DSA_PORT_TYPE_UNUSED = 0,
>                 DSA_PORT_TYPE_CPU    = 1,
>                 DSA_PORT_TYPE_DSA    = 2,
>                 DSA_PORT_TYPE_USER   = 3,
>         } type;                                          /*    32     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct dsa_switch *        ds;                   /*    40     8 */
>         unsigned int               index;                /*    48     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         const char  *              name;                 /*    56     8 */
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         struct dsa_port *          cpu_dp;               /*    64     8 */
>         u8                         mac[6];               /*    72     6 */
>         u8                         stp_state;            /*    78     1 */
>         u8                         vlan_filtering:1;     /*    79: 0  1 */
>         u8                         learning:1;           /*    79: 1  1 */
>         u8                         lag_tx_enabled:1;     /*    79: 2  1 */
>         u8                         devlink_port_setup:1; /*    79: 3  1 */
>         u8                         setup:1;              /*    79: 4  1 */
> 
>         /* XXX 3 bits hole, try to pack */
> 
>         struct device_node *       dn;                   /*    80     8 */
>         unsigned int               ageing_time;          /*    88     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct dsa_bridge *        bridge;               /*    96     8 */
>         struct devlink_port        devlink_port;         /*   104   288 */
>         /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */
>         struct phylink *           pl;                   /*   392     8 */
>         struct phylink_config      pl_config;            /*   400    40 */
>         struct net_device *        lag_dev;              /*   440     8 */
>         /* --- cacheline 7 boundary (448 bytes) --- */
>         struct net_device *        hsr_dev;              /*   448     8 */
>         struct list_head           list;                 /*   456    16 */
>         const struct ethtool_ops  * orig_ethtool_ops;    /*   472     8 */
>         const struct dsa_netdevice_ops  * netdev_ops;    /*   480     8 */
>         struct mutex               addr_lists_lock;      /*   488    32 */
>         /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */
>         struct list_head           fdbs;                 /*   520    16 */
>         struct list_head           mdbs;                 /*   536    16 */
> 
>         /* size: 552, cachelines: 9, members: 30 */
>         /* sum members: 539, holes: 3, sum holes: 12 */
>         /* sum bitfield members: 5 bits, bit holes: 1, sum bit holes: 3 bits */
>         /* last cacheline: 40 bytes */
> };
> 
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Vladimir Oltean Jan. 5, 2022, 6:39 p.m. UTC | #2
Hi Florian,

On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

Thanks a lot for the review.

I'm a bit on the fence on this patch and the other one for dsa_switch.
The thing is that bit fields are not atomic in C89, so if we update any
of the flags inside dp or ds concurrently (like dp->vlan_filtering),
we're in trouble. Right now this isn't a problem, because most of the
flags are set either during probe, or during ds->ops->setup, or are
serialized by the rtnl_mutex in ways that are there to stay (switchdev
notifiers). That's why I didn't say anything about it. But it may be a
caveat to watch out for in the future. Do you think we need to do
something about it? A lock would not be necessary, strictly speaking.
Florian Fainelli Jan. 5, 2022, 6:46 p.m. UTC | #3
On 1/5/22 10:39 AM, Vladimir Oltean wrote:
> Hi Florian,
> 
> On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
>> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
> 
> Thanks a lot for the review.
> 
> I'm a bit on the fence on this patch and the other one for dsa_switch.
> The thing is that bit fields are not atomic in C89, so if we update any
> of the flags inside dp or ds concurrently (like dp->vlan_filtering),
> we're in trouble. Right now this isn't a problem, because most of the
> flags are set either during probe, or during ds->ops->setup, or are
> serialized by the rtnl_mutex in ways that are there to stay (switchdev
> notifiers). That's why I didn't say anything about it. But it may be a
> caveat to watch out for in the future. Do you think we need to do
> something about it? A lock would not be necessary, strictly speaking.

I would probably start with a comment that describes these pitfalls, I
wish we had a programmatic way to ensure that these flags would not be
set dynamically and outside of the probe/setup path but that won't
happen easily.

Should we be switching to a bitmask and bitmap helpers to be future proof?
Vladimir Oltean Jan. 5, 2022, 6:56 p.m. UTC | #4
On Wed, Jan 05, 2022 at 10:46:44AM -0800, Florian Fainelli wrote:
> On 1/5/22 10:39 AM, Vladimir Oltean wrote:
> > Hi Florian,
> > 
> > On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
> >> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
> > 
> > Thanks a lot for the review.
> > 
> > I'm a bit on the fence on this patch and the other one for dsa_switch.
> > The thing is that bit fields are not atomic in C89, so if we update any
> > of the flags inside dp or ds concurrently (like dp->vlan_filtering),
> > we're in trouble. Right now this isn't a problem, because most of the
> > flags are set either during probe, or during ds->ops->setup, or are
> > serialized by the rtnl_mutex in ways that are there to stay (switchdev
> > notifiers). That's why I didn't say anything about it. But it may be a
> > caveat to watch out for in the future. Do you think we need to do
> > something about it? A lock would not be necessary, strictly speaking.
> 
> I would probably start with a comment that describes these pitfalls, I
> wish we had a programmatic way to ensure that these flags would not be
> set dynamically and outside of the probe/setup path but that won't
> happen easily.

A comment is probably good.

> Should we be switching to a bitmask and bitmap helpers to be future proof?

We could have done that, and it would certainly raise the awareness a
bit more, but the reason I went with the bit fields at least in the
first place was to reduce the churn in drivers. Otherwise, sure, if
driver changes are on the table for this, we can even discuss about
adding more arguments to dsa_register_switch(). The amount of poking
that drivers do inside struct dsa_switch has grown, and sometimes it's
not even immediately clear which members of that structure are supposed
to be populated by whom and when. We could definitely just tell them to
provide their desired behavior as arguments (vlan_filtering_is_global,
needs_standalone_vlan_filtering, configure_vlan_while_not_filtering,
untag_bridge_pvid, assisted_learning_on_cpu_port, ageing_time_min,
ageing_time_max, phys_mii_mask, num_tx_queues, num_lag_ids, max_num_bridges)
and only DSA will update struct dsa_switch, and we could then control
races better that way. But the downside is that we'd have 11 extra
arguments to dsa_register_switch()....
Florian Fainelli Jan. 5, 2022, 7:42 p.m. UTC | #5
On 1/5/22 10:56 AM, Vladimir Oltean wrote:
> On Wed, Jan 05, 2022 at 10:46:44AM -0800, Florian Fainelli wrote:
>> On 1/5/22 10:39 AM, Vladimir Oltean wrote:
>>> Hi Florian,
>>>
>>> On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
>>>> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
>>>
>>> Thanks a lot for the review.
>>>
>>> I'm a bit on the fence on this patch and the other one for dsa_switch.
>>> The thing is that bit fields are not atomic in C89, so if we update any
>>> of the flags inside dp or ds concurrently (like dp->vlan_filtering),
>>> we're in trouble. Right now this isn't a problem, because most of the
>>> flags are set either during probe, or during ds->ops->setup, or are
>>> serialized by the rtnl_mutex in ways that are there to stay (switchdev
>>> notifiers). That's why I didn't say anything about it. But it may be a
>>> caveat to watch out for in the future. Do you think we need to do
>>> something about it? A lock would not be necessary, strictly speaking.
>>
>> I would probably start with a comment that describes these pitfalls, I
>> wish we had a programmatic way to ensure that these flags would not be
>> set dynamically and outside of the probe/setup path but that won't
>> happen easily.
> 
> A comment is probably good.
> 
>> Should we be switching to a bitmask and bitmap helpers to be future proof?
> 
> We could have done that, and it would certainly raise the awareness a
> bit more, but the reason I went with the bit fields at least in the
> first place was to reduce the churn in drivers. Otherwise, sure, if
> driver changes are on the table for this, we can even discuss about
> adding more arguments to dsa_register_switch(). The amount of poking
> that drivers do inside struct dsa_switch has grown, and sometimes it's
> not even immediately clear which members of that structure are supposed
> to be populated by whom and when. We could definitely just tell them to
> provide their desired behavior as arguments (vlan_filtering_is_global,
> needs_standalone_vlan_filtering, configure_vlan_while_not_filtering,
> untag_bridge_pvid, assisted_learning_on_cpu_port, ageing_time_min,
> ageing_time_max, phys_mii_mask, num_tx_queues, num_lag_ids, max_num_bridges)
> and only DSA will update struct dsa_switch, and we could then control
> races better that way. But the downside is that we'd have 11 extra
> arguments to dsa_register_switch()....

I would not mind if we introduced a dsa_switch::features or similar
bitmap and require driver to set bits in there before
dsa_register_switch() but that seems outside the scope of your patch
set, and I am not sure what the benefits would be unless we do happen to
do dynamic bit/feature toggling.

So for now, I believe a comment is enough, and if we spot a driver
changing that paradigm we consider a more "dynamic" solution.
Andrew Lunn Jan. 5, 2022, 10:10 p.m. UTC | #6
On Wed, Jan 05, 2022 at 06:39:34PM +0000, Vladimir Oltean wrote:
> Hi Florian,
> 
> On Wed, Jan 05, 2022 at 10:30:54AM -0800, Florian Fainelli wrote:
> > Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
> 
> Thanks a lot for the review.
> 
> I'm a bit on the fence on this patch and the other one for dsa_switch.
> The thing is that bit fields are not atomic in C89, so if we update any
> of the flags inside dp or ds concurrently (like dp->vlan_filtering),
> we're in trouble. Right now this isn't a problem, because most of the
> flags are set either during probe, or during ds->ops->setup

I've no idea if it ever got merged, but:

https://lwn.net/Articles/724319/

	Andrew
diff mbox series

Patch

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8878f9ce251b..a8f0037b58e2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -261,18 +261,23 @@  struct dsa_port {
 
 	u8			stp_state;
 
+	u8			vlan_filtering:1,
+				/* Managed by DSA on user ports and by
+				 * drivers on CPU and DSA ports
+				 */
+				learning:1,
+				lag_tx_enabled:1,
+				devlink_port_setup:1,
+				setup:1;
+
 	struct device_node	*dn;
 	unsigned int		ageing_time;
-	bool			vlan_filtering;
-	/* Managed by DSA on user ports and by drivers on CPU and DSA ports */
-	bool			learning;
+
 	struct dsa_bridge	*bridge;
 	struct devlink_port	devlink_port;
-	bool			devlink_port_setup;
 	struct phylink		*pl;
 	struct phylink_config	pl_config;
 	struct net_device	*lag_dev;
-	bool			lag_tx_enabled;
 	struct net_device	*hsr_dev;
 
 	struct list_head list;
@@ -293,8 +298,6 @@  struct dsa_port {
 	struct mutex		addr_lists_lock;
 	struct list_head	fdbs;
 	struct list_head	mdbs;
-
-	bool setup;
 };
 
 /* TODO: ideally DSA ports would have a single dp->link_dp member,