diff mbox series

[V3,net-next,1/4] net: bridge: add fdb flag to extent locked port feature

Message ID 20220524152144.40527-2-schultz.hans+netdev@gmail.com (mailing list archive)
State Deferred
Delegated to: Netdev Maintainers
Headers show
Series Extend locked port feature with FDB locked flag (MAC-Auth/MAB) | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4771 this patch: 4771
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 1155 this patch: 1155
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4924 this patch: 4924
netdev/checkpatch warning WARNING: From:/Signed-off-by: email subaddress mismatch: 'From: Hans Schultz <schultz.hans@gmail.com>' != 'Signed-off-by: Hans Schultz <schultz.hans+netdev@gmail.com>' WARNING: line length of 100 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0

Commit Message

Hans S May 24, 2022, 3:21 p.m. UTC
Add an intermediate state for clients behind a locked port to allow for
possible opening of the port for said clients. This feature corresponds
to the Mac-Auth and MAC Authentication Bypass (MAB) named features. The
latter defined by Cisco.
Locked FDB entries will be limited in number, so as to prevent DOS
attacks by spamming the port with random entries. The limit will be
a per port limit as it is a port based feature and that the port flushes
all FDB entries on link down.

Only the kernel can set this FDB entry flag, while userspace can read
the flag and remove it by deleting the FDB entry.

Signed-off-by: Hans Schultz <schultz.hans+netdev@gmail.com>
---
 include/uapi/linux/neighbour.h |  1 +
 net/bridge/br_fdb.c            | 11 +++++++++++
 net/bridge/br_if.c             |  1 +
 net/bridge/br_input.c          | 11 ++++++++++-
 net/bridge/br_private.h        |  7 ++++++-
 5 files changed, 29 insertions(+), 2 deletions(-)

Comments

Nikolay Aleksandrov May 24, 2022, 3:39 p.m. UTC | #1
On 24/05/2022 18:21, Hans Schultz wrote:
> Add an intermediate state for clients behind a locked port to allow for
> possible opening of the port for said clients. This feature corresponds
> to the Mac-Auth and MAC Authentication Bypass (MAB) named features. The
> latter defined by Cisco.
> Locked FDB entries will be limited in number, so as to prevent DOS
> attacks by spamming the port with random entries. The limit will be
> a per port limit as it is a port based feature and that the port flushes
> all FDB entries on link down.
> 
> Only the kernel can set this FDB entry flag, while userspace can read
> the flag and remove it by deleting the FDB entry.
> 
> Signed-off-by: Hans Schultz <schultz.hans+netdev@gmail.com>
> ---
>  include/uapi/linux/neighbour.h |  1 +
>  net/bridge/br_fdb.c            | 11 +++++++++++
>  net/bridge/br_if.c             |  1 +
>  net/bridge/br_input.c          | 11 ++++++++++-
>  net/bridge/br_private.h        |  7 ++++++-
>  5 files changed, 29 insertions(+), 2 deletions(-)
> 

Hi Hans,
So this approach has a fundamental problem, f->dst is changed without any synchronization
you cannot rely on it and thus you cannot account for these entries properly. We must be very
careful if we try to add any new synchronization not to affect performance as well.
More below...

> diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
> index 39c565e460c7..76d65b481086 100644
> --- a/include/uapi/linux/neighbour.h
> +++ b/include/uapi/linux/neighbour.h
> @@ -53,6 +53,7 @@ enum {
>  #define NTF_ROUTER	(1 << 7)
>  /* Extended flags under NDA_FLAGS_EXT: */
>  #define NTF_EXT_MANAGED	(1 << 0)
> +#define NTF_EXT_LOCKED	(1 << 1)
>  
>  /*
>   *	Neighbor Cache Entry States.
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index e7f4fccb6adb..6b83e2d6435d 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -105,6 +105,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
>  	struct nda_cacheinfo ci;
>  	struct nlmsghdr *nlh;
>  	struct ndmsg *ndm;
> +	u32 ext_flags = 0;
>  
>  	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
>  	if (nlh == NULL)
> @@ -125,11 +126,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
>  		ndm->ndm_flags |= NTF_EXT_LEARNED;
>  	if (test_bit(BR_FDB_STICKY, &fdb->flags))
>  		ndm->ndm_flags |= NTF_STICKY;
> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &fdb->flags))
> +		ext_flags |= NTF_EXT_LOCKED;
>  
>  	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
>  		goto nla_put_failure;
>  	if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
>  		goto nla_put_failure;
> +	if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
> +		goto nla_put_failure;
> +
>  	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
>  	ci.ndm_confirmed = 0;
>  	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
> @@ -171,6 +177,7 @@ static inline size_t fdb_nlmsg_size(void)
>  	return NLMSG_ALIGN(sizeof(struct ndmsg))
>  		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
>  		+ nla_total_size(sizeof(u32)) /* NDA_MASTER */
> +		+ nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */
>  		+ nla_total_size(sizeof(u16)) /* NDA_VLAN */
>  		+ nla_total_size(sizeof(struct nda_cacheinfo))
>  		+ nla_total_size(0) /* NDA_FDB_EXT_ATTRS */
> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>  		fdb_del_hw_addr(br, f->key.addr.addr);
>  
> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
> +		atomic_dec(&f->dst->locked_entry_cnt);

Sorry but you cannot do this for multiple reasons:
 - f->dst can be NULL
 - f->dst changes without any synchronization
 - there is no synchronization between fdb's flags and its ->dst

Cheers,
 Nik
Hans S May 24, 2022, 4:08 p.m. UTC | #2
>
> Hi Hans,
> So this approach has a fundamental problem, f->dst is changed without any synchronization
> you cannot rely on it and thus you cannot account for these entries properly. We must be very
> careful if we try to add any new synchronization not to affect performance as well.
> More below...
>
>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>  
>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>> +		atomic_dec(&f->dst->locked_entry_cnt);
>
> Sorry but you cannot do this for multiple reasons:
>  - f->dst can be NULL
>  - f->dst changes without any synchronization
>  - there is no synchronization between fdb's flags and its ->dst
>
> Cheers,
>  Nik

Hi Nik,

I could check if f->dst is NULL, but in general this should be able to
work on a per port basis, so do you have an idea of how to keep a per
port counter of added locked fdb entries?

Best,
Hans
Hans S May 24, 2022, 4:21 p.m. UTC | #3
>
> Hi Hans,
> So this approach has a fundamental problem, f->dst is changed without any synchronization
> you cannot rely on it and thus you cannot account for these entries properly. We must be very
> careful if we try to add any new synchronization not to affect performance as well.
> More below...
>
>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>  
>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>> +		atomic_dec(&f->dst->locked_entry_cnt);
>
> Sorry but you cannot do this for multiple reasons:
>  - f->dst can be NULL
>  - f->dst changes without any synchronization
>  - there is no synchronization between fdb's flags and its ->dst
>
> Cheers,
>  Nik

Hi Nik,

if a port is decoupled from the bridge, the locked entries would of
course be invalid, so maybe if adding and removing a port is accounted
for wrt locked entries and the count of locked entries, would that not
work?

Best,
Hans
Nikolay Aleksandrov May 25, 2022, 8:06 a.m. UTC | #4
On 24/05/2022 19:21, Hans Schultz wrote:
>>
>> Hi Hans,
>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>> careful if we try to add any new synchronization not to affect performance as well.
>> More below...
>>
>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>  
>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>
>> Sorry but you cannot do this for multiple reasons:
>>  - f->dst can be NULL
>>  - f->dst changes without any synchronization
>>  - there is no synchronization between fdb's flags and its ->dst
>>
>> Cheers,
>>  Nik
> 
> Hi Nik,
> 
> if a port is decoupled from the bridge, the locked entries would of
> course be invalid, so maybe if adding and removing a port is accounted
> for wrt locked entries and the count of locked entries, would that not
> work?
> 
> Best,
> Hans

Hi Hans,
Unfortunately you need the correct amount of locked entries per-port if you want
to limit their number per-port, instead of globally. So you need a consistent
fdb view with all its attributes when changing its dst in this case, which would
require new locking because you have multiple dependent struct fields and it will
kill roaming/learning scalability. I don't think this use case is worth the complexity it
will bring, so I'd suggest an alternative - you can monitor the number of locked entries
per-port from a user-space agent and disable port learning or some similar solution that
doesn't require any complex kernel changes. Is the limit a requirement to add the feature?

I have an idea how to do it and to minimize the performance hit if it really is needed
but it'll add a lot of complexity which I'd like to avoid if possible.

Cheers,
 Nik
Hans S May 25, 2022, 8:34 a.m. UTC | #5
On ons, maj 25, 2022 at 11:06, Nikolay Aleksandrov <razor@blackwall.org> wrote:
> On 24/05/2022 19:21, Hans Schultz wrote:
>>>
>>> Hi Hans,
>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>>> careful if we try to add any new synchronization not to affect performance as well.
>>> More below...
>>>
>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>>  
>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>>
>>> Sorry but you cannot do this for multiple reasons:
>>>  - f->dst can be NULL
>>>  - f->dst changes without any synchronization
>>>  - there is no synchronization between fdb's flags and its ->dst
>>>
>>> Cheers,
>>>  Nik
>> 
>> Hi Nik,
>> 
>> if a port is decoupled from the bridge, the locked entries would of
>> course be invalid, so maybe if adding and removing a port is accounted
>> for wrt locked entries and the count of locked entries, would that not
>> work?
>> 
>> Best,
>> Hans
>
> Hi Hans,
> Unfortunately you need the correct amount of locked entries per-port if you want
> to limit their number per-port, instead of globally. So you need a
> consistent

Hi Nik,
the used dst is a port structure, so it is per-port and not globally.

Best,
Hans

> fdb view with all its attributes when changing its dst in this case, which would
> require new locking because you have multiple dependent struct fields and it will
> kill roaming/learning scalability. I don't think this use case is worth the complexity it
> will bring, so I'd suggest an alternative - you can monitor the number of locked entries
> per-port from a user-space agent and disable port learning or some similar solution that
> doesn't require any complex kernel changes. Is the limit a requirement to add the feature?
>
> I have an idea how to do it and to minimize the performance hit if it really is needed
> but it'll add a lot of complexity which I'd like to avoid if possible.
>
> Cheers,
>  Nik
Nikolay Aleksandrov May 25, 2022, 8:38 a.m. UTC | #6
On 25/05/2022 11:34, Hans Schultz wrote:
> On ons, maj 25, 2022 at 11:06, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>> On 24/05/2022 19:21, Hans Schultz wrote:
>>>>
>>>> Hi Hans,
>>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>>>> careful if we try to add any new synchronization not to affect performance as well.
>>>> More below...
>>>>
>>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>>>  
>>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>>>
>>>> Sorry but you cannot do this for multiple reasons:
>>>>  - f->dst can be NULL
>>>>  - f->dst changes without any synchronization
>>>>  - there is no synchronization between fdb's flags and its ->dst
>>>>
>>>> Cheers,
>>>>  Nik
>>>
>>> Hi Nik,
>>>
>>> if a port is decoupled from the bridge, the locked entries would of
>>> course be invalid, so maybe if adding and removing a port is accounted
>>> for wrt locked entries and the count of locked entries, would that not
>>> work?
>>>
>>> Best,
>>> Hans
>>
>> Hi Hans,
>> Unfortunately you need the correct amount of locked entries per-port if you want
>> to limit their number per-port, instead of globally. So you need a
>> consistent
> 
> Hi Nik,
> the used dst is a port structure, so it is per-port and not globally.
> 
> Best,
> Hans
> 

Yeah, I know. :) That's why I wrote it, if the limit is not a feature requirement I'd suggest
dropping it altogether, it can be enforced externally (e.g. from user-space) if needed.

By the way just fyi net-next is closed right now due to merge window. And one more
thing please include a short log of changes between versions when you send a new one.
I had to go look for v2 to find out what changed.

>> fdb view with all its attributes when changing its dst in this case, which would
>> require new locking because you have multiple dependent struct fields and it will
>> kill roaming/learning scalability. I don't think this use case is worth the complexity it
>> will bring, so I'd suggest an alternative - you can monitor the number of locked entries
>> per-port from a user-space agent and disable port learning or some similar solution that
>> doesn't require any complex kernel changes. Is the limit a requirement to add the feature?
>>
>> I have an idea how to do it and to minimize the performance hit if it really is needed
>> but it'll add a lot of complexity which I'd like to avoid if possible.
>>
>> Cheers,
>>  Nik
Hans S May 25, 2022, 9:11 a.m. UTC | #7
On ons, maj 25, 2022 at 11:38, Nikolay Aleksandrov <razor@blackwall.org> wrote:
> On 25/05/2022 11:34, Hans Schultz wrote:
>> On ons, maj 25, 2022 at 11:06, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>>> On 24/05/2022 19:21, Hans Schultz wrote:
>>>>>
>>>>> Hi Hans,
>>>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>>>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>>>>> careful if we try to add any new synchronization not to affect performance as well.
>>>>> More below...
>>>>>
>>>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>>>>  
>>>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>>>>
>>>>> Sorry but you cannot do this for multiple reasons:
>>>>>  - f->dst can be NULL
>>>>>  - f->dst changes without any synchronization
>>>>>  - there is no synchronization between fdb's flags and its ->dst
>>>>>
>>>>> Cheers,
>>>>>  Nik
>>>>
>>>> Hi Nik,
>>>>
>>>> if a port is decoupled from the bridge, the locked entries would of
>>>> course be invalid, so maybe if adding and removing a port is accounted
>>>> for wrt locked entries and the count of locked entries, would that not
>>>> work?
>>>>
>>>> Best,
>>>> Hans
>>>
>>> Hi Hans,
>>> Unfortunately you need the correct amount of locked entries per-port if you want
>>> to limit their number per-port, instead of globally. So you need a
>>> consistent
>> 
>> Hi Nik,
>> the used dst is a port structure, so it is per-port and not globally.
>> 
>> Best,
>> Hans
>> 
>
> Yeah, I know. :) That's why I wrote it, if the limit is not a feature requirement I'd suggest
> dropping it altogether, it can be enforced externally (e.g. from user-space) if needed.
>
> By the way just fyi net-next is closed right now due to merge window. And one more
> thing please include a short log of changes between versions when you send a new one.
> I had to go look for v2 to find out what changed.
>

Okay, I will drop the limit in the bridge module, which is an easy thing
to do. :) (It is mostly there to ensure against DOS attacks if someone
bombards a locked port with random mac addresses.)
I have a similar limitation in the driver, which should then probably be
dropped too?

The mayor difference between v2 and v3 is in the mv88e6xxx driver, where
I now keep an inventory of locked ATU entries and remove them based on a
timer (mv88e6xxx_switchcore.c).

I guess the mentioned log should be in the cover letter part?


>>> fdb view with all its attributes when changing its dst in this case, which would
>>> require new locking because you have multiple dependent struct fields and it will
>>> kill roaming/learning scalability. I don't think this use case is worth the complexity it
>>> will bring, so I'd suggest an alternative - you can monitor the number of locked entries
>>> per-port from a user-space agent and disable port learning or some similar solution that
>>> doesn't require any complex kernel changes. Is the limit a requirement to add the feature?
>>>
>>> I have an idea how to do it and to minimize the performance hit if it really is needed
>>> but it'll add a lot of complexity which I'd like to avoid if possible.
>>>
>>> Cheers,
>>>  Nik
Nikolay Aleksandrov May 25, 2022, 10:18 a.m. UTC | #8
On 25/05/2022 12:11, Hans Schultz wrote:
> On ons, maj 25, 2022 at 11:38, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>> On 25/05/2022 11:34, Hans Schultz wrote:
>>> On ons, maj 25, 2022 at 11:06, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>>>> On 24/05/2022 19:21, Hans Schultz wrote:
>>>>>>
>>>>>> Hi Hans,
>>>>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>>>>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>>>>>> careful if we try to add any new synchronization not to affect performance as well.
>>>>>> More below...
>>>>>>
>>>>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>>>>>  
>>>>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>>>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>>>>>
>>>>>> Sorry but you cannot do this for multiple reasons:
>>>>>>  - f->dst can be NULL
>>>>>>  - f->dst changes without any synchronization
>>>>>>  - there is no synchronization between fdb's flags and its ->dst
>>>>>>
>>>>>> Cheers,
>>>>>>  Nik
>>>>>
>>>>> Hi Nik,
>>>>>
>>>>> if a port is decoupled from the bridge, the locked entries would of
>>>>> course be invalid, so maybe if adding and removing a port is accounted
>>>>> for wrt locked entries and the count of locked entries, would that not
>>>>> work?
>>>>>
>>>>> Best,
>>>>> Hans
>>>>
>>>> Hi Hans,
>>>> Unfortunately you need the correct amount of locked entries per-port if you want
>>>> to limit their number per-port, instead of globally. So you need a
>>>> consistent
>>>
>>> Hi Nik,
>>> the used dst is a port structure, so it is per-port and not globally.
>>>
>>> Best,
>>> Hans
>>>
>>
>> Yeah, I know. :) That's why I wrote it, if the limit is not a feature requirement I'd suggest
>> dropping it altogether, it can be enforced externally (e.g. from user-space) if needed.
>>
>> By the way just fyi net-next is closed right now due to merge window. And one more
>> thing please include a short log of changes between versions when you send a new one.
>> I had to go look for v2 to find out what changed.
>>
> 
> Okay, I will drop the limit in the bridge module, which is an easy thing
> to do. :) (It is mostly there to ensure against DOS attacks if someone
> bombards a locked port with random mac addresses.)
> I have a similar limitation in the driver, which should then probably be
> dropped too?
> 

That is up to you/driver, I'd try looking for similar problems in other switch drivers
and check how those were handled. There are people in the CC above that can
directly answer that. :)

> The mayor difference between v2 and v3 is in the mv88e6xxx driver, where
> I now keep an inventory of locked ATU entries and remove them based on a
> timer (mv88e6xxx_switchcore.c).
> 

ack

> I guess the mentioned log should be in the cover letter part?
> 

Yep, usually a short mention of what changed to make it easier for reviewers.
Some people also add the patch-specific changes to each patch under the ---
so they're not included in the log, but I'm fine either way as long as I don't
have to go digging up the old versions.

> 
>>>> fdb view with all its attributes when changing its dst in this case, which would
>>>> require new locking because you have multiple dependent struct fields and it will
>>>> kill roaming/learning scalability. I don't think this use case is worth the complexity it
>>>> will bring, so I'd suggest an alternative - you can monitor the number of locked entries
>>>> per-port from a user-space agent and disable port learning or some similar solution that
>>>> doesn't require any complex kernel changes. Is the limit a requirement to add the feature?
>>>>
>>>> I have an idea how to do it and to minimize the performance hit if it really is needed
>>>> but it'll add a lot of complexity which I'd like to avoid if possible.
>>>>
>>>> Cheers,
>>>>  Nik
Ido Schimmel May 26, 2022, 2:13 p.m. UTC | #9
On Tue, May 24, 2022 at 05:21:41PM +0200, Hans Schultz wrote:
> Add an intermediate state for clients behind a locked port to allow for
> possible opening of the port for said clients. This feature corresponds
> to the Mac-Auth and MAC Authentication Bypass (MAB) named features. The
> latter defined by Cisco.
> Locked FDB entries will be limited in number, so as to prevent DOS
> attacks by spamming the port with random entries. The limit will be
> a per port limit as it is a port based feature and that the port flushes
> all FDB entries on link down.

Why locked FDB entries need a special treatment compared to regular
entries? A port that has learning enabled can be spammed with random
source MACs just as well.

The authorization daemon that is monitoring FDB notifications can have a
policy to shut down a port if the rate / number of locked entries is
above a given threshold.

I don't think this kind of policy belongs in the kernel. If it resides
in user space, then the threshold can be adjusted. Currently it's hard
coded to 64 and I don't see how user space can change or monitor it.
Hans S May 27, 2022, 8:52 a.m. UTC | #10
On tor, maj 26, 2022 at 17:13, Ido Schimmel <idosch@idosch.org> wrote:
> On Tue, May 24, 2022 at 05:21:41PM +0200, Hans Schultz wrote:
>> Add an intermediate state for clients behind a locked port to allow for
>> possible opening of the port for said clients. This feature corresponds
>> to the Mac-Auth and MAC Authentication Bypass (MAB) named features. The
>> latter defined by Cisco.
>> Locked FDB entries will be limited in number, so as to prevent DOS
>> attacks by spamming the port with random entries. The limit will be
>> a per port limit as it is a port based feature and that the port flushes
>> all FDB entries on link down.
>
> Why locked FDB entries need a special treatment compared to regular
> entries? A port that has learning enabled can be spammed with random
> source MACs just as well.
>
> The authorization daemon that is monitoring FDB notifications can have a
> policy to shut down a port if the rate / number of locked entries is
> above a given threshold.
>
> I don't think this kind of policy belongs in the kernel. If it resides
> in user space, then the threshold can be adjusted. Currently it's hard
> coded to 64 and I don't see how user space can change or monitor it.

In the Mac-Auth/MAB context, the locked port feature is really a form of
CPU based learning, and on mv88e6xxx switchcores, this is facilitated by
violation interrupts. Based on miss violation interrupts, the locked
entries are then added to a list with a timer to remove the entries
according to the bridge timeout.
As this is very CPU intensive compared to normal operation, the
assessment is that all this will jam up most devices if bombarded with
random entries at link speed, and my estimate is that any userspace 
daemon that listens to the ensuing fdb events will never get a chance
to stop this flood and eventually the device will lock down/reset. To
prevent this, the limit is introduced.

Ideally this limit could be adjustable from userspace, but in real
use-cases a cap like 64 should be more than enough, as that corresponds
to 64 possible devices behind a port that cannot authenticate by other
means (printers etc.) than having their mac addresses white-listed.

The software bridge behavior was then just set to correspond to the
offloaded behavior, but after correspondence with Nik, the software
bridge locked entries limit will be removed.
Ido Schimmel May 27, 2022, 9:58 a.m. UTC | #11
On Fri, May 27, 2022 at 10:52:27AM +0200, Hans Schultz wrote:
> On tor, maj 26, 2022 at 17:13, Ido Schimmel <idosch@idosch.org> wrote:
> > On Tue, May 24, 2022 at 05:21:41PM +0200, Hans Schultz wrote:
> >> Add an intermediate state for clients behind a locked port to allow for
> >> possible opening of the port for said clients. This feature corresponds
> >> to the Mac-Auth and MAC Authentication Bypass (MAB) named features. The
> >> latter defined by Cisco.
> >> Locked FDB entries will be limited in number, so as to prevent DOS
> >> attacks by spamming the port with random entries. The limit will be
> >> a per port limit as it is a port based feature and that the port flushes
> >> all FDB entries on link down.
> >
> > Why locked FDB entries need a special treatment compared to regular
> > entries? A port that has learning enabled can be spammed with random
> > source MACs just as well.
> >
> > The authorization daemon that is monitoring FDB notifications can have a
> > policy to shut down a port if the rate / number of locked entries is
> > above a given threshold.
> >
> > I don't think this kind of policy belongs in the kernel. If it resides
> > in user space, then the threshold can be adjusted. Currently it's hard
> > coded to 64 and I don't see how user space can change or monitor it.
> 
> In the Mac-Auth/MAB context, the locked port feature is really a form of
> CPU based learning, and on mv88e6xxx switchcores, this is facilitated by
> violation interrupts. Based on miss violation interrupts, the locked
> entries are then added to a list with a timer to remove the entries
> according to the bridge timeout.
> As this is very CPU intensive compared to normal operation, the
> assessment is that all this will jam up most devices if bombarded with
> random entries at link speed, and my estimate is that any userspace 
> daemon that listens to the ensuing fdb events will never get a chance
> to stop this flood and eventually the device will lock down/reset. To
> prevent this, the limit is introduced.
> 
> Ideally this limit could be adjustable from userspace, but in real
> use-cases a cap like 64 should be more than enough, as that corresponds
> to 64 possible devices behind a port that cannot authenticate by other
> means (printers etc.) than having their mac addresses white-listed.
> 
> The software bridge behavior was then just set to correspond to the
> offloaded behavior, but after correspondence with Nik, the software
> bridge locked entries limit will be removed.

As far as the bridge is concerned, locked entries are not really
different from regular learned entries in terms of processing and since
we don't have limits for regular entries I don't think we should have
limits for locked entries.

I do understand the problem you have in mv88e6xxx and I think it would
be wise to hard code a reasonable limit there. It can be adjusted over
time based on feedback and possibly exposed to user space.

Just to give you another data point about how this works in other
devices, I can say that at least in Spectrum this works a bit
differently. Packets that ingress via a locked port and incur an FDB
miss are trapped to the CPU where they should be injected into the Rx
path so that the bridge will create the 'locked' FDB entry and notify it
to user space. The packets are obviously rated limited as the CPU cannot
handle billions of packets per second, unlike the ASIC. The limit is not
per bridge port (or even per bridge), but instead global to the entire
device.
Hans S May 27, 2022, 4 p.m. UTC | #12
>
> As far as the bridge is concerned, locked entries are not really
> different from regular learned entries in terms of processing and since
> we don't have limits for regular entries I don't think we should have
> limits for locked entries.
>
> I do understand the problem you have in mv88e6xxx and I think it would
> be wise to hard code a reasonable limit there. It can be adjusted over
> time based on feedback and possibly exposed to user space.
>
> Just to give you another data point about how this works in other
> devices, I can say that at least in Spectrum this works a bit
> differently. Packets that ingress via a locked port and incur an FDB
> miss are trapped to the CPU where they should be injected into the Rx
> path so that the bridge will create the 'locked' FDB entry and notify it
> to user space. The packets are obviously rated limited as the CPU cannot
> handle billions of packets per second, unlike the ASIC. The limit is not
> per bridge port (or even per bridge), but instead global to the entire
> device.

Ahh, I see. I think that the best is for those FDB entries to be created
as dynamic entries, so that user-space does not have to keep track of
unauthorized entries.
Also the mv88e6xxx chipsets have a 'hold at one' feature, for authorized
entries, so that if a device goes quiet after being authorized the
driver can signal to the software bridge and further to userspace that
an authorized device has gone quiet, and the entry can then be
removed. This though requires user added dynamic entries in the ATU, or
you can call it synthetically 'learned' entries.
Hans S May 31, 2022, 9:34 a.m. UTC | #13
> Just to give you another data point about how this works in other
> devices, I can say that at least in Spectrum this works a bit
> differently. Packets that ingress via a locked port and incur an FDB
> miss are trapped to the CPU where they should be injected into the Rx
> path so that the bridge will create the 'locked' FDB entry and notify it
> to user space. The packets are obviously rated limited as the CPU cannot
> handle billions of packets per second, unlike the ASIC. The limit is not
> per bridge port (or even per bridge), but instead global to the entire
> device.

Btw, will the bridge not create a SWITCHDEV_FDB_ADD_TO_DEVICE event
towards the switchcore in the scheme you mention and thus add an entry
that opens up for the specified mac address?
Ido Schimmel May 31, 2022, 2:23 p.m. UTC | #14
On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
> > Just to give you another data point about how this works in other
> > devices, I can say that at least in Spectrum this works a bit
> > differently. Packets that ingress via a locked port and incur an FDB
> > miss are trapped to the CPU where they should be injected into the Rx
> > path so that the bridge will create the 'locked' FDB entry and notify it
> > to user space. The packets are obviously rated limited as the CPU cannot
> > handle billions of packets per second, unlike the ASIC. The limit is not
> > per bridge port (or even per bridge), but instead global to the entire
> > device.
> 
> Btw, will the bridge not create a SWITCHDEV_FDB_ADD_TO_DEVICE event
> towards the switchcore in the scheme you mention and thus add an entry
> that opens up for the specified mac address?

It will, but the driver needs to ignore FDB entries that are notified
with locked flag. I see that you extended 'struct
switchdev_notifier_fdb_info' with the locked flag, but it's not
initialized in br_switchdev_fdb_populate(). Can you add it in the next
version?
Hans S May 31, 2022, 3:49 p.m. UTC | #15
On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
>> > Just to give you another data point about how this works in other
>> > devices, I can say that at least in Spectrum this works a bit
>> > differently. Packets that ingress via a locked port and incur an FDB
>> > miss are trapped to the CPU where they should be injected into the Rx
>> > path so that the bridge will create the 'locked' FDB entry and notify it
>> > to user space. The packets are obviously rated limited as the CPU cannot
>> > handle billions of packets per second, unlike the ASIC. The limit is not
>> > per bridge port (or even per bridge), but instead global to the entire
>> > device.
>> 
>> Btw, will the bridge not create a SWITCHDEV_FDB_ADD_TO_DEVICE event
>> towards the switchcore in the scheme you mention and thus add an entry
>> that opens up for the specified mac address?
>
> It will, but the driver needs to ignore FDB entries that are notified
> with locked flag. I see that you extended 'struct
> switchdev_notifier_fdb_info' with the locked flag, but it's not
> initialized in br_switchdev_fdb_populate(). Can you add it in the next
> version?

Yes, definitely. I have only had focus on it in the messages coming up
from the driver, and neglected it the other way.
Hans S June 2, 2022, 9:17 a.m. UTC | #16
On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
>> > Just to give you another data point about how this works in other
>> > devices, I can say that at least in Spectrum this works a bit
>> > differently. Packets that ingress via a locked port and incur an FDB
>> > miss are trapped to the CPU where they should be injected into the Rx
>> > path so that the bridge will create the 'locked' FDB entry and notify it
>> > to user space. The packets are obviously rated limited as the CPU cannot
>> > handle billions of packets per second, unlike the ASIC. The limit is not
>> > per bridge port (or even per bridge), but instead global to the entire
>> > device.
>> 
>> Btw, will the bridge not create a SWITCHDEV_FDB_ADD_TO_DEVICE event
>> towards the switchcore in the scheme you mention and thus add an entry
>> that opens up for the specified mac address?
>
> It will, but the driver needs to ignore FDB entries that are notified
> with locked flag. I see that you extended 'struct
> switchdev_notifier_fdb_info' with the locked flag, but it's not
> initialized in br_switchdev_fdb_populate(). Can you add it in the next
> version?

An issue with sending the flag to the driver is that port_fdb_add() is
suddenly getting more and more arguments and getting messy in my
opinion, but maybe that's just how it is...

Another issue is that
bridge fdb add MAC dev DEV master static
seems to add the entry with the SELF flag set, which I don't think is
what we would want it to do or?
Also the replace command is not really supported properly as it is. I
have made a fix for that which looks something like this:

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 6cbb27e3b976..f43aa204f375 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -917,6 +917,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
                if (flags & NLM_F_EXCL)
                        return -EEXIST;
 
+               if (flags & NLM_F_REPLACE)
+                       modified = true;
+
                if (READ_ONCE(fdb->dst) != source) {
                        WRITE_ONCE(fdb->dst, source);
                        modified = true;

The argument for always sending notifications to the driver in the case
of replace is that a replace command will refresh the entries timeout if
the entry is the same. Any thoughts on this?
Nikolay Aleksandrov June 2, 2022, 9:33 a.m. UTC | #17
On 02/06/2022 12:17, Hans Schultz wrote:
> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
>>>> Just to give you another data point about how this works in other
>>>> devices, I can say that at least in Spectrum this works a bit
>>>> differently. Packets that ingress via a locked port and incur an FDB
>>>> miss are trapped to the CPU where they should be injected into the Rx
>>>> path so that the bridge will create the 'locked' FDB entry and notify it
>>>> to user space. The packets are obviously rated limited as the CPU cannot
>>>> handle billions of packets per second, unlike the ASIC. The limit is not
>>>> per bridge port (or even per bridge), but instead global to the entire
>>>> device.
>>>
>>> Btw, will the bridge not create a SWITCHDEV_FDB_ADD_TO_DEVICE event
>>> towards the switchcore in the scheme you mention and thus add an entry
>>> that opens up for the specified mac address?
>>
>> It will, but the driver needs to ignore FDB entries that are notified
>> with locked flag. I see that you extended 'struct
>> switchdev_notifier_fdb_info' with the locked flag, but it's not
>> initialized in br_switchdev_fdb_populate(). Can you add it in the next
>> version?
> 
> An issue with sending the flag to the driver is that port_fdb_add() is
> suddenly getting more and more arguments and getting messy in my
> opinion, but maybe that's just how it is...
> 
> Another issue is that
> bridge fdb add MAC dev DEV master static
> seems to add the entry with the SELF flag set, which I don't think is
> what we would want it to do or?

I don't see such thing (hacked iproute2 to print the flags before cmd):
$ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
flags 0x4

0x4 = NTF_MASTER only

> Also the replace command is not really supported properly as it is. I
> have made a fix for that which looks something like this:
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index 6cbb27e3b976..f43aa204f375 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -917,6 +917,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
>                 if (flags & NLM_F_EXCL)
>                         return -EEXIST;
>  
> +               if (flags & NLM_F_REPLACE)
> +                       modified = true;
> +
>                 if (READ_ONCE(fdb->dst) != source) {
>                         WRITE_ONCE(fdb->dst, source);
>                         modified = true;
> 
> The argument for always sending notifications to the driver in the case
> of replace is that a replace command will refresh the entries timeout if
> the entry is the same. Any thoughts on this?

I don't think so. It always updates its "used" timer, not its "updated" timer which is the one
for expire. A replace that doesn't actually change anything on the entry shouldn't generate
a notification.
Hans S June 2, 2022, 10:17 a.m. UTC | #18
On tor, jun 02, 2022 at 12:33, Nikolay Aleksandrov <razor@blackwall.org> wrote:
> On 02/06/2022 12:17, Hans Schultz wrote:
>> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
>>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:

>> Another issue is that
>> bridge fdb add MAC dev DEV master static
>> seems to add the entry with the SELF flag set, which I don't think is
>> what we would want it to do or?
>
> I don't see such thing (hacked iproute2 to print the flags before cmd):
> $ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
> flags 0x4
>
> 0x4 = NTF_MASTER only
>

I also get 0x4 from iproute2, but I still get SELF entries when I look
with:
bridge fdb show dev DEV

>> Also the replace command is not really supported properly as it is. I
>> have made a fix for that which looks something like this:
>> 
>> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
>> index 6cbb27e3b976..f43aa204f375 100644
>> --- a/net/bridge/br_fdb.c
>> +++ b/net/bridge/br_fdb.c
>> @@ -917,6 +917,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
>>                 if (flags & NLM_F_EXCL)
>>                         return -EEXIST;
>>  
>> +               if (flags & NLM_F_REPLACE)
>> +                       modified = true;
>> +
>>                 if (READ_ONCE(fdb->dst) != source) {
>>                         WRITE_ONCE(fdb->dst, source);
>>                         modified = true;
>> 
>> The argument for always sending notifications to the driver in the case
>> of replace is that a replace command will refresh the entries timeout if
>> the entry is the same. Any thoughts on this?
>
> I don't think so. It always updates its "used" timer, not its "updated" timer which is the one
> for expire. A replace that doesn't actually change anything on the entry shouldn't generate
> a notification.

Okay, so then there is missing checks on flags as the issue arose from
replacing locked entries with dynamic entries. I will do another fix
based on flags as modified needs to be true for the driver to get notified.
Nikolay Aleksandrov June 2, 2022, 10:30 a.m. UTC | #19
On 02/06/2022 13:17, Hans Schultz wrote:
> On tor, jun 02, 2022 at 12:33, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>> On 02/06/2022 12:17, Hans Schultz wrote:
>>> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
>>>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
> 
>>> Another issue is that
>>> bridge fdb add MAC dev DEV master static
>>> seems to add the entry with the SELF flag set, which I don't think is
>>> what we would want it to do or?
>>
>> I don't see such thing (hacked iproute2 to print the flags before cmd):
>> $ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
>> flags 0x4
>>
>> 0x4 = NTF_MASTER only
>>
> 
> I also get 0x4 from iproute2, but I still get SELF entries when I look
> with:
> bridge fdb show dev DEV
> 

after the above add:
$ bridge fdb show dev vnet110 | grep 00:11
00:11:22:33:44:55 master virbr0 static

>>> Also the replace command is not really supported properly as it is. I
>>> have made a fix for that which looks something like this:
>>>
>>> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
>>> index 6cbb27e3b976..f43aa204f375 100644
>>> --- a/net/bridge/br_fdb.c
>>> +++ b/net/bridge/br_fdb.c
>>> @@ -917,6 +917,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
>>>                 if (flags & NLM_F_EXCL)
>>>                         return -EEXIST;
>>>  
>>> +               if (flags & NLM_F_REPLACE)
>>> +                       modified = true;
>>> +
>>>                 if (READ_ONCE(fdb->dst) != source) {
>>>                         WRITE_ONCE(fdb->dst, source);
>>>                         modified = true;
>>>
>>> The argument for always sending notifications to the driver in the case
>>> of replace is that a replace command will refresh the entries timeout if
>>> the entry is the same. Any thoughts on this?
>>
>> I don't think so. It always updates its "used" timer, not its "updated" timer which is the one
>> for expire. A replace that doesn't actually change anything on the entry shouldn't generate
>> a notification.
> 
> Okay, so then there is missing checks on flags as the issue arose from
> replacing locked entries with dynamic entries. I will do another fix
> based on flags as modified needs to be true for the driver to get notified.
Ido Schimmel June 2, 2022, 10:39 a.m. UTC | #20
On Thu, Jun 02, 2022 at 01:30:06PM +0300, Nikolay Aleksandrov wrote:
> On 02/06/2022 13:17, Hans Schultz wrote:
> > On tor, jun 02, 2022 at 12:33, Nikolay Aleksandrov <razor@blackwall.org> wrote:
> >> On 02/06/2022 12:17, Hans Schultz wrote:
> >>> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
> >>>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
> > 
> >>> Another issue is that
> >>> bridge fdb add MAC dev DEV master static
> >>> seems to add the entry with the SELF flag set, which I don't think is
> >>> what we would want it to do or?
> >>
> >> I don't see such thing (hacked iproute2 to print the flags before cmd):
> >> $ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
> >> flags 0x4
> >>
> >> 0x4 = NTF_MASTER only
> >>
> > 
> > I also get 0x4 from iproute2, but I still get SELF entries when I look
> > with:
> > bridge fdb show dev DEV
> > 
> 
> after the above add:
> $ bridge fdb show dev vnet110 | grep 00:11
> 00:11:22:33:44:55 master virbr0 static

I think Hans is testing with mv88e6xxx which dumps entries directly from
HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
NTF_SELF.

Hans, are you seeing the entry twice? Once with 'master' and once with
'self'?

> 
> >>> Also the replace command is not really supported properly as it is. I
> >>> have made a fix for that which looks something like this:
> >>>
> >>> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> >>> index 6cbb27e3b976..f43aa204f375 100644
> >>> --- a/net/bridge/br_fdb.c
> >>> +++ b/net/bridge/br_fdb.c
> >>> @@ -917,6 +917,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
> >>>                 if (flags & NLM_F_EXCL)
> >>>                         return -EEXIST;
> >>>  
> >>> +               if (flags & NLM_F_REPLACE)
> >>> +                       modified = true;
> >>> +
> >>>                 if (READ_ONCE(fdb->dst) != source) {
> >>>                         WRITE_ONCE(fdb->dst, source);
> >>>                         modified = true;
> >>>
> >>> The argument for always sending notifications to the driver in the case
> >>> of replace is that a replace command will refresh the entries timeout if
> >>> the entry is the same. Any thoughts on this?
> >>
> >> I don't think so. It always updates its "used" timer, not its "updated" timer which is the one
> >> for expire. A replace that doesn't actually change anything on the entry shouldn't generate
> >> a notification.
> > 
> > Okay, so then there is missing checks on flags as the issue arose from
> > replacing locked entries with dynamic entries. I will do another fix
> > based on flags as modified needs to be true for the driver to get notified.
>
Hans S June 2, 2022, 11:36 a.m. UTC | #21
On tor, jun 02, 2022 at 13:39, Ido Schimmel <idosch@nvidia.com> wrote:
> On Thu, Jun 02, 2022 at 01:30:06PM +0300, Nikolay Aleksandrov wrote:
>> On 02/06/2022 13:17, Hans Schultz wrote:
>> > On tor, jun 02, 2022 at 12:33, Nikolay Aleksandrov <razor@blackwall.org> wrote:
>> >> On 02/06/2022 12:17, Hans Schultz wrote:
>> >>> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
>> >>>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
>> > 
>> >>> Another issue is that
>> >>> bridge fdb add MAC dev DEV master static
>> >>> seems to add the entry with the SELF flag set, which I don't think is
>> >>> what we would want it to do or?
>> >>
>> >> I don't see such thing (hacked iproute2 to print the flags before cmd):
>> >> $ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
>> >> flags 0x4
>> >>
>> >> 0x4 = NTF_MASTER only
>> >>
>> > 
>> > I also get 0x4 from iproute2, but I still get SELF entries when I look
>> > with:
>> > bridge fdb show dev DEV
>> > 
>> 
>> after the above add:
>> $ bridge fdb show dev vnet110 | grep 00:11
>> 00:11:22:33:44:55 master virbr0 static

>
> I think Hans is testing with mv88e6xxx which dumps entries directly from
> HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
> NTF_SELF.
>
> Hans, are you seeing the entry twice? Once with 'master' and once with
> 'self'?
>

Well yes, but I get some additional entries with 'self' for different
vlans. So from clean adding a random fdb entry I get 4 entries on the
port, 2 with 'master' and two with 'self'.
It looks like this:

# bridge fdb add  00:22:33:44:55:66 dev eth6 master static
# bridge fdb show dev eth6 | grep 55
00:22:33:44:55:66 vlan 1 master br0 offload static
00:22:33:44:55:66 master br0 offload static
00:22:33:44:55:66 vlan 1 self static
00:22:33:44:55:66 vlan 4095 self static

If I do a replace of a locked entry I only get one with the 'self' flag.
Ido Schimmel June 2, 2022, 11:55 a.m. UTC | #22
On Thu, Jun 02, 2022 at 01:36:56PM +0200, Hans Schultz wrote:
> On tor, jun 02, 2022 at 13:39, Ido Schimmel <idosch@nvidia.com> wrote:
> > On Thu, Jun 02, 2022 at 01:30:06PM +0300, Nikolay Aleksandrov wrote:
> >> On 02/06/2022 13:17, Hans Schultz wrote:
> >> > On tor, jun 02, 2022 at 12:33, Nikolay Aleksandrov <razor@blackwall.org> wrote:
> >> >> On 02/06/2022 12:17, Hans Schultz wrote:
> >> >>> On tis, maj 31, 2022 at 17:23, Ido Schimmel <idosch@nvidia.com> wrote:
> >> >>>> On Tue, May 31, 2022 at 11:34:21AM +0200, Hans Schultz wrote:
> >> > 
> >> >>> Another issue is that
> >> >>> bridge fdb add MAC dev DEV master static
> >> >>> seems to add the entry with the SELF flag set, which I don't think is
> >> >>> what we would want it to do or?
> >> >>
> >> >> I don't see such thing (hacked iproute2 to print the flags before cmd):
> >> >> $ bridge fdb add 00:11:22:33:44:55 dev vnet110 master static
> >> >> flags 0x4
> >> >>
> >> >> 0x4 = NTF_MASTER only
> >> >>
> >> > 
> >> > I also get 0x4 from iproute2, but I still get SELF entries when I look
> >> > with:
> >> > bridge fdb show dev DEV
> >> > 
> >> 
> >> after the above add:
> >> $ bridge fdb show dev vnet110 | grep 00:11
> >> 00:11:22:33:44:55 master virbr0 static
> 
> >
> > I think Hans is testing with mv88e6xxx which dumps entries directly from
> > HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
> > NTF_SELF.
> >
> > Hans, are you seeing the entry twice? Once with 'master' and once with
> > 'self'?
> >
> 
> Well yes, but I get some additional entries with 'self' for different
> vlans. So from clean adding a random fdb entry I get 4 entries on the
> port, 2 with 'master' and two with 'self'.
> It looks like this:
> 
> # bridge fdb add  00:22:33:44:55:66 dev eth6 master static
> # bridge fdb show dev eth6 | grep 55
> 00:22:33:44:55:66 vlan 1 master br0 offload static
> 00:22:33:44:55:66 master br0 offload static

These two entries are added by the bridge driver ('master' is set). You
get two entries because you didn't specify a VLAN, so one entry is
installed with VLAN 0 (no VLAN) and the second is installed because VLAN
1 is configured on eth6.

> 00:22:33:44:55:66 vlan 1 self static

This entry is from the HW. It corresponds to the first entry above.

> 00:22:33:44:55:66 vlan 4095 self static

I assume you are using VLAN 4095 for untagged traffic, so this entry
probably corresponds to the second entry above.

> 
> If I do a replace of a locked entry I only get one with the 'self' flag.

IIUC, your driver is adding the entry to the bridge and with a specific
VLAN. So you have one entry reported by the bridge driver and a
corresponding entry in HW.
Hans S June 2, 2022, 12:08 p.m. UTC | #23
>
> I think Hans is testing with mv88e6xxx which dumps entries directly from
> HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
> NTF_SELF.
>
> Hans, are you seeing the entry twice? Once with 'master' and once with
> 'self'?
>

When replacing a locked entry it looks like this:

# bridge fdb show dev eth6 | grep 4c
00:4c:4c:4c:4c:4c vlan 1 master br0 extern_learn offload locked

# bridge fdb replace 00:4c:4c:4c:4c:4c dev eth6 vlan 1 master static ; bridge fdb show dev eth6 | grep 4c
00:4c:4c:4c:4c:4c vlan 1 self static

The problem is then that the function
br_fdb_find_rcu(br,eth_hdr(skb)->h_source, vid);
, where the h_source and vid is the entry above, does not find the entry.
My hypothesis was then that this is because of the 'self' flag that I
see.

I am thinking that the function dsa_slave_port_fdb_do_dump() is only for
debug, and thus does not really set any flags in the bridge modules FDB,
but then I don't understand why the above find function does not find
the entry?
Ido Schimmel June 2, 2022, 12:18 p.m. UTC | #24
On Thu, Jun 02, 2022 at 02:08:41PM +0200, Hans Schultz wrote:
> >
> > I think Hans is testing with mv88e6xxx which dumps entries directly from
> > HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
> > NTF_SELF.
> >
> > Hans, are you seeing the entry twice? Once with 'master' and once with
> > 'self'?
> >
> 
> When replacing a locked entry it looks like this:
> 
> # bridge fdb show dev eth6 | grep 4c
> 00:4c:4c:4c:4c:4c vlan 1 master br0 extern_learn offload locked
> 
> # bridge fdb replace 00:4c:4c:4c:4c:4c dev eth6 vlan 1 master static ; bridge fdb show dev eth6 | grep 4c
> 00:4c:4c:4c:4c:4c vlan 1 self static

This output means that the FDB entry was deleted from the bridge driver
FDB.

> 
> The problem is then that the function
> br_fdb_find_rcu(br,eth_hdr(skb)->h_source, vid);
> , where the h_source and vid is the entry above, does not find the entry.
> My hypothesis was then that this is because of the 'self' flag that I
> see.

br_fdb_find_rcu() does a lookup in the bridge driver FDB, but per the
output above, the entry isn't there for some reason. It's only in HW.

Can it be that you driver is deleting these entries from the bridge
driver FDB via SWITCHDEV_FDB_DEL_TO_BRIDGE for some reason?

> 
> I am thinking that the function dsa_slave_port_fdb_do_dump() is only for
> debug, and thus does not really set any flags in the bridge modules FDB,
> but then I don't understand why the above find function does not find
> the entry?
Hans S June 2, 2022, 1:27 p.m. UTC | #25
Yes, that sounds much like the case. So the replace of course just
modifies the SW fdb entry, and then it just uses port_fdb_add() to
replace HW entry I assume, which then in my case triggers
SWITCHDEV_FDB_DEL_TO_BRIDGE as the locked entry is removed.
So I should not send the SWITCHDEV_FDB_DEL_TO_BRIDGE message when
removing the locked entry from port_fdb_add() function...

(note: having problems with smtp.gmail.com...)


On Thu, Jun 2, 2022 at 2:18 PM Ido Schimmel <idosch@nvidia.com> wrote:
>
> On Thu, Jun 02, 2022 at 02:08:41PM +0200, Hans Schultz wrote:
> > >
> > > I think Hans is testing with mv88e6xxx which dumps entries directly from
> > > HW via ndo_fdb_dump(). See dsa_slave_port_fdb_do_dump() which sets
> > > NTF_SELF.
> > >
> > > Hans, are you seeing the entry twice? Once with 'master' and once with
> > > 'self'?
> > >
> >
> > When replacing a locked entry it looks like this:
> >
> > # bridge fdb show dev eth6 | grep 4c
> > 00:4c:4c:4c:4c:4c vlan 1 master br0 extern_learn offload locked
> >
> > # bridge fdb replace 00:4c:4c:4c:4c:4c dev eth6 vlan 1 master static ; bridge fdb show dev eth6 | grep 4c
> > 00:4c:4c:4c:4c:4c vlan 1 self static
>
> This output means that the FDB entry was deleted from the bridge driver
> FDB.
>
> >
> > The problem is then that the function
> > br_fdb_find_rcu(br,eth_hdr(skb)->h_source, vid);
> > , where the h_source and vid is the entry above, does not find the entry.
> > My hypothesis was then that this is because of the 'self' flag that I
> > see.
>
> br_fdb_find_rcu() does a lookup in the bridge driver FDB, but per the
> output above, the entry isn't there for some reason. It's only in HW.
>
> Can it be that you driver is deleting these entries from the bridge
> driver FDB via SWITCHDEV_FDB_DEL_TO_BRIDGE for some reason?
>
> >
> > I am thinking that the function dsa_slave_port_fdb_do_dump() is only for
> > debug, and thus does not really set any flags in the bridge modules FDB,
> > but then I don't understand why the above find function does not find
> > the entry?
Vladimir Oltean July 6, 2022, 6:13 p.m. UTC | #26
Hi Nikolay,

On Wed, May 25, 2022 at 01:18:49PM +0300, Nikolay Aleksandrov wrote:
> >>>>>> Hi Hans,
> >>>>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
> >>>>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
> >>>>>> careful if we try to add any new synchronization not to affect performance as well.
> >>>>>> More below...
> >>>>>>
> >>>>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
> >>>>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
> >>>>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
> >>>>>>>  
> >>>>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
> >>>>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
> >>>>>>
> >>>>>> Sorry but you cannot do this for multiple reasons:
> >>>>>>  - f->dst can be NULL
> >>>>>>  - f->dst changes without any synchronization
> >>>>>>  - there is no synchronization between fdb's flags and its ->dst
> >>>>>>
> >>>>>> Cheers,
> >>>>>>  Nik
> >>>>>
> >>>>> Hi Nik,
> >>>>>
> >>>>> if a port is decoupled from the bridge, the locked entries would of
> >>>>> course be invalid, so maybe if adding and removing a port is accounted
> >>>>> for wrt locked entries and the count of locked entries, would that not
> >>>>> work?
> >>>>>
> >>>>> Best,
> >>>>> Hans
> >>>>
> >>>> Hi Hans,
> >>>> Unfortunately you need the correct amount of locked entries per-port if you want
> >>>> to limit their number per-port, instead of globally. So you need a
> >>>> consistent
> >>>
> >>> Hi Nik,
> >>> the used dst is a port structure, so it is per-port and not globally.
> >>>
> >>> Best,
> >>> Hans
> >>>
> >>
> >> Yeah, I know. :) That's why I wrote it, if the limit is not a feature requirement I'd suggest
> >> dropping it altogether, it can be enforced externally (e.g. from user-space) if needed.
> >>
> >> By the way just fyi net-next is closed right now due to merge window. And one more
> >> thing please include a short log of changes between versions when you send a new one.
> >> I had to go look for v2 to find out what changed.
> >>
> > 
> > Okay, I will drop the limit in the bridge module, which is an easy thing
> > to do. :) (It is mostly there to ensure against DOS attacks if someone
> > bombards a locked port with random mac addresses.)
> > I have a similar limitation in the driver, which should then probably be
> > dropped too?
> > 
> 
> That is up to you/driver, I'd try looking for similar problems in other switch drivers
> and check how those were handled. There are people in the CC above that can
> directly answer that. :)

Not sure whom you're referring to?

In fact I was pretty sure that I didn't see any OOM protection in the
source code of the Linux bridge driver itself either, so I wanted to
check that for myself, so I wrote a small "killswitch" program that's
supposed to, well, kill a switch. It took me a while to find a few free
hours to do the test, sorry for that.

https://github.com/vladimiroltean/killswitch/blob/master/src/killswitch.c

Sure enough, I can kill a Marvell Armada 3720 device with 1GB of RAM
within 3 minutes of running the test program.

[  273.864203] ksoftirqd/0: page allocation failure: order:0, mode:0x40a20(GFP_ATOMIC|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0
[  273.876426] CPU: 0 PID: 12 Comm: ksoftirqd/0 Not tainted 5.18.7-rc1-00013-g52b92343db13 #74
[  273.884775] Hardware name: CZ.NIC Turris Mox Board (DT)
[  273.889994] Call trace:
[  273.892437]  dump_backtrace.part.0+0xc8/0xd4
[  273.896721]  show_stack+0x18/0x70
[  273.900039]  dump_stack_lvl+0x68/0x84
[  273.903703]  dump_stack+0x18/0x34
[  273.907017]  warn_alloc+0x114/0x1a0
[  273.910508]  __alloc_pages+0xbb0/0xbe0
[  273.914257]  cache_grow_begin+0x60/0x300
[  273.918183]  fallback_alloc+0x184/0x220
[  273.922017]  ____cache_alloc_node+0x174/0x190
[  273.926373]  kmem_cache_alloc+0x1a4/0x220
[  273.930381]  fdb_create+0x40/0x430
[  273.933784]  br_fdb_update+0x198/0x210
[  273.937532]  br_handle_frame_finish+0x244/0x530
[  273.942063]  br_handle_frame+0x1c0/0x270
[  273.945986]  __netif_receive_skb_core.constprop.0+0x29c/0xd30
[  273.951734]  __netif_receive_skb_list_core+0xe8/0x210
[  273.956784]  netif_receive_skb_list_internal+0x180/0x29c
[  273.962091]  napi_gro_receive+0x174/0x190
[  273.966099]  mvneta_rx_swbm+0x6b8/0xb40
[  273.969935]  mvneta_poll+0x684/0x900
[  273.973506]  __napi_poll+0x38/0x18c
[  273.976988]  net_rx_action+0xe8/0x280
[  273.980643]  __do_softirq+0x124/0x2a0
[  273.984299]  run_ksoftirqd+0x4c/0x60
[  273.987871]  smpboot_thread_fn+0x23c/0x270
[  273.991963]  kthread+0x10c/0x110
[  273.995188]  ret_from_fork+0x10/0x20

(followed by lots upon lots of vomiting, followed by ...)

[  311.138590] Out of memory and no killable processes...
[  311.143774] Kernel panic - not syncing: System is deadlocked on memory
[  311.150295] CPU: 0 PID: 6 Comm: kworker/0:0 Not tainted 5.18.7-rc1-00013-g52b92343db13 #74
[  311.158550] Hardware name: CZ.NIC Turris Mox Board (DT)
[  311.163766] Workqueue: events rht_deferred_worker
[  311.168477] Call trace:
[  311.170916]  dump_backtrace.part.0+0xc8/0xd4
[  311.175188]  show_stack+0x18/0x70
[  311.178501]  dump_stack_lvl+0x68/0x84
[  311.182159]  dump_stack+0x18/0x34
[  311.185466]  panic+0x168/0x328
[  311.188515]  out_of_memory+0x568/0x584
[  311.192261]  __alloc_pages+0xb04/0xbe0
[  311.196006]  __alloc_pages_bulk+0x15c/0x604
[  311.200185]  alloc_pages_bulk_array_mempolicy+0xbc/0x24c
[  311.205491]  __vmalloc_node_range+0x238/0x550
[  311.209843]  __vmalloc_node_range+0x1c0/0x550
[  311.214195]  kvmalloc_node+0xe0/0x124
[  311.217856]  bucket_table_alloc.isra.0+0x40/0x150
[  311.222554]  rhashtable_rehash_alloc.isra.0+0x20/0x8c
[  311.227599]  rht_deferred_worker+0x7c/0x540
[  311.231775]  process_one_work+0x1d0/0x320
[  311.235779]  worker_thread+0x70/0x440
[  311.239435]  kthread+0x10c/0x110
[  311.242661]  ret_from_fork+0x10/0x20
[  311.246238] SMP: stopping secondary CPUs
[  311.250161] Kernel Offset: disabled
[  311.253642] CPU features: 0x000,00020009,00001086
[  311.258338] Memory Limit: none
[  311.261390] ---[ end Kernel panic - not syncing: System is deadlocked on memory ]---

That can't be quite alright? Shouldn't we have some sort of protection
in the bridge itself too, not just tell hardware driver writers to deal
with it? Or is it somewhere, but it needs to be enabled/configured?
Nikolay Aleksandrov July 6, 2022, 7:38 p.m. UTC | #27
On 06/07/2022 21:13, Vladimir Oltean wrote:
> Hi Nikolay,
> 
> On Wed, May 25, 2022 at 01:18:49PM +0300, Nikolay Aleksandrov wrote:
>>>>>>>> Hi Hans,
>>>>>>>> So this approach has a fundamental problem, f->dst is changed without any synchronization
>>>>>>>> you cannot rely on it and thus you cannot account for these entries properly. We must be very
>>>>>>>> careful if we try to add any new synchronization not to affect performance as well.
>>>>>>>> More below...
>>>>>>>>
>>>>>>>>> @@ -319,6 +326,9 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
>>>>>>>>>  	if (test_bit(BR_FDB_STATIC, &f->flags))
>>>>>>>>>  		fdb_del_hw_addr(br, f->key.addr.addr);
>>>>>>>>>  
>>>>>>>>> +	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
>>>>>>>>> +		atomic_dec(&f->dst->locked_entry_cnt);
>>>>>>>>
>>>>>>>> Sorry but you cannot do this for multiple reasons:
>>>>>>>>  - f->dst can be NULL
>>>>>>>>  - f->dst changes without any synchronization
>>>>>>>>  - there is no synchronization between fdb's flags and its ->dst
>>>>>>>>
>>>>>>>> Cheers,
>>>>>>>>  Nik
>>>>>>>
>>>>>>> Hi Nik,
>>>>>>>
>>>>>>> if a port is decoupled from the bridge, the locked entries would of
>>>>>>> course be invalid, so maybe if adding and removing a port is accounted
>>>>>>> for wrt locked entries and the count of locked entries, would that not
>>>>>>> work?
>>>>>>>
>>>>>>> Best,
>>>>>>> Hans
>>>>>>
>>>>>> Hi Hans,
>>>>>> Unfortunately you need the correct amount of locked entries per-port if you want
>>>>>> to limit their number per-port, instead of globally. So you need a
>>>>>> consistent
>>>>>
>>>>> Hi Nik,
>>>>> the used dst is a port structure, so it is per-port and not globally.
>>>>>
>>>>> Best,
>>>>> Hans
>>>>>
>>>>
>>>> Yeah, I know. :) That's why I wrote it, if the limit is not a feature requirement I'd suggest
>>>> dropping it altogether, it can be enforced externally (e.g. from user-space) if needed.
>>>>
>>>> By the way just fyi net-next is closed right now due to merge window. And one more
>>>> thing please include a short log of changes between versions when you send a new one.
>>>> I had to go look for v2 to find out what changed.
>>>>
>>>
>>> Okay, I will drop the limit in the bridge module, which is an easy thing
>>> to do. :) (It is mostly there to ensure against DOS attacks if someone
>>> bombards a locked port with random mac addresses.)
>>> I have a similar limitation in the driver, which should then probably be
>>> dropped too?
>>>
>>
>> That is up to you/driver, I'd try looking for similar problems in other switch drivers
>> and check how those were handled. There are people in the CC above that can
>> directly answer that. :)
> 
> Not sure whom you're referring to?

I meant people who have dealt with hardware resource management in the drivers.

> 
> In fact I was pretty sure that I didn't see any OOM protection in the
> source code of the Linux bridge driver itself either, so I wanted to
> check that for myself, so I wrote a small "killswitch" program that's
> supposed to, well, kill a switch. It took me a while to find a few free
> hours to do the test, sorry for that.
> 
> https://github.com/vladimiroltean/killswitch/blob/master/src/killswitch.c
> 
> Sure enough, I can kill a Marvell Armada 3720 device with 1GB of RAM
> within 3 minutes of running the test program.
> 

I don't think that is new or surprising, if there isn't anything to control the
device resources you'll get there. You don't really need to write any new programs
you can easily do it with mausezahn. I have tests that add over 10 million fdbs on
devices for a few seconds.

The point is it's not the bridge's task to limit memory consumption or to watch for resource
management. You can limit new entries from the device driver (in case of swdev learning) or
you can use a daemon to watch the number of entries and disable learning. There are many
different ways to avoid this. We've discussed it before and I don't mind adding a hard fdb
per-port limit in the bridge as long as it's done properly. We've also discussed LRU and similar
algorithms for fdb learning and eviction. But any hardcoded limits or limits that can break
current default use cases are unacceptable, they must be opt-in.

> [  273.864203] ksoftirqd/0: page allocation failure: order:0, mode:0x40a20(GFP_ATOMIC|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0
> [  273.876426] CPU: 0 PID: 12 Comm: ksoftirqd/0 Not tainted 5.18.7-rc1-00013-g52b92343db13 #74
> [  273.884775] Hardware name: CZ.NIC Turris Mox Board (DT)
> [  273.889994] Call trace:
> [  273.892437]  dump_backtrace.part.0+0xc8/0xd4
> [  273.896721]  show_stack+0x18/0x70
> [  273.900039]  dump_stack_lvl+0x68/0x84
> [  273.903703]  dump_stack+0x18/0x34
> [  273.907017]  warn_alloc+0x114/0x1a0
> [  273.910508]  __alloc_pages+0xbb0/0xbe0
> [  273.914257]  cache_grow_begin+0x60/0x300
> [  273.918183]  fallback_alloc+0x184/0x220
> [  273.922017]  ____cache_alloc_node+0x174/0x190
> [  273.926373]  kmem_cache_alloc+0x1a4/0x220
> [  273.930381]  fdb_create+0x40/0x430
> [  273.933784]  br_fdb_update+0x198/0x210
> [  273.937532]  br_handle_frame_finish+0x244/0x530
> [  273.942063]  br_handle_frame+0x1c0/0x270
> [  273.945986]  __netif_receive_skb_core.constprop.0+0x29c/0xd30
> [  273.951734]  __netif_receive_skb_list_core+0xe8/0x210
> [  273.956784]  netif_receive_skb_list_internal+0x180/0x29c
> [  273.962091]  napi_gro_receive+0x174/0x190
> [  273.966099]  mvneta_rx_swbm+0x6b8/0xb40
> [  273.969935]  mvneta_poll+0x684/0x900
> [  273.973506]  __napi_poll+0x38/0x18c
> [  273.976988]  net_rx_action+0xe8/0x280
> [  273.980643]  __do_softirq+0x124/0x2a0
> [  273.984299]  run_ksoftirqd+0x4c/0x60
> [  273.987871]  smpboot_thread_fn+0x23c/0x270
> [  273.991963]  kthread+0x10c/0x110
> [  273.995188]  ret_from_fork+0x10/0x20
> 
> (followed by lots upon lots of vomiting, followed by ...)
> 
> [  311.138590] Out of memory and no killable processes...
> [  311.143774] Kernel panic - not syncing: System is deadlocked on memory
> [  311.150295] CPU: 0 PID: 6 Comm: kworker/0:0 Not tainted 5.18.7-rc1-00013-g52b92343db13 #74
> [  311.158550] Hardware name: CZ.NIC Turris Mox Board (DT)
> [  311.163766] Workqueue: events rht_deferred_worker
> [  311.168477] Call trace:
> [  311.170916]  dump_backtrace.part.0+0xc8/0xd4
> [  311.175188]  show_stack+0x18/0x70
> [  311.178501]  dump_stack_lvl+0x68/0x84
> [  311.182159]  dump_stack+0x18/0x34
> [  311.185466]  panic+0x168/0x328
> [  311.188515]  out_of_memory+0x568/0x584
> [  311.192261]  __alloc_pages+0xb04/0xbe0
> [  311.196006]  __alloc_pages_bulk+0x15c/0x604
> [  311.200185]  alloc_pages_bulk_array_mempolicy+0xbc/0x24c
> [  311.205491]  __vmalloc_node_range+0x238/0x550
> [  311.209843]  __vmalloc_node_range+0x1c0/0x550
> [  311.214195]  kvmalloc_node+0xe0/0x124
> [  311.217856]  bucket_table_alloc.isra.0+0x40/0x150
> [  311.222554]  rhashtable_rehash_alloc.isra.0+0x20/0x8c
> [  311.227599]  rht_deferred_worker+0x7c/0x540
> [  311.231775]  process_one_work+0x1d0/0x320
> [  311.235779]  worker_thread+0x70/0x440
> [  311.239435]  kthread+0x10c/0x110
> [  311.242661]  ret_from_fork+0x10/0x20
> [  311.246238] SMP: stopping secondary CPUs
> [  311.250161] Kernel Offset: disabled
> [  311.253642] CPU features: 0x000,00020009,00001086
> [  311.258338] Memory Limit: none
> [  311.261390] ---[ end Kernel panic - not syncing: System is deadlocked on memory ]---
> 
> That can't be quite alright? Shouldn't we have some sort of protection
> in the bridge itself too, not just tell hardware driver writers to deal
> with it? Or is it somewhere, but it needs to be enabled/configured?

This is expected, if you'd like feel free to add a hard learning limit in the driver
and the bridge (again if implemented properly).
Nothing can save you if someone has L2 access to the device, they can poison any
table if learning is enabled.
Vladimir Oltean July 6, 2022, 8:21 p.m. UTC | #28
On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
> I don't think that is new or surprising, if there isn't anything to control the
> device resources you'll get there. You don't really need to write any new programs
> you can easily do it with mausezahn. I have tests that add over 10 million fdbs on
> devices for a few seconds.

Of course it isn't new, but that doesn't make the situation in any way better,
quite the opposite...

> The point is it's not the bridge's task to limit memory consumption or to watch for resource
> management. You can limit new entries from the device driver (in case of swdev learning) or
> you can use a daemon to watch the number of entries and disable learning. There are many
> different ways to avoid this. We've discussed it before and I don't mind adding a hard fdb
> per-port limit in the bridge as long as it's done properly. We've also discussed LRU and similar
> algorithms for fdb learning and eviction. But any hardcoded limits or limits that can break
> current default use cases are unacceptable, they must be opt-in.

I don't think you can really say that it's not the bridge's task to
limit memory consumption when what it does is essentially allocate
memory from untrusted and unbounded user input, in kernel softirq
context.

That's in fact the problem, the kernel OOM killer will kick in, but
there will be no process to kill. This is why the kernel deadlocks on
memory and dies.

Maybe where our expectations differ is that I believe that a Linux
bridge shouldn't need gazillions of tweaks to not kill the kernel?
There are many devices in production using a bridge without such
configuration, you can't just make it opt-in.

Of course, performance under heavy stress is a separate concern, and
maybe user space monitoring would be a better idea for that.

I know you changed jobs, but did Cumulus Linux have an application to
monitor and limit the FDB entry count? Is there some standard
application which does this somewhere, or does everybody roll their own?

Anyway, limiting FDB entry count from user space is still theoretically
different from not dying. If you need to schedule a task to dispose of
the weight while the ship is sinking from softirq context, you may never
get to actually schedule that task in time. AFAIK the bridge UAPI doesn't
expose a pre-programmed limit, so what needs to be done is for user
space to manually delete entries until the count falls below the limit.
Nikolay Aleksandrov July 6, 2022, 9:01 p.m. UTC | #29
On 06/07/2022 23:21, Vladimir Oltean wrote:
> On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
>> I don't think that is new or surprising, if there isn't anything to control the
>> device resources you'll get there. You don't really need to write any new programs
>> you can easily do it with mausezahn. I have tests that add over 10 million fdbs on
>> devices for a few seconds.
> 
> Of course it isn't new, but that doesn't make the situation in any way better,
> quite the opposite...
> 
>> The point is it's not the bridge's task to limit memory consumption or to watch for resource
>> management. You can limit new entries from the device driver (in case of swdev learning) or
>> you can use a daemon to watch the number of entries and disable learning. There are many
>> different ways to avoid this. We've discussed it before and I don't mind adding a hard fdb
>> per-port limit in the bridge as long as it's done properly. We've also discussed LRU and similar
>> algorithms for fdb learning and eviction. But any hardcoded limits or limits that can break
>> current default use cases are unacceptable, they must be opt-in.
> 
> I don't think you can really say that it's not the bridge's task to
> limit memory consumption when what it does is essentially allocate
> memory from untrusted and unbounded user input, in kernel softirq
> context.
> 
> That's in fact the problem, the kernel OOM killer will kick in, but
> there will be no process to kill. This is why the kernel deadlocks on
> memory and dies.
> 
> Maybe where our expectations differ is that I believe that a Linux
> bridge shouldn't need gazillions of tweaks to not kill the kernel?
> There are many devices in production using a bridge without such
> configuration, you can't just make it opt-in.
> 

No, you cannot suddenly enforce such limit because such limit cannot work for everyone.
There is no silver bullet that works for everyone. Opt-in is the only way to go
about this with specific config for different devices and deployments, anyone
interested can set their limits. They can be auto-adjusted by swdev drivers
after that if necessary, but first they must be implemented in software.

If you're interested in adding default limits based on memory heuristics and consumption
I'd be interested to see it.

> Of course, performance under heavy stress is a separate concern, and
> maybe user space monitoring would be a better idea for that.
> 

You can do the whole software learning from user-space if needed, not only under heavy stress.

> I know you changed jobs, but did Cumulus Linux have an application to
> monitor and limit the FDB entry count? Is there some standard
> application which does this somewhere, or does everybody roll their own?
> 

I don't see how that is relevant.

> Anyway, limiting FDB entry count from user space is still theoretically
> different from not dying. If you need to schedule a task to dispose of

you can disable learning altogether and add entries from a user-space daemon, ie
implement complete user-space learning agent, theoretically you can solve it in
many ways if that's the problem

> the weight while the ship is sinking from softirq context, you may never
> get to actually schedule that task in time. AFAIK the bridge UAPI doesn't
> expose a pre-programmed limit, so what needs to be done is for user
> space to manually delete entries until the count falls below the limit.

That is a single case speculation, it depends on how it was implemented in the first place. You
can disable learning and have more than enough time to deal with it.

I already said it's ok to add hard configurable limits if they're done properly performance-wise.
Any distribution can choose to set some default limits after the option exists.
Nikolay Aleksandrov July 7, 2022, 2:08 p.m. UTC | #30
On 07/07/2022 00:01, Nikolay Aleksandrov wrote:
> On 06/07/2022 23:21, Vladimir Oltean wrote:
>> On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
[snip]
> I already said it's ok to add hard configurable limits if they're done properly performance-wise.
> Any distribution can choose to set some default limits after the option exists.
> 

Just fyi, and to avoid duplicate efforts, I already have patches for global and per-port software
fdb limits that I'll polish and submit soon (depending on time availability, of course). If I find
more time I might add per-vlan limits as well to the set. They use embedded netlink attributes
to config and dump, so we can easily extend them later (e.g. different action on limit hit, limit
statistics etc).
Vladimir Oltean July 7, 2022, 5:15 p.m. UTC | #31
Hi Nikolay,

On Thu, Jul 07, 2022 at 05:08:15PM +0300, Nikolay Aleksandrov wrote:
> On 07/07/2022 00:01, Nikolay Aleksandrov wrote:
> > On 06/07/2022 23:21, Vladimir Oltean wrote:
> >> On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
> [snip]
> > I already said it's ok to add hard configurable limits if they're done properly performance-wise.
> > Any distribution can choose to set some default limits after the option exists.
> > 
> 
> Just fyi, and to avoid duplicate efforts, I already have patches for global and per-port software
> fdb limits that I'll polish and submit soon (depending on time availability, of course). If I find
> more time I might add per-vlan limits as well to the set. They use embedded netlink attributes
> to config and dump, so we can easily extend them later (e.g. different action on limit hit, limit
> statistics etc).

So again, to repeat myself, it's nice to have limits on FDB size, but
those won't fix the software bridges that are now out in the open and
can't have their configuration scripts changed.

I haven't had the time to expand on this in a proper change yet, but I
was thinking more along the lines of adding an OOM handler with
register_oom_notifier() in br_fdb_init(), and on OOM, do something, like
flush the FDB from all bridges. There are going to be complications, it
will schedule switchdev, switchdev is going to allocate memory which
we're low on, the workqueues aren't created with WQ_MEM_RECLAIM, so this
isn't necessarily going to be a silver bullet either. But this is what
concerns me the most, the unconfigured bridge killing the kernel so
easily. As you can see, with an OOM handler I'm not so much trying to
impose a fixed limit on FDB size, but do something sensible such that
the bridge doesn't contribute to the kernel dying.
Nikolay Aleksandrov July 7, 2022, 5:26 p.m. UTC | #32
On 7 July 2022 20:15:07 EEST, Vladimir Oltean <olteanv@gmail.com> wrote:
>Hi Nikolay,
>
>On Thu, Jul 07, 2022 at 05:08:15PM +0300, Nikolay Aleksandrov wrote:
>> On 07/07/2022 00:01, Nikolay Aleksandrov wrote:
>> > On 06/07/2022 23:21, Vladimir Oltean wrote:
>> >> On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
>> [snip]
>> > I already said it's ok to add hard configurable limits if they're done properly performance-wise.
>> > Any distribution can choose to set some default limits after the option exists.
>> > 
>> 
>> Just fyi, and to avoid duplicate efforts, I already have patches for global and per-port software
>> fdb limits that I'll polish and submit soon (depending on time availability, of course). If I find
>> more time I might add per-vlan limits as well to the set. They use embedded netlink attributes
>> to config and dump, so we can easily extend them later (e.g. different action on limit hit, limit
>> statistics etc).
>
>So again, to repeat myself, it's nice to have limits on FDB size, but
>those won't fix the software bridges that are now out in the open and
>can't have their configuration scripts changed.
>
>I haven't had the time to expand on this in a proper change yet, but I
>was thinking more along the lines of adding an OOM handler with
>register_oom_notifier() in br_fdb_init(), and on OOM, do something, like
>flush the FDB from all bridges. There are going to be complications, it
>will schedule switchdev, switchdev is going to allocate memory which
>we're low on, the workqueues aren't created with WQ_MEM_RECLAIM, so this
>isn't necessarily going to be a silver bullet either. But this is what
>concerns me the most, the unconfigured bridge killing the kernel so
>easily. As you can see, with an OOM handler I'm not so much trying to
>impose a fixed limit on FDB size, but do something sensible such that
>the bridge doesn't contribute to the kernel dying.

Hi Vladimir,
Sounds good to me, the fdb limits have come up multiple times in the past so I decided 
to finally add them and build from there, with them configured oom shouldn't be hit.
These limits have never been present and people are fine (everyone deals with or leaves it), but I'll be happy to review and ack such changes. I hope you can correlate the oom and the bridge fdbs, not
just blindly flushing as that can be problematic if you plan to have it enabled by default.

Cheers,
  Nik
Hans S July 8, 2022, 6:38 a.m. UTC | #33
On Thu, Jul 7, 2022 at 4:08 PM Nikolay Aleksandrov <razor@blackwall.org> wrote:
>
> On 07/07/2022 00:01, Nikolay Aleksandrov wrote:
> > On 06/07/2022 23:21, Vladimir Oltean wrote:
> >> On Wed, Jul 06, 2022 at 10:38:04PM +0300, Nikolay Aleksandrov wrote:
> [snip]
> > I already said it's ok to add hard configurable limits if they're done properly performance-wise.
> > Any distribution can choose to set some default limits after the option exists.
> >
>
> Just fyi, and to avoid duplicate efforts, I already have patches for global and per-port software
> fdb limits that I'll polish and submit soon (depending on time availability, of course). If I find
> more time I might add per-vlan limits as well to the set. They use embedded netlink attributes
> to config and dump, so we can easily extend them later (e.g. different action on limit hit, limit
> statistics etc).
>

Sounds good, I will just limit the number of locked entries in the
driver as they are not controllable from the bridge. :-)
diff mbox series

Patch

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 39c565e460c7..76d65b481086 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -53,6 +53,7 @@  enum {
 #define NTF_ROUTER	(1 << 7)
 /* Extended flags under NDA_FLAGS_EXT: */
 #define NTF_EXT_MANAGED	(1 << 0)
+#define NTF_EXT_LOCKED	(1 << 1)
 
 /*
  *	Neighbor Cache Entry States.
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e7f4fccb6adb..6b83e2d6435d 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -105,6 +105,7 @@  static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	struct nda_cacheinfo ci;
 	struct nlmsghdr *nlh;
 	struct ndmsg *ndm;
+	u32 ext_flags = 0;
 
 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
 	if (nlh == NULL)
@@ -125,11 +126,16 @@  static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 		ndm->ndm_flags |= NTF_EXT_LEARNED;
 	if (test_bit(BR_FDB_STICKY, &fdb->flags))
 		ndm->ndm_flags |= NTF_STICKY;
+	if (test_bit(BR_FDB_ENTRY_LOCKED, &fdb->flags))
+		ext_flags |= NTF_EXT_LOCKED;
 
 	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
 		goto nla_put_failure;
 	if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
 		goto nla_put_failure;
+	if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
+		goto nla_put_failure;
+
 	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
 	ci.ndm_confirmed = 0;
 	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
@@ -171,6 +177,7 @@  static inline size_t fdb_nlmsg_size(void)
 	return NLMSG_ALIGN(sizeof(struct ndmsg))
 		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
 		+ nla_total_size(sizeof(u32)) /* NDA_MASTER */
+		+ nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */
 		+ nla_total_size(sizeof(u16)) /* NDA_VLAN */
 		+ nla_total_size(sizeof(struct nda_cacheinfo))
 		+ nla_total_size(0) /* NDA_FDB_EXT_ATTRS */
@@ -319,6 +326,9 @@  static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
 	if (test_bit(BR_FDB_STATIC, &f->flags))
 		fdb_del_hw_addr(br, f->key.addr.addr);
 
+	if (test_bit(BR_FDB_ENTRY_LOCKED, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags))
+		atomic_dec(&f->dst->locked_entry_cnt);
+
 	hlist_del_init_rcu(&f->fdb_node);
 	rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
 			       br_fdb_rht_params);
@@ -1086,6 +1096,7 @@  static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		modified = true;
 
 	set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+	clear_bit(BR_FDB_ENTRY_LOCKED, &fdb->flags);
 
 	fdb->used = jiffies;
 	if (modified) {
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 47fcbade7389..0ca04cba5ebe 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -429,6 +429,7 @@  static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
 	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+	p->locked_entry_cnt.counter = 0;
 	br_init_port(p);
 	br_set_state(p, BR_STATE_DISABLED);
 	br_stp_port_timer_init(p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 68b3e850bcb9..0280806cf980 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -110,8 +110,17 @@  int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 			br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);
 
 		if (!fdb_src || READ_ONCE(fdb_src->dst) != p ||
-		    test_bit(BR_FDB_LOCAL, &fdb_src->flags))
+		    test_bit(BR_FDB_LOCAL, &fdb_src->flags) ||
+		    test_bit(BR_FDB_ENTRY_LOCKED, &fdb_src->flags)) {
+			if (!fdb_src && atomic_read(&p->locked_entry_cnt) < BR_LOCKED_ENTRIES_MAX) {
+				unsigned long flags = 0;
+
+				__set_bit(BR_FDB_ENTRY_LOCKED, &flags);
+				br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, flags);
+				atomic_inc(&p->locked_entry_cnt);
+			}
 			goto drop;
+		}
 	}
 
 	nbp_switchdev_frame_mark(p, skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 06e5f6faa431..be17c99efe65 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -31,6 +31,8 @@ 
 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000)
 #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN
 
+#define BR_LOCKED_ENTRIES_MAX	64
+
 #define BR_HWDOM_MAX BITS_PER_LONG
 
 #define BR_VERSION	"2.3"
@@ -251,7 +253,8 @@  enum {
 	BR_FDB_ADDED_BY_EXT_LEARN,
 	BR_FDB_OFFLOADED,
 	BR_FDB_NOTIFY,
-	BR_FDB_NOTIFY_INACTIVE
+	BR_FDB_NOTIFY_INACTIVE,
+	BR_FDB_ENTRY_LOCKED,
 };
 
 struct net_bridge_fdb_key {
@@ -414,6 +417,8 @@  struct net_bridge_port {
 	u16				backup_redirected_cnt;
 
 	struct bridge_stp_xstats	stp_xstats;
+
+	atomic_t			locked_entry_cnt;
 };
 
 #define kobj_to_brport(obj)	container_of(obj, struct net_bridge_port, kobj)