diff mbox series

AW: HSR/PRP sequence counter issue with Cisco Redbox

Message ID e20bb1bd30e9465ea36d26b274b8b2b6@EXCH-SVR2013.eberle.local (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series AW: HSR/PRP sequence counter issue with Cisco Redbox | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Wenzel, Marco Feb. 17, 2021, 1:14 p.m. UTC
On Mon, Feb 15, 2021 at 5:49 PM George McCollister <george.mccollister@gmail.com> wrote:
> 
> On Mon, Feb 15, 2021 at 6:30 AM Wenzel, Marco <Marco.Wenzel@a-
> eberle.de> wrote:
> >
> > > On Wed, Jan 27, 2021 at 6:32 AM Wenzel, Marco <Marco.Wenzel@a-
> > > eberle.de> wrote:
> > > >
> > > > Hi,
> > > >
> > > > we have figured out an issue with the current PRP driver when
> > > > trying to
> > > communicate with Cisco IE 2000 industrial Ethernet switches in
> > > Redbox mode. The Cisco always resets the HSR/PRP sequence counter to
> > > "1" at low traffic (<= 1 frame in 400 ms). It can be reproduced by a
> > > simple ICMP echo request with 1 s interval between a Linux box
> > > running with PRP and a VDAN behind the Cisco Redbox. The Linux box
> > > then always receives frames with sequence counter "1" and drops
> > > them. The behavior is not configurable at the Cisco Redbox.
> > > >
> > > > I fixed it by ignoring sequence counters with value "1" at the
> > > > sequence
> > > counter check in hsr_register_frame_out ():
> > > >
> > > > diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index
> > > > 5c97de459905..630c238e81f0 100644
> > > > --- a/net/hsr/hsr_framereg.c
> > > > +++ b/net/hsr/hsr_framereg.c
> > > > @@ -411,7 +411,7 @@ void hsr_register_frame_in(struct hsr_node
> > > > *node, struct hsr_port *port,  int hsr_register_frame_out(struct
> > > > hsr_port *port,
> > > struct hsr_node *node,
> > > >                            u16 sequence_nr)  {
> > > > -       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port-
> >type]))
> > > > +       if (seq_nr_before_or_eq(sequence_nr,
> > > > + node->seq_out[port->type]) && (sequence_nr != 1))
> > > >                 return 1;
> > > >
> > > >         node->seq_out[port->type] = sequence_nr;
> > > >
> > > >
> > > > Do you think this could be a solution? Should this patch be
> > > > officially applied
> > > in order to avoid other users running into these communication issues?
> > >
> > > This isn't the correct way to solve the problem. IEC 62439-3 defines
> > > EntryForgetTime as "Time after which an entry is removed from the
> > > duplicate table" with a value of 400ms and states devices should
> > > usually be configured to keep entries in the table for a much
> > > shorter time. hsr_framereg.c needs to be reworked to handle this
> according to the specification.
> >
> > Sorry for the delay but I did not have the time to take a closer look at the
> problem until now.
> >
> > My suggestion for the EntryForgetTime feature would be the following: A
> time_out element will be added to the hsr_node structure, which always
> stores the current time when entering hsr_register_frame_out(). If the last
> stored time is older than EntryForgetTime (400 ms) the sequence number
> check will be ignored.
> >
> > diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index
> > 5c97de459905..a97bffbd2581 100644
> > --- a/net/hsr/hsr_framereg.c
> > +++ b/net/hsr/hsr_framereg.c
> > @@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct
> hsr_priv *hsr,
> >          * as initialization. (0 could trigger an spurious ring error warning).
> >          */
> >         now = jiffies;
> > -       for (i = 0; i < HSR_PT_PORTS; i++)
> > +       for (i = 0; i < HSR_PT_PORTS; i++) {
> >                 new_node->time_in[i] = now;
> > +               new_node->time_out[i] = now;
> > +       }
> >         for (i = 0; i < HSR_PT_PORTS; i++)
> >                 new_node->seq_out[i] = seq_out;
> >
> > @@ -411,9 +413,12 @@ void hsr_register_frame_in(struct hsr_node
> *node,
> > struct hsr_port *port,  int hsr_register_frame_out(struct hsr_port *port,
> struct hsr_node *node,
> >                            u16 sequence_nr)  {
> > -       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
> > +       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])
> &&
> > +                time_is_after_jiffies(node->time_out[port->type] +
> > + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) {
> >                 return 1;
> > +       }
> >
> > +       node->time_out[port->type] = jiffies;
> >         node->seq_out[port->type] = sequence_nr;
> >         return 0;
> >  }
> > diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index
> > 86b43f539f2c..d9628e7a5f05 100644
> > --- a/net/hsr/hsr_framereg.h
> > +++ b/net/hsr/hsr_framereg.h
> > @@ -75,6 +75,7 @@ struct hsr_node {
> >         enum hsr_port_type      addr_B_port;
> >         unsigned long           time_in[HSR_PT_PORTS];
> >         bool                    time_in_stale[HSR_PT_PORTS];
> > +       unsigned long           time_out[HSR_PT_PORTS];
> >         /* if the node is a SAN */
> >         bool                    san_a;
> >         bool                    san_b;
> > diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index
> > 7dc92ce5a134..f79ca55d6986 100644
> > --- a/net/hsr/hsr_main.h
> > +++ b/net/hsr/hsr_main.h
> > @@ -21,6 +21,7 @@
> >  #define HSR_LIFE_CHECK_INTERVAL                 2000 /* ms */
> >  #define HSR_NODE_FORGET_TIME           60000 /* ms */
> >  #define HSR_ANNOUNCE_INTERVAL            100 /* ms */
> > +#define HSR_ENTRY_FORGET_TIME            400 /* ms */
> >
> >  /* By how much may slave1 and slave2 timestamps of latest received
> frame from
> >   * each node differ before we notify of communication problem?
> >
> >
> > This approach works fine with the Cisco IE 2000 and I think it implements
> the correct way to handle sequence numbers as defined in IEC 62439-3.
> 
> Looks good to me. Can you send an official patch? If so I'll try it out. Even if I
> can't replicate the Cisco situation I can try it with my setups and make sure it
> doesn't break anything.

I have not been very familiar with kernel patching so far, and I hope that this patch is now correct:


From 8836f1df35a884327da37885ff3ad8bfc5eb933c Mon Sep 17 00:00:00 2001
From: Marco Wenzel <marco.wenzel@a-eberle.de>
Date: Wed, 17 Feb 2021 13:53:31 +0100
Subject: [PATCH] net: hsr: add support for EntryForgetTime

In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a
node does not send any frame within this time, the sequence number check
can be ignored. This solves communication issues with Cisco IE 2000
in Redbox mode.

Signed-off-by: Marco Wenzel <marco.wenzel@a-eberle.de>
---
 net/hsr/hsr_framereg.c | 9 +++++++--
 net/hsr/hsr_framereg.h | 1 +
 net/hsr/hsr_main.h     | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

Comments

George McCollister Feb. 17, 2021, 5:41 p.m. UTC | #1
On Wed, Feb 17, 2021 at 7:14 AM Wenzel, Marco <Marco.Wenzel@a-eberle.de> wrote:
>
> On Mon, Feb 15, 2021 at 5:49 PM George McCollister <george.mccollister@gmail.com> wrote:
> >
> > On Mon, Feb 15, 2021 at 6:30 AM Wenzel, Marco <Marco.Wenzel@a-
> > eberle.de> wrote:
[snip]
>
> I was not so familiar with kernel patching until now and hope that this patch is correct now:

This process is rather confusing and I had trouble with it initially
(I'm still certainly not an expert). Looks like you have most of it
correct but you need to send the patch directly instead of embedding
it in another email. Otherwise it will be lost.
I use the git send-email command to send patches to the mailing list
but I'm sure there are other ways to do it as well. You may want to
run scripts/get_maintainer.pl and CC everyone reported as well.

Here are some more resources:
https://www.kernel.org/doc/html/v5.11/process/submitting-patches.html
https://www.kernel.org/doc/html/latest/networking/netdev-FAQ.html

>
>
> From 8836f1df35a884327da37885ff3ad8bfc5eb933c Mon Sep 17 00:00:00 2001
> From: Marco Wenzel <marco.wenzel@a-eberle.de>
> Date: Wed, 17 Feb 2021 13:53:31 +0100
> Subject: [PATCH] net: hsr: add support for EntryForgetTime
>
> In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a
> node does not send any frame within this time, the sequence number check
> for can be ignored. This solves communication issues with Cisco IE 2000
> in Redbox mode.
>
> Signed-off-by: Marco Wenzel <marco.wenzel@a-eberle.de>
> ---
>  net/hsr/hsr_framereg.c | 9 +++++++--
>  net/hsr/hsr_framereg.h | 1 +
>  net/hsr/hsr_main.h     | 1 +
>  3 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
> index 5c97de459905..805f974923b9 100644
> --- a/net/hsr/hsr_framereg.c
> +++ b/net/hsr/hsr_framereg.c
> @@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
>          * as initialization. (0 could trigger an spurious ring error warning).
>          */
>         now = jiffies;
> -       for (i = 0; i < HSR_PT_PORTS; i++)
> +       for (i = 0; i < HSR_PT_PORTS; i++) {
>                 new_node->time_in[i] = now;
> +               new_node->time_out[i] = now;
> +       }
>         for (i = 0; i < HSR_PT_PORTS; i++)
>                 new_node->seq_out[i] = seq_out;
>
> @@ -411,9 +413,12 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
>  int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
>                            u16 sequence_nr)
>  {
> -       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
> +       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
> +           time_is_after_jiffies(node->time_out[port->type] +
> +           msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)))
>                 return 1;
>
> +       node->time_out[port->type] = jiffies;
>         node->seq_out[port->type] = sequence_nr;
>         return 0;
>  }
> diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
> index 86b43f539f2c..7a120ce3e3db 100644
> --- a/net/hsr/hsr_framereg.h
> +++ b/net/hsr/hsr_framereg.h
> @@ -75,6 +75,7 @@ struct hsr_node {
>         enum hsr_port_type      addr_B_port;
>         unsigned long           time_in[HSR_PT_PORTS];
>         bool                    time_in_stale[HSR_PT_PORTS];
> +       unsigned long     time_out[HSR_PT_PORTS];
>         /* if the node is a SAN */
>         bool                    san_a;
>         bool                    san_b;
> diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
> index 7dc92ce5a134..f79ca55d6986 100644
> --- a/net/hsr/hsr_main.h
> +++ b/net/hsr/hsr_main.h
> @@ -21,6 +21,7 @@
>  #define HSR_LIFE_CHECK_INTERVAL                 2000 /* ms */
>  #define HSR_NODE_FORGET_TIME           60000 /* ms */
>  #define HSR_ANNOUNCE_INTERVAL            100 /* ms */
> +#define HSR_ENTRY_FORGET_TIME            400 /* ms */
>
>  /* By how much may slave1 and slave2 timestamps of latest received frame from
>   * each node differ before we notify of communication problem?
> --
> 2.29.2
>
>
> Regards,
> Marco Wenzel
>
> >
> > Regards,
> > George McCollister
> >
> > >
> > > Regards,
> > > Marco Wenzel
> > >
> > > > >
> > > > > Thanks
> > > > > Marco Wenzel
> > > >
> > > > Regards,
> > > > George McCollister
Andrew Lunn Feb. 17, 2021, 7:51 p.m. UTC | #2
> From 8836f1df35a884327da37885ff3ad8bfc5eb933c Mon Sep 17 00:00:00 2001
> From: Marco Wenzel <marco.wenzel@a-eberle.de>
> Date: Wed, 17 Feb 2021 13:53:31 +0100
> Subject: [PATCH] net: hsr: add support for EntryForgetTime
> 
> In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a
> node does not send any frame within this time, the sequence number check
> for can be ignored. This solves communication issues with Cisco IE 2000
> in Redbox mode.
> 
> Signed-off-by: Marco Wenzel <marco.wenzel@a-eberle.de>

It would be nice to have a Fixes: tag here to indicate which commit
introduced the problem. If it has been broken forever, reference the
commit which added HSR. 

> diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
> index 86b43f539f2c..7a120ce3e3db 100644
> --- a/net/hsr/hsr_framereg.h
> +++ b/net/hsr/hsr_framereg.h
> @@ -75,6 +75,7 @@ struct hsr_node {
>  	enum hsr_port_type	addr_B_port;
>  	unsigned long		time_in[HSR_PT_PORTS];
>  	bool			time_in_stale[HSR_PT_PORTS];
> +	unsigned long	  time_out[HSR_PT_PORTS];

It looks like the indentation is wrong here, and is using a mixture of
tabs and spaces.

     Andrew
diff mbox series

Patch

diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 5c97de459905..805f974923b9 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -164,8 +164,10 @@  static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
 	 * as initialization. (0 could trigger an spurious ring error warning).
 	 */
 	now = jiffies;
-	for (i = 0; i < HSR_PT_PORTS; i++)
+	for (i = 0; i < HSR_PT_PORTS; i++) {
 		new_node->time_in[i] = now;
+		new_node->time_out[i] = now;
+	}
 	for (i = 0; i < HSR_PT_PORTS; i++)
 		new_node->seq_out[i] = seq_out;
 
@@ -411,9 +413,12 @@  void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
 int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
 			   u16 sequence_nr)
 {
-	if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
+	if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
+	    time_is_after_jiffies(node->time_out[port->type] +
+	    msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)))
 		return 1;
 
+	node->time_out[port->type] = jiffies;
 	node->seq_out[port->type] = sequence_nr;
 	return 0;
 }
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 86b43f539f2c..7a120ce3e3db 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -75,6 +75,7 @@  struct hsr_node {
 	enum hsr_port_type	addr_B_port;
 	unsigned long		time_in[HSR_PT_PORTS];
 	bool			time_in_stale[HSR_PT_PORTS];
+	unsigned long	  time_out[HSR_PT_PORTS];
 	/* if the node is a SAN */
 	bool			san_a;
 	bool			san_b;
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 7dc92ce5a134..f79ca55d6986 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -21,6 +21,7 @@ 
 #define HSR_LIFE_CHECK_INTERVAL		 2000 /* ms */
 #define HSR_NODE_FORGET_TIME		60000 /* ms */
 #define HSR_ANNOUNCE_INTERVAL		  100 /* ms */
+#define HSR_ENTRY_FORGET_TIME		  400 /* ms */
 
 /* By how much may slave1 and slave2 timestamps of latest received frame from
  * each node differ before we notify of communication problem?