diff mbox series

ARM: mvebu: Enable MBUS error propagation

Message ID 20191124093529.32399-1-chris.packham@alliedtelesis.co.nz (mailing list archive)
State New, archived
Headers show
Series ARM: mvebu: Enable MBUS error propagation | expand

Commit Message

Chris Packham Nov. 24, 2019, 9:35 a.m. UTC
U-boot disables MBUS error propagation for Armada-385. The effect of
this on the kernel is that any access to a mapped but inaccessible
address causes the system to hang.

By enabling MBUS error propagation the kernel can raise a Bus Error and
panic to restart the system.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
---

Notes:
    We've encountered an issue where rogue accesses to PCI-e space cause an
    Armada-385 system to lockup. We've found that enabling MBUS error
    propagation lets us get a bus error which at least gives us a panic to
    help identify what was accessed.
    
    U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
    know why.
    
    I wasn't sure where to put this code. There is similar code for kirwood
    in the equivalent dt_init function. On Armada-XP the register is part of
    the Core Coherency Fabric block (for A385 it's documented as part of the
    CCF block).
    
    --
    [1] - https://gitlab.denx.de/u-boot/u-boot/blob/master/arch/arm/mach-mvebu/cpu.c#L489

 arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
 arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

Comments

Gregory CLEMENT Jan. 8, 2020, 10:22 a.m. UTC | #1
Hello Chris,

> U-boot disables MBUS error propagation for Armada-385. The effect of
> this on the kernel is that any access to a mapped but inaccessible
> address causes the system to hang.
>
> By enabling MBUS error propagation the kernel can raise a Bus Error and
> panic to restart the system.

Unless I miss it, it seems that nobody comment this patch: sorry for the
delay.

>
> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> ---
>
> Notes:
>     We've encountered an issue where rogue accesses to PCI-e space cause an
>     Armada-385 system to lockup. We've found that enabling MBUS error
>     propagation lets us get a bus error which at least gives us a panic to
>     help identify what was accessed.
>     
>     U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
>     know why.
>     
>     I wasn't sure where to put this code. There is similar code for kirwood
>     in the equivalent dt_init function. On Armada-XP the register is part of
>     the Core Coherency Fabric block (for A385 it's documented as part of the
>     CCF block).

What about adding a new set of register to the mvebu mbus driver?

In this case it will be called even earlier allowing to see bus error
earlier.

In any case, you should separate the device tree change from the code
change and at least provide 2 patches.

Gregory

>     
>     --
>     [1] - https://gitlab.denx.de/u-boot/u-boot/blob/master/arch/arm/mach-mvebu/cpu.c#L489
>
>  arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
>  arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
>  2 files changed, 32 insertions(+)
>
> diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
> index 3f4bb44d85f0..3214c67433eb 100644
> --- a/arch/arm/boot/dts/armada-38x.dtsi
> +++ b/arch/arm/boot/dts/armada-38x.dtsi
> @@ -386,6 +386,11 @@
>  				      <0x20250 0x8>;
>  			};
>  
> +			ioerrc: io-err-control@20200 {
> +				compatible = "marvell,io-err-control";
> +				reg = <0x20200 0x4>;
> +			};
> +
>  			mpic: interrupt-controller@20a00 {
>  				compatible = "marvell,mpic";
>  				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
> diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
> index d2df5ef9382b..fb7718386ef9 100644
> --- a/arch/arm/mach-mvebu/board-v7.c
> +++ b/arch/arm/mach-mvebu/board-v7.c
> @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
>  	}
>  }
>  
> +#define MBUS_ERR_PROP_EN BIT(8)
> +
> +/*
> + * U-boot disables MBUS error propagation. Re-enable it so we
> + * can handle them as Bus Errors.
> + */
> +static void __init enable_mbus_error_propagation(void)
> +{
> +	struct device_node *np =
> +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
> +
> +	if (np) {
> +		void __iomem *reg;
> +
> +		reg = of_iomap(np, 0);
> +		if (reg) {
> +			u32 val;
> +
> +			val = readl_relaxed(reg);
> +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
> +		}
> +		of_node_put(np);
> +	}
> +}
> +
>  static void __init mvebu_dt_init(void)
>  {
>  	if (of_machine_is_compatible("marvell,armadaxp"))
>  		i2c_quirk();
> +	enable_mbus_error_propagation();
>  }
>  
>  static void __init armada_370_xp_dt_fixup(void)
> @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
>  	.l2c_aux_val	= 0,
>  	.l2c_aux_mask	= ~0,
>  	.init_irq       = mvebu_init_irq,
> +	.init_machine	= mvebu_dt_init,
>  	.restart	= mvebu_restart,
>  	.dt_compat	= armada_38x_dt_compat,
>  MACHINE_END
> -- 
> 2.24.0
>
Chris Packham Jan. 8, 2020, 7:42 p.m. UTC | #2
Hi Gregory,

On Wed, 2020-01-08 at 11:22 +0100, Gregory CLEMENT wrote:
> Hello Chris,
> 
> > U-boot disables MBUS error propagation for Armada-385. The effect of
> > this on the kernel is that any access to a mapped but inaccessible
> > address causes the system to hang.
> > 
> > By enabling MBUS error propagation the kernel can raise a Bus Error and
> > panic to restart the system.
> 
> Unless I miss it, it seems that nobody comment this patch: sorry for the
> delay.
> 

Thanks for the response.

> > 
> > Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> > ---
> > 
> > Notes:
> >     We've encountered an issue where rogue accesses to PCI-e space cause an
> >     Armada-385 system to lockup. We've found that enabling MBUS error
> >     propagation lets us get a bus error which at least gives us a panic to
> >     help identify what was accessed.
> >     
> >     U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
> >     know why.
> >     
> >     I wasn't sure where to put this code. There is similar code for kirwood
> >     in the equivalent dt_init function. On Armada-XP the register is part of
> >     the Core Coherency Fabric block (for A385 it's documented as part of the
> >     CCF block).
> 
> What about adding a new set of register to the mvebu mbus driver?
> 

After more testing we found that some previously "good" boards started
throwing up panics with this change. I think that this might require
handling some of the PCI-e interrupts (for correctable errors) via the
EDAC subsystem.

We're still working with Marvell to track down exactly why this is
happening on our system.

> In this case it will be called even earlier allowing to see bus error
> earlier.
> 
> In any case, you should separate the device tree change from the code
> change and at least provide 2 patches.

Agreed. If/when something solid eventuates we'll do it as a proper
series.

> 
> Gregory
> 
> >     
> >     --
> >     [1] - https://gitlab.denx.de/u-boot/u-boot/blob/master/arch/arm/mach-mvebu/cpu.c#L489
> > 
> >  arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
> >  arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
> >  2 files changed, 32 insertions(+)
> > 
> > diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
> > index 3f4bb44d85f0..3214c67433eb 100644
> > --- a/arch/arm/boot/dts/armada-38x.dtsi
> > +++ b/arch/arm/boot/dts/armada-38x.dtsi
> > @@ -386,6 +386,11 @@
> >  				      <0x20250 0x8>;
> >  			};
> >  
> > +			ioerrc: io-err-control@20200 {
> > +				compatible = "marvell,io-err-control";
> > +				reg = <0x20200 0x4>;
> > +			};
> > +
> >  			mpic: interrupt-controller@20a00 {
> >  				compatible = "marvell,mpic";
> >  				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
> > diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
> > index d2df5ef9382b..fb7718386ef9 100644
> > --- a/arch/arm/mach-mvebu/board-v7.c
> > +++ b/arch/arm/mach-mvebu/board-v7.c
> > @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
> >  	}
> >  }
> >  
> > +#define MBUS_ERR_PROP_EN BIT(8)
> > +
> > +/*
> > + * U-boot disables MBUS error propagation. Re-enable it so we
> > + * can handle them as Bus Errors.
> > + */
> > +static void __init enable_mbus_error_propagation(void)
> > +{
> > +	struct device_node *np =
> > +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
> > +
> > +	if (np) {
> > +		void __iomem *reg;
> > +
> > +		reg = of_iomap(np, 0);
> > +		if (reg) {
> > +			u32 val;
> > +
> > +			val = readl_relaxed(reg);
> > +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
> > +		}
> > +		of_node_put(np);
> > +	}
> > +}
> > +
> >  static void __init mvebu_dt_init(void)
> >  {
> >  	if (of_machine_is_compatible("marvell,armadaxp"))
> >  		i2c_quirk();
> > +	enable_mbus_error_propagation();
> >  }
> >  
> >  static void __init armada_370_xp_dt_fixup(void)
> > @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
> >  	.l2c_aux_val	= 0,
> >  	.l2c_aux_mask	= ~0,
> >  	.init_irq       = mvebu_init_irq,
> > +	.init_machine	= mvebu_dt_init,
> >  	.restart	= mvebu_restart,
> >  	.dt_compat	= armada_38x_dt_compat,
> >  MACHINE_END
> > -- 
> > 2.24.0
> > 
> 
>
Pali Rohár June 3, 2021, 12:55 p.m. UTC | #3
On Wednesday 08 January 2020 19:42:12 Chris Packham wrote:
> Hi Gregory,
> 
> On Wed, 2020-01-08 at 11:22 +0100, Gregory CLEMENT wrote:
> > Hello Chris,
> > 
> > > U-boot disables MBUS error propagation for Armada-385. The effect of
> > > this on the kernel is that any access to a mapped but inaccessible
> > > address causes the system to hang.
> > > 
> > > By enabling MBUS error propagation the kernel can raise a Bus Error and
> > > panic to restart the system.
> > 
> > Unless I miss it, it seems that nobody comment this patch: sorry for the
> > delay.
> > 
> 
> Thanks for the response.
> 
> > > 
> > > Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> > > ---
> > > 
> > > Notes:
> > >     We've encountered an issue where rogue accesses to PCI-e space cause an
> > >     Armada-385 system to lockup. We've found that enabling MBUS error
> > >     propagation lets us get a bus error which at least gives us a panic to
> > >     help identify what was accessed.
> > >     
> > >     U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
> > >     know why.
> > >     
> > >     I wasn't sure where to put this code. There is similar code for kirwood
> > >     in the equivalent dt_init function. On Armada-XP the register is part of
> > >     the Core Coherency Fabric block (for A385 it's documented as part of the
> > >     CCF block).
> > 
> > What about adding a new set of register to the mvebu mbus driver?
> > 
> 
> After more testing we found that some previously "good" boards started
> throwing up panics with this change. I think that this might require
> handling some of the PCI-e interrupts (for correctable errors) via the
> EDAC subsystem.
> 
> We're still working with Marvell to track down exactly why this is
> happening on our system.

Hello Chris! Have you somehow solved this issue? Or do you have some
contacts in Marvell for A385 PCIe subsystem?

> > In this case it will be called even earlier allowing to see bus error
> > earlier.
> > 
> > In any case, you should separate the device tree change from the code
> > change and at least provide 2 patches.
> 
> Agreed. If/when something solid eventuates we'll do it as a proper
> series.
> 
> > 
> > Gregory
> > 
> > >     
> > >     --
> > >     [1] - https://gitlab.denx.de/u-boot/u-boot/blob/master/arch/arm/mach-mvebu/cpu.c#L489
> > > 
> > >  arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
> > >  arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
> > >  2 files changed, 32 insertions(+)
> > > 
> > > diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
> > > index 3f4bb44d85f0..3214c67433eb 100644
> > > --- a/arch/arm/boot/dts/armada-38x.dtsi
> > > +++ b/arch/arm/boot/dts/armada-38x.dtsi
> > > @@ -386,6 +386,11 @@
> > >  				      <0x20250 0x8>;
> > >  			};
> > >  
> > > +			ioerrc: io-err-control@20200 {
> > > +				compatible = "marvell,io-err-control";
> > > +				reg = <0x20200 0x4>;
> > > +			};
> > > +
> > >  			mpic: interrupt-controller@20a00 {
> > >  				compatible = "marvell,mpic";
> > >  				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
> > > diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
> > > index d2df5ef9382b..fb7718386ef9 100644
> > > --- a/arch/arm/mach-mvebu/board-v7.c
> > > +++ b/arch/arm/mach-mvebu/board-v7.c
> > > @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
> > >  	}
> > >  }
> > >  
> > > +#define MBUS_ERR_PROP_EN BIT(8)
> > > +
> > > +/*
> > > + * U-boot disables MBUS error propagation. Re-enable it so we
> > > + * can handle them as Bus Errors.
> > > + */
> > > +static void __init enable_mbus_error_propagation(void)
> > > +{
> > > +	struct device_node *np =
> > > +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
> > > +
> > > +	if (np) {
> > > +		void __iomem *reg;
> > > +
> > > +		reg = of_iomap(np, 0);
> > > +		if (reg) {
> > > +			u32 val;
> > > +
> > > +			val = readl_relaxed(reg);
> > > +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
> > > +		}
> > > +		of_node_put(np);
> > > +	}
> > > +}
> > > +
> > >  static void __init mvebu_dt_init(void)
> > >  {
> > >  	if (of_machine_is_compatible("marvell,armadaxp"))
> > >  		i2c_quirk();
> > > +	enable_mbus_error_propagation();
> > >  }
> > >  
> > >  static void __init armada_370_xp_dt_fixup(void)
> > > @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
> > >  	.l2c_aux_val	= 0,
> > >  	.l2c_aux_mask	= ~0,
> > >  	.init_irq       = mvebu_init_irq,
> > > +	.init_machine	= mvebu_dt_init,
> > >  	.restart	= mvebu_restart,
> > >  	.dt_compat	= armada_38x_dt_compat,
> > >  MACHINE_END
> > > -- 
> > > 2.24.0
> > > 
> > 
> >
Chris Packham June 3, 2021, 9:03 p.m. UTC | #4
On 4/06/21 12:55 am, Pali Rohár wrote:
> On Wednesday 08 January 2020 19:42:12 Chris Packham wrote:
>> Hi Gregory,
>>
>> On Wed, 2020-01-08 at 11:22 +0100, Gregory CLEMENT wrote:
>>> Hello Chris,
>>>
>>>> U-boot disables MBUS error propagation for Armada-385. The effect of
>>>> this on the kernel is that any access to a mapped but inaccessible
>>>> address causes the system to hang.
>>>>
>>>> By enabling MBUS error propagation the kernel can raise a Bus Error and
>>>> panic to restart the system.
>>> Unless I miss it, it seems that nobody comment this patch: sorry for the
>>> delay.
>>>
>> Thanks for the response.
>>
>>>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
>>>> ---
>>>>
>>>> Notes:
>>>>      We've encountered an issue where rogue accesses to PCI-e space cause an
>>>>      Armada-385 system to lockup. We've found that enabling MBUS error
>>>>      propagation lets us get a bus error which at least gives us a panic to
>>>>      help identify what was accessed.
>>>>      
>>>>      U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
>>>>      know why.
>>>>      
>>>>      I wasn't sure where to put this code. There is similar code for kirwood
>>>>      in the equivalent dt_init function. On Armada-XP the register is part of
>>>>      the Core Coherency Fabric block (for A385 it's documented as part of the
>>>>      CCF block).
>>> What about adding a new set of register to the mvebu mbus driver?
>>>
>> After more testing we found that some previously "good" boards started
>> throwing up panics with this change. I think that this might require
>> handling some of the PCI-e interrupts (for correctable errors) via the
>> EDAC subsystem.
>>
>> We're still working with Marvell to track down exactly why this is
>> happening on our system.
> Hello Chris! Have you somehow solved this issue? Or do you have some
> contacts in Marvell for A385 PCIe subsystem?

The problem seemed to be a specific CPU Erratum for the A385 which I 
brought into upstream u-boot[1]. But even then we had to update our base 
u-boot version from 2016.11 to 2019.10 so there may have been more going 
on than just that one change.

[1] - 
https://source.denx.de/u-boot/u-boot/-/commit/ad91fdfff0bd6ea471afe838e0f6d58ed898694e

>>> In this case it will be called even earlier allowing to see bus error
>>> earlier.
>>>
>>> In any case, you should separate the device tree change from the code
>>> change and at least provide 2 patches.
>> Agreed. If/when something solid eventuates we'll do it as a proper
>> series.
>>
>>> Gregory
>>>
>>>>      
>>>>      --
>>>>      [1] - https://scanmail.trustwave.com/?c=20988&d=yNG44EMQR-PfAPR99zig1LzptmqzHo11i7uAj4AGuQ&u=https%3a%2f%2fgitlab%2edenx%2ede%2fu-boot%2fu-boot%2fblob%2fmaster%2farch%2farm%2fmach-mvebu%2fcpu%2ec%23L489
^^^ sorry, as a result of a reasonably high profile cyber attack on a 
local institution URLs in our incoming mail are intercepted.
>>>>
>>>>   arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
>>>>   arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
>>>>   2 files changed, 32 insertions(+)
>>>>
>>>> diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
>>>> index 3f4bb44d85f0..3214c67433eb 100644
>>>> --- a/arch/arm/boot/dts/armada-38x.dtsi
>>>> +++ b/arch/arm/boot/dts/armada-38x.dtsi
>>>> @@ -386,6 +386,11 @@
>>>>   				      <0x20250 0x8>;
>>>>   			};
>>>>   
>>>> +			ioerrc: io-err-control@20200 {
>>>> +				compatible = "marvell,io-err-control";
>>>> +				reg = <0x20200 0x4>;
>>>> +			};
>>>> +
>>>>   			mpic: interrupt-controller@20a00 {
>>>>   				compatible = "marvell,mpic";
>>>>   				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
>>>> diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
>>>> index d2df5ef9382b..fb7718386ef9 100644
>>>> --- a/arch/arm/mach-mvebu/board-v7.c
>>>> +++ b/arch/arm/mach-mvebu/board-v7.c
>>>> @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
>>>>   	}
>>>>   }
>>>>   
>>>> +#define MBUS_ERR_PROP_EN BIT(8)
>>>> +
>>>> +/*
>>>> + * U-boot disables MBUS error propagation. Re-enable it so we
>>>> + * can handle them as Bus Errors.
>>>> + */
>>>> +static void __init enable_mbus_error_propagation(void)
>>>> +{
>>>> +	struct device_node *np =
>>>> +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
>>>> +
>>>> +	if (np) {
>>>> +		void __iomem *reg;
>>>> +
>>>> +		reg = of_iomap(np, 0);
>>>> +		if (reg) {
>>>> +			u32 val;
>>>> +
>>>> +			val = readl_relaxed(reg);
>>>> +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
>>>> +		}
>>>> +		of_node_put(np);
>>>> +	}
>>>> +}
>>>> +
>>>>   static void __init mvebu_dt_init(void)
>>>>   {
>>>>   	if (of_machine_is_compatible("marvell,armadaxp"))
>>>>   		i2c_quirk();
>>>> +	enable_mbus_error_propagation();
>>>>   }
>>>>   
>>>>   static void __init armada_370_xp_dt_fixup(void)
>>>> @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
>>>>   	.l2c_aux_val	= 0,
>>>>   	.l2c_aux_mask	= ~0,
>>>>   	.init_irq       = mvebu_init_irq,
>>>> +	.init_machine	= mvebu_dt_init,
>>>>   	.restart	= mvebu_restart,
>>>>   	.dt_compat	= armada_38x_dt_compat,
>>>>   MACHINE_END
>>>> -- 
>>>> 2.24.0
>>>>
>>>
Pali Rohár April 22, 2022, 9:27 a.m. UTC | #5
On Thursday 03 June 2021 21:03:55 Chris Packham wrote:
> On 4/06/21 12:55 am, Pali Rohár wrote:
> > On Wednesday 08 January 2020 19:42:12 Chris Packham wrote:
> >> Hi Gregory,
> >>
> >> On Wed, 2020-01-08 at 11:22 +0100, Gregory CLEMENT wrote:
> >>> Hello Chris,
> >>>
> >>>> U-boot disables MBUS error propagation for Armada-385. The effect of
> >>>> this on the kernel is that any access to a mapped but inaccessible
> >>>> address causes the system to hang.
> >>>>
> >>>> By enabling MBUS error propagation the kernel can raise a Bus Error and
> >>>> panic to restart the system.
> >>> Unless I miss it, it seems that nobody comment this patch: sorry for the
> >>> delay.
> >>>
> >> Thanks for the response.
> >>
> >>>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> >>>> ---
> >>>>
> >>>> Notes:
> >>>>      We've encountered an issue where rogue accesses to PCI-e space cause an
> >>>>      Armada-385 system to lockup. We've found that enabling MBUS error
> >>>>      propagation lets us get a bus error which at least gives us a panic to
> >>>>      help identify what was accessed.
> >>>>      
> >>>>      U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
> >>>>      know why.
> >>>>      
> >>>>      I wasn't sure where to put this code. There is similar code for kirwood
> >>>>      in the equivalent dt_init function. On Armada-XP the register is part of
> >>>>      the Core Coherency Fabric block (for A385 it's documented as part of the
> >>>>      CCF block).
> >>> What about adding a new set of register to the mvebu mbus driver?
> >>>
> >> After more testing we found that some previously "good" boards started
> >> throwing up panics with this change. I think that this might require
> >> handling some of the PCI-e interrupts (for correctable errors) via the
> >> EDAC subsystem.
> >>
> >> We're still working with Marvell to track down exactly why this is
> >> happening on our system.
> > Hello Chris! Have you somehow solved this issue? Or do you have some
> > contacts in Marvell for A385 PCIe subsystem?
> 
> The problem seemed to be a specific CPU Erratum for the A385 which I 
> brought into upstream u-boot[1].

Hello Chris, could you point me to this CPU Erratum? What is its number
or some other identifier? Because I'm unable to find any such erratum in
the list for A385 erratums.

> But even then we had to update our base 
> u-boot version from 2016.11 to 2019.10 so there may have been more going 
> on than just that one change.
> 
> [1] - 
> https://source.denx.de/u-boot/u-boot/-/commit/ad91fdfff0bd6ea471afe838e0f6d58ed898694e
> 
> >>> In this case it will be called even earlier allowing to see bus error
> >>> earlier.
> >>>
> >>> In any case, you should separate the device tree change from the code
> >>> change and at least provide 2 patches.
> >> Agreed. If/when something solid eventuates we'll do it as a proper
> >> series.
> >>
> >>> Gregory
> >>>
> >>>>      
> >>>>      --
> >>>>      [1] - https://scanmail.trustwave.com/?c=20988&d=yNG44EMQR-PfAPR99zig1LzptmqzHo11i7uAj4AGuQ&u=https%3a%2f%2fgitlab%2edenx%2ede%2fu-boot%2fu-boot%2fblob%2fmaster%2farch%2farm%2fmach-mvebu%2fcpu%2ec%23L489
> ^^^ sorry, as a result of a reasonably high profile cyber attack on a 
> local institution URLs in our incoming mail are intercepted.
> >>>>
> >>>>   arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
> >>>>   arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
> >>>>   2 files changed, 32 insertions(+)
> >>>>
> >>>> diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
> >>>> index 3f4bb44d85f0..3214c67433eb 100644
> >>>> --- a/arch/arm/boot/dts/armada-38x.dtsi
> >>>> +++ b/arch/arm/boot/dts/armada-38x.dtsi
> >>>> @@ -386,6 +386,11 @@
> >>>>   				      <0x20250 0x8>;
> >>>>   			};
> >>>>   
> >>>> +			ioerrc: io-err-control@20200 {
> >>>> +				compatible = "marvell,io-err-control";
> >>>> +				reg = <0x20200 0x4>;
> >>>> +			};
> >>>> +
> >>>>   			mpic: interrupt-controller@20a00 {
> >>>>   				compatible = "marvell,mpic";
> >>>>   				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
> >>>> diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
> >>>> index d2df5ef9382b..fb7718386ef9 100644
> >>>> --- a/arch/arm/mach-mvebu/board-v7.c
> >>>> +++ b/arch/arm/mach-mvebu/board-v7.c
> >>>> @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
> >>>>   	}
> >>>>   }
> >>>>   
> >>>> +#define MBUS_ERR_PROP_EN BIT(8)
> >>>> +
> >>>> +/*
> >>>> + * U-boot disables MBUS error propagation. Re-enable it so we
> >>>> + * can handle them as Bus Errors.
> >>>> + */
> >>>> +static void __init enable_mbus_error_propagation(void)
> >>>> +{
> >>>> +	struct device_node *np =
> >>>> +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
> >>>> +
> >>>> +	if (np) {
> >>>> +		void __iomem *reg;
> >>>> +
> >>>> +		reg = of_iomap(np, 0);
> >>>> +		if (reg) {
> >>>> +			u32 val;
> >>>> +
> >>>> +			val = readl_relaxed(reg);
> >>>> +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
> >>>> +		}
> >>>> +		of_node_put(np);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>>   static void __init mvebu_dt_init(void)
> >>>>   {
> >>>>   	if (of_machine_is_compatible("marvell,armadaxp"))
> >>>>   		i2c_quirk();
> >>>> +	enable_mbus_error_propagation();
> >>>>   }
> >>>>   
> >>>>   static void __init armada_370_xp_dt_fixup(void)
> >>>> @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
> >>>>   	.l2c_aux_val	= 0,
> >>>>   	.l2c_aux_mask	= ~0,
> >>>>   	.init_irq       = mvebu_init_irq,
> >>>> +	.init_machine	= mvebu_dt_init,
> >>>>   	.restart	= mvebu_restart,
> >>>>   	.dt_compat	= armada_38x_dt_compat,
> >>>>   MACHINE_END
> >>>> -- 
> >>>> 2.24.0
> >>>>
> >>>
Chris Packham April 26, 2022, 1:17 a.m. UTC | #6
On 22/04/22 21:27, Pali Rohár wrote:
> On Thursday 03 June 2021 21:03:55 Chris Packham wrote:
>> On 4/06/21 12:55 am, Pali Rohár wrote:
>>> On Wednesday 08 January 2020 19:42:12 Chris Packham wrote:
>>>> Hi Gregory,
>>>>
>>>> On Wed, 2020-01-08 at 11:22 +0100, Gregory CLEMENT wrote:
>>>>> Hello Chris,
>>>>>
>>>>>> U-boot disables MBUS error propagation for Armada-385. The effect of
>>>>>> this on the kernel is that any access to a mapped but inaccessible
>>>>>> address causes the system to hang.
>>>>>>
>>>>>> By enabling MBUS error propagation the kernel can raise a Bus Error and
>>>>>> panic to restart the system.
>>>>> Unless I miss it, it seems that nobody comment this patch: sorry for the
>>>>> delay.
>>>>>
>>>> Thanks for the response.
>>>>
>>>>>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
>>>>>> ---
>>>>>>
>>>>>> Notes:
>>>>>>       We've encountered an issue where rogue accesses to PCI-e space cause an
>>>>>>       Armada-385 system to lockup. We've found that enabling MBUS error
>>>>>>       propagation lets us get a bus error which at least gives us a panic to
>>>>>>       help identify what was accessed.
>>>>>>       
>>>>>>       U-boot clears the IO Err Prop Enable Bit[1] but so far no-one seems to
>>>>>>       know why.
>>>>>>       
>>>>>>       I wasn't sure where to put this code. There is similar code for kirwood
>>>>>>       in the equivalent dt_init function. On Armada-XP the register is part of
>>>>>>       the Core Coherency Fabric block (for A385 it's documented as part of the
>>>>>>       CCF block).
>>>>> What about adding a new set of register to the mvebu mbus driver?
>>>>>
>>>> After more testing we found that some previously "good" boards started
>>>> throwing up panics with this change. I think that this might require
>>>> handling some of the PCI-e interrupts (for correctable errors) via the
>>>> EDAC subsystem.
>>>>
>>>> We're still working with Marvell to track down exactly why this is
>>>> happening on our system.
>>> Hello Chris! Have you somehow solved this issue? Or do you have some
>>> contacts in Marvell for A385 PCIe subsystem?
>> The problem seemed to be a specific CPU Erratum for the A385 which I
>> brought into upstream u-boot[1].
> Hello Chris, could you point me to this CPU Erratum? What is its number
> or some other identifier? Because I'm unable to find any such erratum in
> the list for A385 erratums.

I haven't seen the actual errata either. My comment is based on the 
following change from one of Marvell's u-boot forks

https://github.com/MarvellEmbeddedProcessors/u-boot-marvell/commit/20cd2704072512de176e048970f2883db901674b

It just says "ERRATA# TBD". Perhaps they just haven't published a new 
document (latest I can find is Rev D from 2016) or they decided it was a 
SW bug and didn't assign it an official errata number.

>> But even then we had to update our base
>> u-boot version from 2016.11 to 2019.10 so there may have been more going
>> on than just that one change.
>>
>> [1] -
>> https://scanmail.trustwave.com/?c=20988&d=8fTi4h2MjPVKSgkegODmtLtuBcIeiZ2hXGl9DulH7A&u=https%3a%2f%2fsource%2edenx%2ede%2fu-boot%2fu-boot%2f-%2fcommit%2fad91fdfff0bd6ea471afe838e0f6d58ed898694e
>>
>>>>> In this case it will be called even earlier allowing to see bus error
>>>>> earlier.
>>>>>
>>>>> In any case, you should separate the device tree change from the code
>>>>> change and at least provide 2 patches.
>>>> Agreed. If/when something solid eventuates we'll do it as a proper
>>>> series.
>>>>
>>>>> Gregory
>>>>>
>>>>>>       
>>>>>>       --
>>>>>>       [1] - https://scanmail.trustwave.com/?c=20988&d=8fTi4h2MjPVKSgkegODmtLtuBcIeiZ2hXGl-X75J5A&u=https%3a%2f%2fgitlab%2edenx%2ede%2fu-boot%2fu-boot%2fblob%2fmaster%2farch%2farm%2fmach-mvebu%2fcpu%2ec%23L489
>> ^^^ sorry, as a result of a reasonably high profile cyber attack on a
>> local institution URLs in our incoming mail are intercepted.
>>>>>>    arch/arm/boot/dts/armada-38x.dtsi |  5 +++++
>>>>>>    arch/arm/mach-mvebu/board-v7.c    | 27 +++++++++++++++++++++++++++
>>>>>>    2 files changed, 32 insertions(+)
>>>>>>
>>>>>> diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
>>>>>> index 3f4bb44d85f0..3214c67433eb 100644
>>>>>> --- a/arch/arm/boot/dts/armada-38x.dtsi
>>>>>> +++ b/arch/arm/boot/dts/armada-38x.dtsi
>>>>>> @@ -386,6 +386,11 @@
>>>>>>    				      <0x20250 0x8>;
>>>>>>    			};
>>>>>>    
>>>>>> +			ioerrc: io-err-control@20200 {
>>>>>> +				compatible = "marvell,io-err-control";
>>>>>> +				reg = <0x20200 0x4>;
>>>>>> +			};
>>>>>> +
>>>>>>    			mpic: interrupt-controller@20a00 {
>>>>>>    				compatible = "marvell,mpic";
>>>>>>    				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
>>>>>> diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
>>>>>> index d2df5ef9382b..fb7718386ef9 100644
>>>>>> --- a/arch/arm/mach-mvebu/board-v7.c
>>>>>> +++ b/arch/arm/mach-mvebu/board-v7.c
>>>>>> @@ -138,10 +138,36 @@ static void __init i2c_quirk(void)
>>>>>>    	}
>>>>>>    }
>>>>>>    
>>>>>> +#define MBUS_ERR_PROP_EN BIT(8)
>>>>>> +
>>>>>> +/*
>>>>>> + * U-boot disables MBUS error propagation. Re-enable it so we
>>>>>> + * can handle them as Bus Errors.
>>>>>> + */
>>>>>> +static void __init enable_mbus_error_propagation(void)
>>>>>> +{
>>>>>> +	struct device_node *np =
>>>>>> +		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
>>>>>> +
>>>>>> +	if (np) {
>>>>>> +		void __iomem *reg;
>>>>>> +
>>>>>> +		reg = of_iomap(np, 0);
>>>>>> +		if (reg) {
>>>>>> +			u32 val;
>>>>>> +
>>>>>> +			val = readl_relaxed(reg);
>>>>>> +			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
>>>>>> +		}
>>>>>> +		of_node_put(np);
>>>>>> +	}
>>>>>> +}
>>>>>> +
>>>>>>    static void __init mvebu_dt_init(void)
>>>>>>    {
>>>>>>    	if (of_machine_is_compatible("marvell,armadaxp"))
>>>>>>    		i2c_quirk();
>>>>>> +	enable_mbus_error_propagation();
>>>>>>    }
>>>>>>    
>>>>>>    static void __init armada_370_xp_dt_fixup(void)
>>>>>> @@ -191,6 +217,7 @@ DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
>>>>>>    	.l2c_aux_val	= 0,
>>>>>>    	.l2c_aux_mask	= ~0,
>>>>>>    	.init_irq       = mvebu_init_irq,
>>>>>> +	.init_machine	= mvebu_dt_init,
>>>>>>    	.restart	= mvebu_restart,
>>>>>>    	.dt_compat	= armada_38x_dt_compat,
>>>>>>    MACHINE_END
>>>>>> -- 
>>>>>> 2.24.0
>>>>>>
diff mbox series

Patch

diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
index 3f4bb44d85f0..3214c67433eb 100644
--- a/arch/arm/boot/dts/armada-38x.dtsi
+++ b/arch/arm/boot/dts/armada-38x.dtsi
@@ -386,6 +386,11 @@ 
 				      <0x20250 0x8>;
 			};
 
+			ioerrc: io-err-control@20200 {
+				compatible = "marvell,io-err-control";
+				reg = <0x20200 0x4>;
+			};
+
 			mpic: interrupt-controller@20a00 {
 				compatible = "marvell,mpic";
 				reg = <0x20a00 0x2d0>, <0x21070 0x58>;
diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c
index d2df5ef9382b..fb7718386ef9 100644
--- a/arch/arm/mach-mvebu/board-v7.c
+++ b/arch/arm/mach-mvebu/board-v7.c
@@ -138,10 +138,36 @@  static void __init i2c_quirk(void)
 	}
 }
 
+#define MBUS_ERR_PROP_EN BIT(8)
+
+/*
+ * U-boot disables MBUS error propagation. Re-enable it so we
+ * can handle them as Bus Errors.
+ */
+static void __init enable_mbus_error_propagation(void)
+{
+	struct device_node *np =
+		of_find_compatible_node(NULL, NULL, "marvell,io-err-control");
+
+	if (np) {
+		void __iomem *reg;
+
+		reg = of_iomap(np, 0);
+		if (reg) {
+			u32 val;
+
+			val = readl_relaxed(reg);
+			writel_relaxed(val | MBUS_ERR_PROP_EN, reg);
+		}
+		of_node_put(np);
+	}
+}
+
 static void __init mvebu_dt_init(void)
 {
 	if (of_machine_is_compatible("marvell,armadaxp"))
 		i2c_quirk();
+	enable_mbus_error_propagation();
 }
 
 static void __init armada_370_xp_dt_fixup(void)
@@ -191,6 +217,7 @@  DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)")
 	.l2c_aux_val	= 0,
 	.l2c_aux_mask	= ~0,
 	.init_irq       = mvebu_init_irq,
+	.init_machine	= mvebu_dt_init,
 	.restart	= mvebu_restart,
 	.dt_compat	= armada_38x_dt_compat,
 MACHINE_END