diff mbox series

[3/3] coresight etm4x: Add 32-bit read/write option to split 64-bit words

Message ID 4e74e75a6e489403aa2cc108ac31639c57973fce.1677881753.git.scclevenger@os.amperecomputing.com (mailing list archive)
State New, archived
Headers show
Series Ampere Computing ETMv4.x Support | expand

Commit Message

Steve Clevenger March 6, 2023, 5:54 a.m. UTC
An Ampere Computing design decision to not support 64-bit read/write
access in the ETMv4.6 implementation.

The Ampere work around is to split ETMv4.6 64-bit register access
into 2 ea. 32-bit read/write accesses.
AC03_DEBUG_10 is described in the AmpereOne Developer Errata document:

https://solutions.amperecomputing.com/customer-connect/products/AmpereOne-device-documentation

Signed-off-by: Steve Clevenger <scclevenger@os.amperecomputing.com>
---
 drivers/hwtracing/coresight/coresight-etm4x.h | 58 ++++++++++++++-----
 1 file changed, 44 insertions(+), 14 deletions(-)

Comments

Leo Yan March 8, 2023, 7:04 a.m. UTC | #1
Hi Steve,

On Sun, Mar 05, 2023 at 10:54:08PM -0700, Steve Clevenger wrote:
> An Ampere Computing design decision to not support 64-bit read/write
> access in the ETMv4.6 implementation.
> 
> The Ampere work around is to split ETMv4.6 64-bit register access
> into 2 ea. 32-bit read/write accesses.
> AC03_DEBUG_10 is described in the AmpereOne Developer Errata document:
> 
> https://solutions.amperecomputing.com/customer-connect/products/AmpereOne-device-documentation
> 
> Signed-off-by: Steve Clevenger <scclevenger@os.amperecomputing.com>
> ---
>  drivers/hwtracing/coresight/coresight-etm4x.h | 58 ++++++++++++++-----
>  1 file changed, 44 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h
> index 434f4e95ee17..17457ec71876 100644
> --- a/drivers/hwtracing/coresight/coresight-etm4x.h
> +++ b/drivers/hwtracing/coresight/coresight-etm4x.h
> @@ -539,11 +539,6 @@
>  		 readl_relaxed((csa)->base + (offset)) :		\
>  		 read_etm4x_sysreg_offset((offset), false)))
>  
> -#define etm4x_relaxed_read64(csa, offset)				\
> -	((u64)((csa)->io_mem ?						\
> -		 readq_relaxed((csa)->base + (offset)) :		\
> -		 read_etm4x_sysreg_offset((offset), true)))
> -
>  #define etm4x_read32(csa, offset)					\
>  	({								\
>  		u32 __val = etm4x_relaxed_read32((csa), (offset));	\
> @@ -567,15 +562,6 @@
>  						  false);		\
>  	} while (0)
>  
> -#define etm4x_relaxed_write64(csa, val, offset)				\
> -	do {								\
> -		if ((csa)->io_mem)					\
> -			writeq_relaxed((val), (csa)->base + (offset));	\
> -		else							\
> -			write_etm4x_sysreg_offset((val), (offset),	\
> -						  true);		\
> -	} while (0)
> -
>  #define etm4x_write32(csa, val, offset)					\
>  	do {								\
>  		__io_bw();						\
> @@ -1091,6 +1077,50 @@ void etm4_config_trace_mode(struct etmv4_config *config);
>  u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);
>  void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
>  
> +/* 64-bit aligned to convert 64-bit access to 2 ea. 32-bit access */
> +#pragma pack(push, 8)
> +
> +struct etm_quad_split {
> +	u32 lsw;
> +	u32 msw;
> +};
> +
> +#pragma pack(pop)
> +
> +static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
> +{
> +	if (csa->io_mem) {
> +		if (csa->no_quad_mmio) {
> +			/* split 64-bit reads into 2 consecutive 32-bit reads */
> +			struct etm_quad_split container;
> +
> +			container.lsw = etm4x_read32(csa, offset);
> +			container.msw = etm4x_read32(csa, offset + sizeof(u32));
> +
> +			return *(u64 *) &container;

To be honest, I am not familiar with this part, just want to remind two
things.

Firstly, here you could use hi_lo_readq_relaxed(), seems to me,
hi_lo_readq_relaxed() does the same thing with above section and
you don't need to define the structure etm_quad_split().

Secondly, IIUC, a main problem with splitting 64-bit access into two
32-bit accesses is breaking atomicity.  If here have race condition
between reading and writing 64-bit registers, we need to consider to
use spinlock for register accessing.

I'd like to leave the second issue to Suzuki/Mike/James for
comfirmation in case I introduced complexity.

> +		} else
> +			return readq_relaxed(csa->base + offset);
> +	} else
> +		return read_etm4x_sysreg_offset(offset, true);

Here need to add brackets:

        } else {
                return read_etm4x_sysreg_offset(offset, true);
        }

> +}
> +
> +static inline void etm4x_relaxed_write64(struct csdev_access *csa, u64 quad, unsigned int offset)
> +{
> +	if (csa->io_mem) {
> +		/* split 64-bit writes into 2 consecutive 32-bit writes */
> +		if (csa->no_quad_mmio) {
> +			struct etm_quad_split container;
> +
> +			*(u64 *) &container = quad;
> +
> +			etm4x_relaxed_write32(csa, container.lsw, offset);
> +			etm4x_relaxed_write32(csa, container.msw, offset + sizeof(u32));
> +		} else
> +			writeq_relaxed(quad, csa->base + offset);
> +	} else
> +		write_etm4x_sysreg_offset(quad, offset, true);		\

Ditto.  Please drop the character '\' at the end of the line.

Thanks,
Leo

P.s. I have a ADLink AVA platform (Ampere Altra SoC with 32 CPUs), I
am glad to give a test if you can confirm this patch set can apply on
it (and please clarify if there have any prerequisite for firmwares).

> +}
> +
>  static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)
>  {
>  	return drvdata->arch >= ETM_ARCH_ETE;
> -- 
> 2.25.1
>
Al Grant March 8, 2023, 10:06 a.m. UTC | #2
> Hi Steve,
> 
> On Sun, Mar 05, 2023 at 10:54:08PM -0700, Steve Clevenger wrote:
> > An Ampere Computing design decision to not support 64-bit read/write
> > access in the ETMv4.6 implementation.
> >
> > The Ampere work around is to split ETMv4.6 64-bit register access into
> > 2 ea. 32-bit read/write accesses.
> > AC03_DEBUG_10 is described in the AmpereOne Developer Errata document:
> >
> > https://solutions.amperecomputing.com/customer-connect/products/Ampere
> > One-device-documentation
> >
> > Signed-off-by: Steve Clevenger <scclevenger@os.amperecomputing.com>
> > ---
> >  drivers/hwtracing/coresight/coresight-etm4x.h | 58
> > ++++++++++++++-----
> >  1 file changed, 44 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h
> > b/drivers/hwtracing/coresight/coresight-etm4x.h
> > index 434f4e95ee17..17457ec71876 100644
> > --- a/drivers/hwtracing/coresight/coresight-etm4x.h
> > +++ b/drivers/hwtracing/coresight/coresight-etm4x.h
> > @@ -539,11 +539,6 @@
> >  		 readl_relaxed((csa)->base + (offset)) :		\
> >  		 read_etm4x_sysreg_offset((offset), false)))
> >
> > -#define etm4x_relaxed_read64(csa, offset)				\
> > -	((u64)((csa)->io_mem ?						\
> > -		 readq_relaxed((csa)->base + (offset)) :		\
> > -		 read_etm4x_sysreg_offset((offset), true)))
> > -
> >  #define etm4x_read32(csa, offset)					\
> >  	({								\
> >  		u32 __val = etm4x_relaxed_read32((csa), (offset));	\
> > @@ -567,15 +562,6 @@
> >  						  false);		\
> >  	} while (0)
> >
> > -#define etm4x_relaxed_write64(csa, val, offset)
> 	\
> > -	do {								\
> > -		if ((csa)->io_mem)					\
> > -			writeq_relaxed((val), (csa)->base + (offset));	\
> > -		else							\
> > -			write_etm4x_sysreg_offset((val), (offset),	\
> > -						  true);		\
> > -	} while (0)
> > -
> >  #define etm4x_write32(csa, val, offset)
> 	\
> >  	do {								\
> >  		__io_bw();						\
> > @@ -1091,6 +1077,50 @@ void etm4_config_trace_mode(struct
> etmv4_config
> > *config);
> >  u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);  void
> > etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
> >
> > +/* 64-bit aligned to convert 64-bit access to 2 ea. 32-bit access */
> > +#pragma pack(push, 8)
> > +
> > +struct etm_quad_split {
> > +	u32 lsw;
> > +	u32 msw;
> > +};
> > +
> > +#pragma pack(pop)
> > +
> > +static inline u64 etm4x_relaxed_read64(struct csdev_access *csa,
> > +unsigned int offset) {
> > +	if (csa->io_mem) {
> > +		if (csa->no_quad_mmio) {
> > +			/* split 64-bit reads into 2 consecutive 32-bit reads */
> > +			struct etm_quad_split container;
> > +
> > +			container.lsw = etm4x_read32(csa, offset);
> > +			container.msw = etm4x_read32(csa, offset +
> sizeof(u32));
> > +
> > +			return *(u64 *) &container;
> 
> To be honest, I am not familiar with this part, just want to remind two things.
> 
> Firstly, here you could use hi_lo_readq_relaxed(), seems to me,
> hi_lo_readq_relaxed() does the same thing with above section and you don't
> need to define the structure etm_quad_split().
> 
> Secondly, IIUC, a main problem with splitting 64-bit access into two 32-bit
> accesses is breaking atomicity.  If here have race condition between reading and
> writing 64-bit registers, we need to consider to use spinlock for register
> accessing.

That shouldn't be an issue. 32-bit memory-mapped interface is 
normal for ETMs. The peripheral bus interface to ETMs (debug APB)
is maximum 32-bit. The ETM architecture defines that registers are
only single-copy atomic up to 32-bit. This is generally true of
CoreSight programming registers.

What typically happens to a 64-bit read/write from a CPU is that it
is split into two 32-bit accesses by a downsizing bridge on the path
from the main system interconnect to the debug APB (see 4.3.5 
in the ETM architecture spec). In this case, it sounds like there is no
downsizing bridge. But because there should be no existing code
relying on 64-bit atomic access to ETM registers, it should be safe
to split the accesses in software without worrying about atomicity.

Overall the ETM configuration involves complicated interactions
between multiple registers, so if you've got one CPU reading out the
configuration while another CPU is writing it, you've likely got much
bigger problems than 64-bit atomicity.

Al


> 
> I'd like to leave the second issue to Suzuki/Mike/James for comfirmation in case
> I introduced complexity.
> 
> > +		} else
> > +			return readq_relaxed(csa->base + offset);
> > +	} else
> > +		return read_etm4x_sysreg_offset(offset, true);
> 
> Here need to add brackets:
> 
>         } else {
>                 return read_etm4x_sysreg_offset(offset, true);
>         }
> 
> > +}
> > +
> > +static inline void etm4x_relaxed_write64(struct csdev_access *csa,
> > +u64 quad, unsigned int offset) {
> > +	if (csa->io_mem) {
> > +		/* split 64-bit writes into 2 consecutive 32-bit writes */
> > +		if (csa->no_quad_mmio) {
> > +			struct etm_quad_split container;
> > +
> > +			*(u64 *) &container = quad;
> > +
> > +			etm4x_relaxed_write32(csa, container.lsw, offset);
> > +			etm4x_relaxed_write32(csa, container.msw, offset +
> sizeof(u32));
> > +		} else
> > +			writeq_relaxed(quad, csa->base + offset);
> > +	} else
> > +		write_etm4x_sysreg_offset(quad, offset, true);		\
> 
> Ditto.  Please drop the character '\' at the end of the line.
> 
> Thanks,
> Leo
> 
> P.s. I have a ADLink AVA platform (Ampere Altra SoC with 32 CPUs), I am glad to
> give a test if you can confirm this patch set can apply on it (and please clarify if
> there have any prerequisite for firmwares).
> 
> > +}
> > +
> >  static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)  {
> >  	return drvdata->arch >= ETM_ARCH_ETE;
> > --
> > 2.25.1
> >
> _______________________________________________
> CoreSight mailing list -- coresight@lists.linaro.org To unsubscribe send an email
> to coresight-leave@lists.linaro.org
Mike Leach March 8, 2023, 11:26 a.m. UTC | #3
Hi,

On Wed, 8 Mar 2023 at 10:07, Al Grant <Al.Grant@arm.com> wrote:
>
> > Hi Steve,
> >
> > On Sun, Mar 05, 2023 at 10:54:08PM -0700, Steve Clevenger wrote:
> > > An Ampere Computing design decision to not support 64-bit read/write
> > > access in the ETMv4.6 implementation.
> > >
> > > The Ampere work around is to split ETMv4.6 64-bit register access into
> > > 2 ea. 32-bit read/write accesses.
> > > AC03_DEBUG_10 is described in the AmpereOne Developer Errata document:
> > >
> > > https://solutions.amperecomputing.com/customer-connect/products/Ampere
> > > One-device-documentation
> > >
> > > Signed-off-by: Steve Clevenger <scclevenger@os.amperecomputing.com>
> > > ---
> > >  drivers/hwtracing/coresight/coresight-etm4x.h | 58
> > > ++++++++++++++-----
> > >  1 file changed, 44 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h
> > > b/drivers/hwtracing/coresight/coresight-etm4x.h
> > > index 434f4e95ee17..17457ec71876 100644
> > > --- a/drivers/hwtracing/coresight/coresight-etm4x.h
> > > +++ b/drivers/hwtracing/coresight/coresight-etm4x.h
> > > @@ -539,11 +539,6 @@
> > >              readl_relaxed((csa)->base + (offset)) :                \
> > >              read_etm4x_sysreg_offset((offset), false)))
> > >
> > > -#define etm4x_relaxed_read64(csa, offset)                          \
> > > -   ((u64)((csa)->io_mem ?                                          \
> > > -            readq_relaxed((csa)->base + (offset)) :                \
> > > -            read_etm4x_sysreg_offset((offset), true)))
> > > -
> > >  #define etm4x_read32(csa, offset)                                  \
> > >     ({                                                              \
> > >             u32 __val = etm4x_relaxed_read32((csa), (offset));      \
> > > @@ -567,15 +562,6 @@
> > >                                               false);               \
> > >     } while (0)
> > >
> > > -#define etm4x_relaxed_write64(csa, val, offset)
> >       \
> > > -   do {                                                            \
> > > -           if ((csa)->io_mem)                                      \
> > > -                   writeq_relaxed((val), (csa)->base + (offset));  \
> > > -           else                                                    \
> > > -                   write_etm4x_sysreg_offset((val), (offset),      \
> > > -                                             true);                \
> > > -   } while (0)
> > > -
> > >  #define etm4x_write32(csa, val, offset)
> >       \
> > >     do {                                                            \
> > >             __io_bw();                                              \
> > > @@ -1091,6 +1077,50 @@ void etm4_config_trace_mode(struct
> > etmv4_config
> > > *config);
> > >  u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);  void
> > > etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
> > >
> > > +/* 64-bit aligned to convert 64-bit access to 2 ea. 32-bit access */
> > > +#pragma pack(push, 8)
> > > +
> > > +struct etm_quad_split {
> > > +   u32 lsw;
> > > +   u32 msw;
> > > +};
> > > +
> > > +#pragma pack(pop)
> > > +
> > > +static inline u64 etm4x_relaxed_read64(struct csdev_access *csa,
> > > +unsigned int offset) {
> > > +   if (csa->io_mem) {
> > > +           if (csa->no_quad_mmio) {
> > > +                   /* split 64-bit reads into 2 consecutive 32-bit reads */
> > > +                   struct etm_quad_split container;
> > > +
> > > +                   container.lsw = etm4x_read32(csa, offset);
> > > +                   container.msw = etm4x_read32(csa, offset +
> > sizeof(u32));
> > > +
> > > +                   return *(u64 *) &container;
> >
> > To be honest, I am not familiar with this part, just want to remind two things.
> >
> > Firstly, here you could use hi_lo_readq_relaxed(), seems to me,
> > hi_lo_readq_relaxed() does the same thing with above section and you don't
> > need to define the structure etm_quad_split().
> >
> > Secondly, IIUC, a main problem with splitting 64-bit access into two 32-bit
> > accesses is breaking atomicity.  If here have race condition between reading and
> > writing 64-bit registers, we need to consider to use spinlock for register
> > accessing.

The drivers ensure that:-
a) the ETM is only accessed by the CPU it is associated with.
b) there are appropriate locks in place while the ETM is programmed up.

Moreover, the 64 bit registers are programmed only when the device is
disabled - as required by the spec, so there cannot be a case of using
"half" an updated value as all the required registers are programmed
before the device is finally enabled to begin tracing

Regards

Mike


>
> That shouldn't be an issue. 32-bit memory-mapped interface is
> normal for ETMs. The peripheral bus interface to ETMs (debug APB)
> is maximum 32-bit. The ETM architecture defines that registers are
> only single-copy atomic up to 32-bit. This is generally true of
> CoreSight programming registers.
>
> What typically happens to a 64-bit read/write from a CPU is that it
> is split into two 32-bit accesses by a downsizing bridge on the path
> from the main system interconnect to the debug APB (see 4.3.5
> in the ETM architecture spec). In this case, it sounds like there is no
> downsizing bridge. But because there should be no existing code
> relying on 64-bit atomic access to ETM registers, it should be safe
> to split the accesses in software without worrying about atomicity.
>
> Overall the ETM configuration involves complicated interactions
> between multiple registers, so if you've got one CPU reading out the
> configuration while another CPU is writing it, you've likely got much
> bigger problems than 64-bit atomicity.
>
> Al
>
>
> >
> > I'd like to leave the second issue to Suzuki/Mike/James for comfirmation in case
> > I introduced complexity.
> >
> > > +           } else
> > > +                   return readq_relaxed(csa->base + offset);
> > > +   } else
> > > +           return read_etm4x_sysreg_offset(offset, true);
> >
> > Here need to add brackets:
> >
> >         } else {
> >                 return read_etm4x_sysreg_offset(offset, true);
> >         }
> >
> > > +}
> > > +
> > > +static inline void etm4x_relaxed_write64(struct csdev_access *csa,
> > > +u64 quad, unsigned int offset) {
> > > +   if (csa->io_mem) {
> > > +           /* split 64-bit writes into 2 consecutive 32-bit writes */
> > > +           if (csa->no_quad_mmio) {
> > > +                   struct etm_quad_split container;
> > > +
> > > +                   *(u64 *) &container = quad;
> > > +
> > > +                   etm4x_relaxed_write32(csa, container.lsw, offset);
> > > +                   etm4x_relaxed_write32(csa, container.msw, offset +
> > sizeof(u32));
> > > +           } else
> > > +                   writeq_relaxed(quad, csa->base + offset);
> > > +   } else
> > > +           write_etm4x_sysreg_offset(quad, offset, true);          \
> >
> > Ditto.  Please drop the character '\' at the end of the line.
> >
> > Thanks,
> > Leo
> >
> > P.s. I have a ADLink AVA platform (Ampere Altra SoC with 32 CPUs), I am glad to
> > give a test if you can confirm this patch set can apply on it (and please clarify if
> > there have any prerequisite for firmwares).
> >
> > > +}
> > > +
> > >  static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)  {
> > >     return drvdata->arch >= ETM_ARCH_ETE;
> > > --
> > > 2.25.1
> > >
> > _______________________________________________
> > CoreSight mailing list -- coresight@lists.linaro.org To unsubscribe send an email
> > to coresight-leave@lists.linaro.org
Leo Yan March 8, 2023, 11:54 a.m. UTC | #4
On Wed, Mar 08, 2023 at 11:26:48AM +0000, Mike Leach wrote:

[...]

> > > > +static inline u64 etm4x_relaxed_read64(struct csdev_access *csa,
> > > > +unsigned int offset) {
> > > > +   if (csa->io_mem) {
> > > > +           if (csa->no_quad_mmio) {
> > > > +                   /* split 64-bit reads into 2 consecutive 32-bit reads */
> > > > +                   struct etm_quad_split container;
> > > > +
> > > > +                   container.lsw = etm4x_read32(csa, offset);
> > > > +                   container.msw = etm4x_read32(csa, offset +
> > > sizeof(u32));
> > > > +
> > > > +                   return *(u64 *) &container;
> > >
> > > To be honest, I am not familiar with this part, just want to remind two things.
> > >
> > > Firstly, here you could use hi_lo_readq_relaxed(), seems to me,
> > > hi_lo_readq_relaxed() does the same thing with above section and you don't
> > > need to define the structure etm_quad_split().
> > >
> > > Secondly, IIUC, a main problem with splitting 64-bit access into two 32-bit
> > > accesses is breaking atomicity.  If here have race condition between reading and
> > > writing 64-bit registers, we need to consider to use spinlock for register
> > > accessing.
> 
> The drivers ensure that:-
> a) the ETM is only accessed by the CPU it is associated with.
> b) there are appropriate locks in place while the ETM is programmed up.
> 
> Moreover, the 64 bit registers are programmed only when the device is
> disabled - as required by the spec, so there cannot be a case of using
> "half" an updated value as all the required registers are programmed
> before the device is finally enabled to begin tracing

Thanks a lot for confirmation, Al and Mike.  So the second issue is
not valid anymore and please ignore it.

Thanks,
Leo

> Regards
> 
> Mike
> 
> 
> >
> > That shouldn't be an issue. 32-bit memory-mapped interface is
> > normal for ETMs. The peripheral bus interface to ETMs (debug APB)
> > is maximum 32-bit. The ETM architecture defines that registers are
> > only single-copy atomic up to 32-bit. This is generally true of
> > CoreSight programming registers.
> >
> > What typically happens to a 64-bit read/write from a CPU is that it
> > is split into two 32-bit accesses by a downsizing bridge on the path
> > from the main system interconnect to the debug APB (see 4.3.5
> > in the ETM architecture spec). In this case, it sounds like there is no
> > downsizing bridge. But because there should be no existing code
> > relying on 64-bit atomic access to ETM registers, it should be safe
> > to split the accesses in software without worrying about atomicity.
> >
> > Overall the ETM configuration involves complicated interactions
> > between multiple registers, so if you've got one CPU reading out the
> > configuration while another CPU is writing it, you've likely got much
> > bigger problems than 64-bit atomicity.
> >
> > Al
> >
> >
> > >
> > > I'd like to leave the second issue to Suzuki/Mike/James for comfirmation in case
> > > I introduced complexity.
> > >
> > > > +           } else
> > > > +                   return readq_relaxed(csa->base + offset);
> > > > +   } else
> > > > +           return read_etm4x_sysreg_offset(offset, true);
> > >
> > > Here need to add brackets:
> > >
> > >         } else {
> > >                 return read_etm4x_sysreg_offset(offset, true);
> > >         }
> > >
> > > > +}
> > > > +
> > > > +static inline void etm4x_relaxed_write64(struct csdev_access *csa,
> > > > +u64 quad, unsigned int offset) {
> > > > +   if (csa->io_mem) {
> > > > +           /* split 64-bit writes into 2 consecutive 32-bit writes */
> > > > +           if (csa->no_quad_mmio) {
> > > > +                   struct etm_quad_split container;
> > > > +
> > > > +                   *(u64 *) &container = quad;
> > > > +
> > > > +                   etm4x_relaxed_write32(csa, container.lsw, offset);
> > > > +                   etm4x_relaxed_write32(csa, container.msw, offset +
> > > sizeof(u32));
> > > > +           } else
> > > > +                   writeq_relaxed(quad, csa->base + offset);
> > > > +   } else
> > > > +           write_etm4x_sysreg_offset(quad, offset, true);          \
> > >
> > > Ditto.  Please drop the character '\' at the end of the line.
> > >
> > > Thanks,
> > > Leo
> > >
> > > P.s. I have a ADLink AVA platform (Ampere Altra SoC with 32 CPUs), I am glad to
> > > give a test if you can confirm this patch set can apply on it (and please clarify if
> > > there have any prerequisite for firmwares).
> > >
> > > > +}
> > > > +
> > > >  static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)  {
> > > >     return drvdata->arch >= ETM_ARCH_ETE;
> > > > --
> > > > 2.25.1
> > > >
> > > _______________________________________________
> > > CoreSight mailing list -- coresight@lists.linaro.org To unsubscribe send an email
> > > to coresight-leave@lists.linaro.org
> 
> 
> 
> -- 
> Mike Leach
> Principal Engineer, ARM Ltd.
> Manchester Design Centre. UK
Steve Clevenger March 8, 2023, 7:08 p.m. UTC | #5
Hi Leo,

Thanks for your feedback. Comments inline.

Steve

On 3/8/2023 3:54 AM, Leo Yan wrote:
> On Wed, Mar 08, 2023 at 11:26:48AM +0000, Mike Leach wrote:
> 
> [...]
> 
>>>>> +static inline u64 etm4x_relaxed_read64(struct csdev_access *csa,
>>>>> +unsigned int offset) {
>>>>> +   if (csa->io_mem) {
>>>>> +           if (csa->no_quad_mmio) {
>>>>> +                   /* split 64-bit reads into 2 consecutive 32-bit reads */
>>>>> +                   struct etm_quad_split container;
>>>>> +
>>>>> +                   container.lsw = etm4x_read32(csa, offset);
>>>>> +                   container.msw = etm4x_read32(csa, offset +
>>>> sizeof(u32));
>>>>> +
>>>>> +                   return *(u64 *) &container;
>>>>
>>>> To be honest, I am not familiar with this part, just want to remind two things.
>>>>
>>>> Firstly, here you could use hi_lo_readq_relaxed(), seems to me,
>>>> hi_lo_readq_relaxed() does the same thing with above section and you don't
>>>> need to define the structure etm_quad_split().

I'm familiar with this interface. The reason I chose not to use it is
this solution is configured at compile time. The writeq_relaxed (used by
etm4x_relaxed_write64) and readq_relaxed (used by etm4x_relaxed_read64)
otherwise default to 64-bit access. I was uncertain how a compile time
change would go over with the maintainers. Correct me, but compile time
configurations in the kernel seem to be related to ARM64 erratum, versus
manufacturer specific?

My take (based on limited knowledge) of the changes necessary to support
a compile time decision seemed to exceed the changes compared with the
implementation I submitted upstream. If this becomes a sticking point,
let me know.

>>>>
>>>> Secondly, IIUC, a main problem with splitting 64-bit access into two 32-bit
>>>> accesses is breaking atomicity.  If here have race condition between reading and
>>>> writing 64-bit registers, we need to consider to use spinlock for register
>>>> accessing.
>>
>> The drivers ensure that:-
>> a) the ETM is only accessed by the CPU it is associated with.
>> b) there are appropriate locks in place while the ETM is programmed up.
>>
>> Moreover, the 64 bit registers are programmed only when the device is
>> disabled - as required by the spec, so there cannot be a case of using
>> "half" an updated value as all the required registers are programmed
>> before the device is finally enabled to begin tracing
> 
> Thanks a lot for confirmation, Al and Mike.  So the second issue is
> not valid anymore and please ignore it.
> 
> Thanks,
> Leo
> 
>> Regards
>>
>> Mike
>>
>>
>>>
>>> That shouldn't be an issue. 32-bit memory-mapped interface is
>>> normal for ETMs. The peripheral bus interface to ETMs (debug APB)
>>> is maximum 32-bit. The ETM architecture defines that registers are
>>> only single-copy atomic up to 32-bit. This is generally true of
>>> CoreSight programming registers.
>>>
>>> What typically happens to a 64-bit read/write from a CPU is that it
>>> is split into two 32-bit accesses by a downsizing bridge on the path
>>> from the main system interconnect to the debug APB (see 4.3.5
>>> in the ETM architecture spec). In this case, it sounds like there is no
>>> downsizing bridge. But because there should be no existing code
>>> relying on 64-bit atomic access to ETM registers, it should be safe
>>> to split the accesses in software without worrying about atomicity.
>>>
>>> Overall the ETM configuration involves complicated interactions
>>> between multiple registers, so if you've got one CPU reading out the
>>> configuration while another CPU is writing it, you've likely got much
>>> bigger problems than 64-bit atomicity.
>>>
>>> Al
>>>
>>>
>>>>
>>>> I'd like to leave the second issue to Suzuki/Mike/James for comfirmation in case
>>>> I introduced complexity.
>>>>
>>>>> +           } else
>>>>> +                   return readq_relaxed(csa->base + offset);
>>>>> +   } else
>>>>> +           return read_etm4x_sysreg_offset(offset, true);
>>>>
>>>> Here need to add brackets:
>>>>
>>>>         } else {
>>>>                 return read_etm4x_sysreg_offset(offset, true);
>>>>         }
>>>>
>>>>> +}
>>>>> +
>>>>> +static inline void etm4x_relaxed_write64(struct csdev_access *csa,
>>>>> +u64 quad, unsigned int offset) {
>>>>> +   if (csa->io_mem) {
>>>>> +           /* split 64-bit writes into 2 consecutive 32-bit writes */
>>>>> +           if (csa->no_quad_mmio) {
>>>>> +                   struct etm_quad_split container;
>>>>> +
>>>>> +                   *(u64 *) &container = quad;
>>>>> +
>>>>> +                   etm4x_relaxed_write32(csa, container.lsw, offset);
>>>>> +                   etm4x_relaxed_write32(csa, container.msw, offset +
>>>> sizeof(u32));
>>>>> +           } else
>>>>> +                   writeq_relaxed(quad, csa->base + offset);
>>>>> +   } else
>>>>> +           write_etm4x_sysreg_offset(quad, offset, true);          \
>>>>
>>>> Ditto.  Please drop the character '\' at the end of the line.
>>>>
>>>> Thanks,
>>>> Leo
>>>>
>>>> P.s. I have a ADLink AVA platform (Ampere Altra SoC with 32 CPUs), I am glad to
>>>> give a test if you can confirm this patch set can apply on it (and please clarify if
>>>> there have any prerequisite for firmwares).
>>>>
>>>>> +}
>>>>> +
>>>>>  static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)  {
>>>>>     return drvdata->arch >= ETM_ARCH_ETE;
>>>>> --
>>>>> 2.25.1
>>>>>
>>>> _______________________________________________
>>>> CoreSight mailing list -- coresight@lists.linaro.org To unsubscribe send an email
>>>> to coresight-leave@lists.linaro.org
>>
>>
>>
>> -- 
>> Mike Leach
>> Principal Engineer, ARM Ltd.
>> Manchester Design Centre. UK
Leo Yan March 9, 2023, 12:11 p.m. UTC | #6
Hi Steve,

On Wed, Mar 08, 2023 at 11:08:34AM -0800, Steve Clevenger wrote:

[...]

> I'm familiar with this interface. The reason I chose not to use it is
> this solution is configured at compile time. The writeq_relaxed (used by
> etm4x_relaxed_write64) and readq_relaxed (used by etm4x_relaxed_read64)
> otherwise default to 64-bit access. I was uncertain how a compile time
> change would go over with the maintainers. Correct me, but compile time
> configurations in the kernel seem to be related to ARM64 erratum, versus
> manufacturer specific?
> 
> My take (based on limited knowledge) of the changes necessary to support
> a compile time decision seemed to exceed the changes compared with the
> implementation I submitted upstream. If this becomes a sticking point,
> let me know.

I am suggesting something like below:

#include <linux/io-64-nonatomic-hi-lo.h>

static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
{
	if (csa->io_mem) {
		if (csa->no_quad_mmio)
                        return hi_lo_readq_relaxed(csa->base + offset);
		else
			return readq_relaxed(csa->base + offset);
	} else {
		return read_etm4x_sysreg_offset(offset, true);
        }
}

I'm not saying to play magic on {writeq|readq}_relaxed(), would it be
fine to directly call hi_lo_readq_relaxed()?

Thanks,
Leo
Steve Clevenger March 9, 2023, 8:46 p.m. UTC | #7
Hi Leo,

Comments below.

Steve

On 3/9/2023 4:11 AM, Leo Yan wrote:
> Hi Steve,
> 
> On Wed, Mar 08, 2023 at 11:08:34AM -0800, Steve Clevenger wrote:
> 
> [...]
> 
>> I'm familiar with this interface. The reason I chose not to use it is
>> this solution is configured at compile time. The writeq_relaxed (used by
>> etm4x_relaxed_write64) and readq_relaxed (used by etm4x_relaxed_read64)
>> otherwise default to 64-bit access. I was uncertain how a compile time
>> change would go over with the maintainers. Correct me, but compile time
>> configurations in the kernel seem to be related to ARM64 erratum, versus
>> manufacturer specific?
>>
>> My take (based on limited knowledge) of the changes necessary to support
>> a compile time decision seemed to exceed the changes compared with the
>> implementation I submitted upstream. If this becomes a sticking point,
>> let me know.
> 
> I am suggesting something like below:
> 
> #include <linux/io-64-nonatomic-hi-lo.h>
> 
> static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
> {
> 	if (csa->io_mem) {
> 		if (csa->no_quad_mmio)
>                         return hi_lo_readq_relaxed(csa->base + offset);
> 		else
> 			return readq_relaxed(csa->base + offset);
> 	} else {
> 		return read_etm4x_sysreg_offset(offset, true);
>         }
> }
> 
> I'm not saying to play magic on {writeq|readq}_relaxed(), would it be
> fine to directly call hi_lo_readq_relaxed()?

I can do this, and it would work fine. Without compile protection, one
concern is the silent inclusion of io-64-nonatomic-hi-lo.h definitions
for readq_relaxed/writeq_relaxed if the include hierarchy gets changed.
The only (minor) risk I see is to performance.

Second, it seems to me the hi_lo and lo_hi macros were intended to deal
with endianness. IMO, the implementation I offered is agnostic without a
hint to endianness or atomicity. The CoreSight maintainer audience is
limited and already aware of these issues, so I'll make the change for you.

> 
> Thanks,
> Leo
Leo Yan March 10, 2023, 1:35 a.m. UTC | #8
On Thu, Mar 09, 2023 at 12:46:55PM -0800, Steve Clevenger wrote:

[...]

> > #include <linux/io-64-nonatomic-hi-lo.h>
> > 
> > static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
> > {
> > 	if (csa->io_mem) {
> > 		if (csa->no_quad_mmio)
> >                         return hi_lo_readq_relaxed(csa->base + offset);
> > 		else
> > 			return readq_relaxed(csa->base + offset);
> > 	} else {
> > 		return read_etm4x_sysreg_offset(offset, true);
> >         }
> > }
> > 
> > I'm not saying to play magic on {writeq|readq}_relaxed(), would it be
> > fine to directly call hi_lo_readq_relaxed()?
> 
> I can do this, and it would work fine. Without compile protection, one
> concern is the silent inclusion of io-64-nonatomic-hi-lo.h definitions
> for readq_relaxed/writeq_relaxed if the include hierarchy gets changed.
> The only (minor) risk I see is to performance.

Yeah, we could include io.h ahead io-64-nonatomic-hi-lo.h:

  #include <linux/io.h>
  #include <linux/io-64-nonatomic-lo-hi.h>

Thus we can get the definitions for {readq|writeq}_relaxed() from
"io.h".

> Second, it seems to me the hi_lo and lo_hi macros were intended to deal
> with endianness. IMO, the implementation I offered is agnostic without a
> hint to endianness or atomicity.

Seems to me, kernel functions with explict endianness naming is not bad
thing :)

> The CoreSight maintainer audience is
> limited and already aware of these issues, so I'll make the change for you.

Thanks!  Suzuki, Mike, I think it's good to get your opinion before
Steve can proceed for updating patch.

Leo
Steve Clevenger March 10, 2023, 4:55 p.m. UTC | #9
On 3/9/2023 5:35 PM, Leo Yan wrote:
> On Thu, Mar 09, 2023 at 12:46:55PM -0800, Steve Clevenger wrote:
> 
> [...]
> 
>>> #include <linux/io-64-nonatomic-hi-lo.h>
>>>
>>> static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
>>> {
>>> 	if (csa->io_mem) {
>>> 		if (csa->no_quad_mmio)
>>>                         return hi_lo_readq_relaxed(csa->base + offset);
>>> 		else
>>> 			return readq_relaxed(csa->base + offset);
>>> 	} else {
>>> 		return read_etm4x_sysreg_offset(offset, true);
>>>         }
>>> }
>>>
>>> I'm not saying to play magic on {writeq|readq}_relaxed(), would it be
>>> fine to directly call hi_lo_readq_relaxed()?
>>
>> I can do this, and it would work fine. Without compile protection, one
>> concern is the silent inclusion of io-64-nonatomic-hi-lo.h definitions
>> for readq_relaxed/writeq_relaxed if the include hierarchy gets changed.
>> The only (minor) risk I see is to performance.
> 
> Yeah, we could include io.h ahead io-64-nonatomic-hi-lo.h:
> 
>   #include <linux/io.h>
>   #include <linux/io-64-nonatomic-lo-hi.h>
> 
> Thus we can get the definitions for {readq|writeq}_relaxed() from
> "io.h".

It seems the actual read_relaxed/write_relaxed definitions come from the
asm-generic/io.h version already in the include hierarchy without adding
linux/io.h. A more foolproof protection is to check whether these are
defined before the io-64-nonatomic-hi-lo.h include and error out if not.

> 
>> Second, it seems to me the hi_lo and lo_hi macros were intended to deal
>> with endianness. IMO, the implementation I offered is agnostic without a
>> hint to endianness or atomicity.
> 
> Seems to me, kernel functions with explict endianness naming is not bad
> thing :)
> 
>> The CoreSight maintainer audience is
>> limited and already aware of these issues, so I'll make the change for you.
> 
> Thanks!  Suzuki, Mike, I think it's good to get your opinion before
> Steve can proceed for updating patch.
> 
> Leo
Steve Clevenger March 13, 2023, 4:14 p.m. UTC | #10
It seems there's a bit of a lull in this patch discussion. Leo, I'll
resubmit the patch with the compile protection I suggested in my last
response.

Steve

On 3/10/2023 8:55 AM, Steve Clevenger wrote:
> 
> 
> On 3/9/2023 5:35 PM, Leo Yan wrote:
>> On Thu, Mar 09, 2023 at 12:46:55PM -0800, Steve Clevenger wrote:
>>
>> [...]
>>
>>>> #include <linux/io-64-nonatomic-hi-lo.h>
>>>>
>>>> static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
>>>> {
>>>> 	if (csa->io_mem) {
>>>> 		if (csa->no_quad_mmio)
>>>>                         return hi_lo_readq_relaxed(csa->base + offset);
>>>> 		else
>>>> 			return readq_relaxed(csa->base + offset);
>>>> 	} else {
>>>> 		return read_etm4x_sysreg_offset(offset, true);
>>>>         }
>>>> }
>>>>
>>>> I'm not saying to play magic on {writeq|readq}_relaxed(), would it be
>>>> fine to directly call hi_lo_readq_relaxed()?
>>>
>>> I can do this, and it would work fine. Without compile protection, one
>>> concern is the silent inclusion of io-64-nonatomic-hi-lo.h definitions
>>> for readq_relaxed/writeq_relaxed if the include hierarchy gets changed.
>>> The only (minor) risk I see is to performance.
>>
>> Yeah, we could include io.h ahead io-64-nonatomic-hi-lo.h:
>>
>>   #include <linux/io.h>
>>   #include <linux/io-64-nonatomic-lo-hi.h>
>>
>> Thus we can get the definitions for {readq|writeq}_relaxed() from
>> "io.h".
> 
> It seems the actual read_relaxed/write_relaxed definitions come from the
> asm-generic/io.h version already in the include hierarchy without adding
> linux/io.h. A more foolproof protection is to check whether these are
> defined before the io-64-nonatomic-hi-lo.h include and error out if not.
> 
>>
>>> Second, it seems to me the hi_lo and lo_hi macros were intended to deal
>>> with endianness. IMO, the implementation I offered is agnostic without a
>>> hint to endianness or atomicity.
>>
>> Seems to me, kernel functions with explict endianness naming is not bad
>> thing :)
>>
>>> The CoreSight maintainer audience is
>>> limited and already aware of these issues, so I'll make the change for you.
>>
>> Thanks!  Suzuki, Mike, I think it's good to get your opinion before
>> Steve can proceed for updating patch.
>>
>> Leo
diff mbox series

Patch

diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h
index 434f4e95ee17..17457ec71876 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.h
+++ b/drivers/hwtracing/coresight/coresight-etm4x.h
@@ -539,11 +539,6 @@ 
 		 readl_relaxed((csa)->base + (offset)) :		\
 		 read_etm4x_sysreg_offset((offset), false)))
 
-#define etm4x_relaxed_read64(csa, offset)				\
-	((u64)((csa)->io_mem ?						\
-		 readq_relaxed((csa)->base + (offset)) :		\
-		 read_etm4x_sysreg_offset((offset), true)))
-
 #define etm4x_read32(csa, offset)					\
 	({								\
 		u32 __val = etm4x_relaxed_read32((csa), (offset));	\
@@ -567,15 +562,6 @@ 
 						  false);		\
 	} while (0)
 
-#define etm4x_relaxed_write64(csa, val, offset)				\
-	do {								\
-		if ((csa)->io_mem)					\
-			writeq_relaxed((val), (csa)->base + (offset));	\
-		else							\
-			write_etm4x_sysreg_offset((val), (offset),	\
-						  true);		\
-	} while (0)
-
 #define etm4x_write32(csa, val, offset)					\
 	do {								\
 		__io_bw();						\
@@ -1091,6 +1077,50 @@  void etm4_config_trace_mode(struct etmv4_config *config);
 u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);
 void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
 
+/* 64-bit aligned to convert 64-bit access to 2 ea. 32-bit access */
+#pragma pack(push, 8)
+
+struct etm_quad_split {
+	u32 lsw;
+	u32 msw;
+};
+
+#pragma pack(pop)
+
+static inline u64 etm4x_relaxed_read64(struct csdev_access *csa, unsigned int offset)
+{
+	if (csa->io_mem) {
+		if (csa->no_quad_mmio) {
+			/* split 64-bit reads into 2 consecutive 32-bit reads */
+			struct etm_quad_split container;
+
+			container.lsw = etm4x_read32(csa, offset);
+			container.msw = etm4x_read32(csa, offset + sizeof(u32));
+
+			return *(u64 *) &container;
+		} else
+			return readq_relaxed(csa->base + offset);
+	} else
+		return read_etm4x_sysreg_offset(offset, true);
+}
+
+static inline void etm4x_relaxed_write64(struct csdev_access *csa, u64 quad, unsigned int offset)
+{
+	if (csa->io_mem) {
+		/* split 64-bit writes into 2 consecutive 32-bit writes */
+		if (csa->no_quad_mmio) {
+			struct etm_quad_split container;
+
+			*(u64 *) &container = quad;
+
+			etm4x_relaxed_write32(csa, container.lsw, offset);
+			etm4x_relaxed_write32(csa, container.msw, offset + sizeof(u32));
+		} else
+			writeq_relaxed(quad, csa->base + offset);
+	} else
+		write_etm4x_sysreg_offset(quad, offset, true);		\
+}
+
 static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)
 {
 	return drvdata->arch >= ETM_ARCH_ETE;