diff mbox series

parisc: Restore __ldcw_align for PA-RISC 2.0 processors

Message ID ZQnfrGKvvIs0KLvf@mx3210.localdomain (mailing list archive)
State Accepted, archived
Headers show
Series parisc: Restore __ldcw_align for PA-RISC 2.0 processors | expand

Commit Message

John David Anglin Sept. 19, 2023, 5:51 p.m. UTC
Back in 2005, Kyle McMartin removed the 16-byte alignment for ldcw
semaphores on PA 2.0 machines (CONFIG_PA20). This broke spinlocks
on pre PA8800 processors. The main symptom was random faults in
mmap'd memory (e.g., gcc compilations, etc).

Unfortunately, the errata for this ldcw change is lost.

The issue is the 16-byte alignment required for ldcw semaphore
instructions can only be reduced to natural alignment when the ldcw
operation can be handled coherently in cache. Only PA8800 and
PA8900 processors actually support doing the operation in cache.

Aligning the spinlock dynamically adds two integer instructions
to each spinlock.

Tested on rp3440, c8000 and a500.

Signed-off-by: John David Anglin <dave.anglin@bell.net>
---

Comments

Sam James Sept. 19, 2023, 6:31 p.m. UTC | #1
John David Anglin <dave@parisc-linux.org> writes:

> [[PGP Signed Part:Undecided]]
> Back in 2005, Kyle McMartin removed the 16-byte alignment for ldcw
> semaphores on PA 2.0 machines (CONFIG_PA20). This broke spinlocks
> on pre PA8800 processors. The main symptom was random faults in
> mmap'd memory (e.g., gcc compilations, etc).
>
> Unfortunately, the errata for this ldcw change is lost.
>
> The issue is the 16-byte alignment required for ldcw semaphore
> instructions can only be reduced to natural alignment when the ldcw
> operation can be handled coherently in cache. Only PA8800 and
> PA8900 processors actually support doing the operation in cache.
>
> Aligning the spinlock dynamically adds two integer instructions
> to each spinlock.
>
> Tested on rp3440, c8000 and a500.

Could you include a 'Fixes: <whatever the ancient commit hash is>'?

Also, what a find!

>
> Signed-off-by: John David Anglin <dave.anglin@bell.net>
> ---
>
> diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
> index 6d28b5514699..ee9e071859b2 100644
> --- a/arch/parisc/include/asm/ldcw.h
> +++ b/arch/parisc/include/asm/ldcw.h
> @@ -2,39 +2,42 @@
>  #ifndef __PARISC_LDCW_H
>  #define __PARISC_LDCW_H
>  
> -#ifndef CONFIG_PA20
>  /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd data,
>     and GCC only guarantees 8-byte alignment for stack locals, we can't
>     be assured of 16-byte alignment for atomic lock data even if we
>     specify "__attribute ((aligned(16)))" in the type declaration.  So,
>     we use a struct containing an array of four ints for the atomic lock
>     type and dynamically select the 16-byte aligned int from the array
> -   for the semaphore.  */
> +   for the semaphore. */
> +
> +/* From: "Jim Hull" <jim.hull of hp.com>
> +   I've attached a summary of the change, but basically, for PA 2.0, as
> +   long as the ",CO" (coherent operation) completer is implemented, then the
> +   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
> +   they only require "natural" alignment (4-byte for ldcw, 8-byte for
> +   ldcd).
> +
> +   Although the cache control hint is accepted by all PA 2.0 processors,
> +   it is only implemented on PA8800/PA8900 CPUs. Prior PA8X00 CPUs still
> +   require 16-byte alignment. If the address is unaligned, the operation
> +   of the instruction is undefined. The ldcw instruction does not generate
> +   unaligned data reference traps so misaligned accesses are not detected.
> +   This hid the problem for years. So, restore the 16-byte alignment dropped
> +   by Kyle McMartin in "Remove __ldcw_align for PA-RISC 2.0 processors". */
>  
>  #define __PA_LDCW_ALIGNMENT	16
> -#define __PA_LDCW_ALIGN_ORDER	4
>  #define __ldcw_align(a) ({					\
>  	unsigned long __ret = (unsigned long) &(a)->lock[0];	\
>  	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
>  		& ~(__PA_LDCW_ALIGNMENT - 1);			\
>  	(volatile unsigned int *) __ret;			\
>  })
> -#define __LDCW	"ldcw"
>  
> -#else /*CONFIG_PA20*/
> -/* From: "Jim Hull" <jim.hull of hp.com>
> -   I've attached a summary of the change, but basically, for PA 2.0, as
> -   long as the ",CO" (coherent operation) completer is specified, then the
> -   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
> -   they only require "natural" alignment (4-byte for ldcw, 8-byte for
> -   ldcd). */
> -
> -#define __PA_LDCW_ALIGNMENT	4
> -#define __PA_LDCW_ALIGN_ORDER	2
> -#define __ldcw_align(a) (&(a)->slock)
> +#ifdef CONFIG_PA20
>  #define __LDCW	"ldcw,co"
> -
> -#endif /*!CONFIG_PA20*/
> +#else
> +#define __LDCW	"ldcw"
> +#endif
>  
>  /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*.
>     We don't explicitly expose that "*a" may be written as reload
> diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
> index efd06a897c6a..7b986b09dba8 100644
> --- a/arch/parisc/include/asm/spinlock_types.h
> +++ b/arch/parisc/include/asm/spinlock_types.h
> @@ -9,15 +9,10 @@
>  #ifndef __ASSEMBLY__
>  
>  typedef struct {
> -#ifdef CONFIG_PA20
> -	volatile unsigned int slock;
> -# define __ARCH_SPIN_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED_VAL }
> -#else
>  	volatile unsigned int lock[4];
>  # define __ARCH_SPIN_LOCK_UNLOCKED	\
>  	{ { __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL, \
>  	    __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL } }
> -#endif
>  } arch_spinlock_t;
>  
>  
>
> [[End of PGP Signed Part]]
John David Anglin Sept. 19, 2023, 7:36 p.m. UTC | #2
On 2023-09-19 2:31 p.m., Sam James wrote:
> John David Anglin <dave@parisc-linux.org> writes:
>
>> [[PGP Signed Part:Undecided]]
>> Back in 2005, Kyle McMartin removed the 16-byte alignment for ldcw
>> semaphores on PA 2.0 machines (CONFIG_PA20). This broke spinlocks
>> on pre PA8800 processors. The main symptom was random faults in
>> mmap'd memory (e.g., gcc compilations, etc).
>>
>> Unfortunately, the errata for this ldcw change is lost.
>>
>> The issue is the 16-byte alignment required for ldcw semaphore
>> instructions can only be reduced to natural alignment when the ldcw
>> operation can be handled coherently in cache. Only PA8800 and
>> PA8900 processors actually support doing the operation in cache.
>>
>> Aligning the spinlock dynamically adds two integer instructions
>> to each spinlock.
>>
>> Tested on rp3440, c8000 and a500.
> Could you include a 'Fixes: <whatever the ancient commit hash is>'?
I looked but Kyle's commit predates git.  It was done to a cvs archive that I don't have.

It's available in the mail archive here:
https://lore.kernel.org/linux-parisc/20050609050702.GB4641@roadwarrior.mcmartin.ca/
>
> Also, what a find!
>
>> Signed-off-by: John David Anglin <dave.anglin@bell.net>
>> ---
>>
>> diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
>> index 6d28b5514699..ee9e071859b2 100644
>> --- a/arch/parisc/include/asm/ldcw.h
>> +++ b/arch/parisc/include/asm/ldcw.h
>> @@ -2,39 +2,42 @@
>>   #ifndef __PARISC_LDCW_H
>>   #define __PARISC_LDCW_H
>>   
>> -#ifndef CONFIG_PA20
>>   /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd data,
>>      and GCC only guarantees 8-byte alignment for stack locals, we can't
>>      be assured of 16-byte alignment for atomic lock data even if we
>>      specify "__attribute ((aligned(16)))" in the type declaration.  So,
>>      we use a struct containing an array of four ints for the atomic lock
>>      type and dynamically select the 16-byte aligned int from the array
>> -   for the semaphore.  */
>> +   for the semaphore. */
>> +
>> +/* From: "Jim Hull" <jim.hull of hp.com>
>> +   I've attached a summary of the change, but basically, for PA 2.0, as
>> +   long as the ",CO" (coherent operation) completer is implemented, then the
>> +   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
>> +   they only require "natural" alignment (4-byte for ldcw, 8-byte for
>> +   ldcd).
>> +
>> +   Although the cache control hint is accepted by all PA 2.0 processors,
>> +   it is only implemented on PA8800/PA8900 CPUs. Prior PA8X00 CPUs still
>> +   require 16-byte alignment. If the address is unaligned, the operation
>> +   of the instruction is undefined. The ldcw instruction does not generate
>> +   unaligned data reference traps so misaligned accesses are not detected.
>> +   This hid the problem for years. So, restore the 16-byte alignment dropped
>> +   by Kyle McMartin in "Remove __ldcw_align for PA-RISC 2.0 processors". */
>>   
>>   #define __PA_LDCW_ALIGNMENT	16
>> -#define __PA_LDCW_ALIGN_ORDER	4
>>   #define __ldcw_align(a) ({					\
>>   	unsigned long __ret = (unsigned long) &(a)->lock[0];	\
>>   	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
>>   		& ~(__PA_LDCW_ALIGNMENT - 1);			\
>>   	(volatile unsigned int *) __ret;			\
>>   })
>> -#define __LDCW	"ldcw"
>>   
>> -#else /*CONFIG_PA20*/
>> -/* From: "Jim Hull" <jim.hull of hp.com>
>> -   I've attached a summary of the change, but basically, for PA 2.0, as
>> -   long as the ",CO" (coherent operation) completer is specified, then the
>> -   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
>> -   they only require "natural" alignment (4-byte for ldcw, 8-byte for
>> -   ldcd). */
>> -
>> -#define __PA_LDCW_ALIGNMENT	4
>> -#define __PA_LDCW_ALIGN_ORDER	2
>> -#define __ldcw_align(a) (&(a)->slock)
>> +#ifdef CONFIG_PA20
>>   #define __LDCW	"ldcw,co"
>> -
>> -#endif /*!CONFIG_PA20*/
>> +#else
>> +#define __LDCW	"ldcw"
>> +#endif
>>   
>>   /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*.
>>      We don't explicitly expose that "*a" may be written as reload
>> diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
>> index efd06a897c6a..7b986b09dba8 100644
>> --- a/arch/parisc/include/asm/spinlock_types.h
>> +++ b/arch/parisc/include/asm/spinlock_types.h
>> @@ -9,15 +9,10 @@
>>   #ifndef __ASSEMBLY__
>>   
>>   typedef struct {
>> -#ifdef CONFIG_PA20
>> -	volatile unsigned int slock;
>> -# define __ARCH_SPIN_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED_VAL }
>> -#else
>>   	volatile unsigned int lock[4];
>>   # define __ARCH_SPIN_LOCK_UNLOCKED	\
>>   	{ { __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL, \
>>   	    __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL } }
>> -#endif
>>   } arch_spinlock_t;
>>   
>>   
>>
>> [[End of PGP Signed Part]]
diff mbox series

Patch

diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
index 6d28b5514699..ee9e071859b2 100644
--- a/arch/parisc/include/asm/ldcw.h
+++ b/arch/parisc/include/asm/ldcw.h
@@ -2,39 +2,42 @@ 
 #ifndef __PARISC_LDCW_H
 #define __PARISC_LDCW_H
 
-#ifndef CONFIG_PA20
 /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd data,
    and GCC only guarantees 8-byte alignment for stack locals, we can't
    be assured of 16-byte alignment for atomic lock data even if we
    specify "__attribute ((aligned(16)))" in the type declaration.  So,
    we use a struct containing an array of four ints for the atomic lock
    type and dynamically select the 16-byte aligned int from the array
-   for the semaphore.  */
+   for the semaphore. */
+
+/* From: "Jim Hull" <jim.hull of hp.com>
+   I've attached a summary of the change, but basically, for PA 2.0, as
+   long as the ",CO" (coherent operation) completer is implemented, then the
+   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
+   they only require "natural" alignment (4-byte for ldcw, 8-byte for
+   ldcd).
+
+   Although the cache control hint is accepted by all PA 2.0 processors,
+   it is only implemented on PA8800/PA8900 CPUs. Prior PA8X00 CPUs still
+   require 16-byte alignment. If the address is unaligned, the operation
+   of the instruction is undefined. The ldcw instruction does not generate
+   unaligned data reference traps so misaligned accesses are not detected.
+   This hid the problem for years. So, restore the 16-byte alignment dropped
+   by Kyle McMartin in "Remove __ldcw_align for PA-RISC 2.0 processors". */
 
 #define __PA_LDCW_ALIGNMENT	16
-#define __PA_LDCW_ALIGN_ORDER	4
 #define __ldcw_align(a) ({					\
 	unsigned long __ret = (unsigned long) &(a)->lock[0];	\
 	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
 		& ~(__PA_LDCW_ALIGNMENT - 1);			\
 	(volatile unsigned int *) __ret;			\
 })
-#define __LDCW	"ldcw"
 
-#else /*CONFIG_PA20*/
-/* From: "Jim Hull" <jim.hull of hp.com>
-   I've attached a summary of the change, but basically, for PA 2.0, as
-   long as the ",CO" (coherent operation) completer is specified, then the
-   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
-   they only require "natural" alignment (4-byte for ldcw, 8-byte for
-   ldcd). */
-
-#define __PA_LDCW_ALIGNMENT	4
-#define __PA_LDCW_ALIGN_ORDER	2
-#define __ldcw_align(a) (&(a)->slock)
+#ifdef CONFIG_PA20
 #define __LDCW	"ldcw,co"
-
-#endif /*!CONFIG_PA20*/
+#else
+#define __LDCW	"ldcw"
+#endif
 
 /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*.
    We don't explicitly expose that "*a" may be written as reload
diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
index efd06a897c6a..7b986b09dba8 100644
--- a/arch/parisc/include/asm/spinlock_types.h
+++ b/arch/parisc/include/asm/spinlock_types.h
@@ -9,15 +9,10 @@ 
 #ifndef __ASSEMBLY__
 
 typedef struct {
-#ifdef CONFIG_PA20
-	volatile unsigned int slock;
-# define __ARCH_SPIN_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED_VAL }
-#else
 	volatile unsigned int lock[4];
 # define __ARCH_SPIN_LOCK_UNLOCKED	\
 	{ { __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL, \
 	    __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL } }
-#endif
 } arch_spinlock_t;