
parisc: Remove locking from TLB handler and set page accessed

Message ID 20180916233010.GA5591@mx3210.localdomain (mailing list archive)
State Accepted, archived
Series parisc: Remove locking from TLB handler and set page accessed

Commit Message

John David Anglin Sept. 16, 2018, 11:30 p.m. UTC
Locking was introduced into the TLB handlers primarily to ensure that updates
to the accessed and dirty bits in PTE entries were atomic.  However, this
has a significant impact on performance.  Also, it turned out that the user
page defines in pgtable.h always set the accessed bit, so it wasn't necessary
to update the accessed bit at all.  The impact of this is that the kernel
doesn't know if a page has been accessed or not.

This patch removes the locking from the TLB handlers.  We update the page
accessed bit in the page table entries using a stb instruction that doesn't
affect the dirty bit.  Thus, an access can't clobber the dirty bit.

With this change, the accessed bit in user PTE entries is now updated when a
page is accessed and hopefully this will improve performance.
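
For readers who don't follow PA-RISC assembly, the idea can be sketched in C
roughly as below.  This is an illustration only, not the kernel code; the bit
positions and the little-endian byte arithmetic are placeholder assumptions,
not the real parisc PTE layout.

#include <stdint.h>

#define EX_ACCESSED	(1u << 21)	/* assumed: sits in byte 2 of the PTE */
#define EX_DIRTY	(1u << 30)	/* assumed: sits in byte 3 of the PTE */

static void ex_set_accessed(uint32_t *ptep)
{
	uint32_t pte = *ptep;

	if (pte & EX_ACCESSED)
		return;		/* already set: don't dirty the cache line */

	/* Store only the byte that holds the accessed bit.  A concurrent
	 * store to the byte holding the dirty bit cannot be lost to this
	 * one-byte store. */
	((uint8_t *)ptep)[21 / 8] |= (uint8_t)(EX_ACCESSED >> ((21 / 8) * 8));
}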


Signed-off-by: John David Anglin <dave.anglin@bell.net>

Comments

Mikulas Patocka Sept. 20, 2018, 2:42 p.m. UTC | #1
But there's code like this that needs to be changed. It needs to first set 
the PTE and then purge the tlb - otherwise the old pte value could be 
reloaded by a different CPU after purge_tlb_entries() because the TLB 
reload code no longer takes the lock.

static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        unsigned long flags;
        spin_lock_irqsave(&pa_tlb_lock, flags);
        purge_tlb_entries(mm, addr);
        set_pte(ptep, pte_wrprotect(*ptep));
        spin_unlock_irqrestore(&pa_tlb_lock, flags);
}
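
For concreteness, the reordering described above would look roughly like this,
the same function with the two lines swapped and nothing else changed (a
sketch, not a tested patch):

static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        unsigned long flags;
        spin_lock_irqsave(&pa_tlb_lock, flags);
        /* write the new PTE value first ... */
        set_pte(ptep, pte_wrprotect(*ptep));
        /* ... then purge, so a lock-free TLB reload can only pick up the new value */
        purge_tlb_entries(mm, addr);
        spin_unlock_irqrestore(&pa_tlb_lock, flags);
}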



The accessed bit is in the same byte as the present bit, so what will 
happen if the TLB fault races with pte clearing?

Linux also encodes the swap location in non-present PTEs - could that race 
with the update of the byte containing the accessed bit?

Perhaps the best solution would be to store the accessed and dirty flags 
in the two topmost bytes of the PTE and make sure that these bytes are not 
used for anything else.

Mikulas


On Sun, 16 Sep 2018, John David Anglin wrote:

> Locking was introduced into the TLB handlers primarily to ensure that updates
> to the accessed and dirty bits in PTE entries were atomic.  However, this
> has a significant impact on performance.  Also, it turned out that the user
> page defines in pgtable.h always set the accessed bit, so it wasn't necessary
> to update the accessed bit at all.  The impact of this is that the kernel
> doesn't know if a page has been accessed or not.
> 
> This patch removes the locking from the TLB handlers.  We update the page
> accessed bit in the page table entries using a stb instruction that doesn't
> affect the dirty bit.  Thus, an access can't clobber the dirty bit.
> 
> With this change, the accessed bit in user PTE entries is now updated when a
> page is accessed and hopefully this will improve performance.
> 
> diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
> index fa6b7c78f18a..7efce946ba04 100644
> --- a/arch/parisc/include/asm/pgtable.h
> +++ b/arch/parisc/include/asm/pgtable.h
> @@ -202,7 +202,7 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
>  #define _PAGE_HUGE     (1 << xlate_pabit(_PAGE_HPAGE_BIT))
>  #define _PAGE_USER     (1 << xlate_pabit(_PAGE_USER_BIT))
>  
> -#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_ACCESSED)
> +#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)
>  #define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
>  #define _PAGE_KERNEL_RO	(_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED)
>  #define _PAGE_KERNEL_EXEC	(_PAGE_KERNEL_RO | _PAGE_EXEC)
> @@ -227,22 +227,22 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
>  
>  #ifndef __ASSEMBLY__
>  
> -#define PAGE_NONE	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
> -#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED)
> +#define PAGE_NONE	__pgprot(_PAGE_PRESENT | _PAGE_USER)
> +#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE)
>  /* Others seem to make this executable, I don't know if that's correct
>     or not.  The stack is mapped this way though so this is necessary
>     in the short term - dhd@linuxcare.com, 2000-08-08 */
> -#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED)
> -#define PAGE_WRITEONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE | _PAGE_ACCESSED)
> -#define PAGE_EXECREAD   __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED)
> +#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ)
> +#define PAGE_WRITEONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE)
> +#define PAGE_EXECREAD   __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC)
>  #define PAGE_COPY       PAGE_EXECREAD
> -#define PAGE_RWX        __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED)
> +#define PAGE_RWX        __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
>  #define PAGE_KERNEL	__pgprot(_PAGE_KERNEL)
>  #define PAGE_KERNEL_EXEC	__pgprot(_PAGE_KERNEL_EXEC)
>  #define PAGE_KERNEL_RWX	__pgprot(_PAGE_KERNEL_RWX)
>  #define PAGE_KERNEL_RO	__pgprot(_PAGE_KERNEL_RO)
>  #define PAGE_KERNEL_UNC	__pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE)
> -#define PAGE_GATEWAY    __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ)
> +#define PAGE_GATEWAY    __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_GATEWAY| _PAGE_READ)
>  
>  
>  /*
> diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
> index 1b4732e20137..8039241b669b 100644
> --- a/arch/parisc/kernel/entry.S
> +++ b/arch/parisc/kernel/entry.S
> @@ -46,16 +46,6 @@
>  	.level 2.0
>  #endif
>  
> -	.import		pa_tlb_lock,data
> -	.macro  load_pa_tlb_lock reg
> -#if __PA_LDCW_ALIGNMENT > 4
> -	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
> -	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
> -#else
> -	load32	PA(pa_tlb_lock), \reg
> -#endif
> -	.endm
> -
>  	/* space_to_prot macro creates a prot id from a space id */
>  
>  #if (SPACEID_SHIFT) == 0
> @@ -462,47 +452,17 @@
>  	L2_ptep		\pgd,\pte,\index,\va,\fault
>  	.endm
>  
> -	/* Acquire pa_tlb_lock lock and recheck page is still present. */
> -	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
> -#ifdef CONFIG_SMP
> -	cmpib,COND(=),n	0,\spc,2f
> -	load_pa_tlb_lock \tmp
> -1:	LDCW		0(\tmp),\tmp1
> -	cmpib,COND(=)	0,\tmp1,1b
> -	nop
> -	LDREG		0(\ptp),\pte
> -	bb,<,n		\pte,_PAGE_PRESENT_BIT,2f
> -	b		\fault
> -	stw		 \spc,0(\tmp)
> -2:
> -#endif
> -	.endm
> -
> -	/* Release pa_tlb_lock lock without reloading lock address. */
> -	.macro		tlb_unlock0	spc,tmp
> -#ifdef CONFIG_SMP
> -	or,COND(=)	%r0,\spc,%r0
> -	sync
> -	or,COND(=)	%r0,\spc,%r0
> -	stw             \spc,0(\tmp)
> -#endif
> -	.endm
> -
> -	/* Release pa_tlb_lock lock. */
> -	.macro		tlb_unlock1	spc,tmp
> -#ifdef CONFIG_SMP
> -	load_pa_tlb_lock \tmp
> -	tlb_unlock0	\spc,\tmp
> -#endif
> -	.endm
> -
> -	/* Set the _PAGE_ACCESSED bit of the PTE.  Be clever and
> -	 * don't needlessly dirty the cache line if it was already set */
> +	/* Set the _PAGE_ACCESSED bit of the PTE without overwriting the
> +	 * dirty bit.  Fortunately, it resides in a different byte of
> +	 * the PTE than the dirty bit.  Be clever and don't needlessly
> +	 * dirty the cache line if it was already set */
>  	.macro		update_accessed	ptp,pte,tmp,tmp1
>  	ldi		_PAGE_ACCESSED,\tmp1
>  	or		\tmp1,\pte,\tmp
> +	/* Extract aligned byte from PTE containing page accessed bit */
> +	extru		\tmp,_PAGE_ACCESSED_BIT,8,\tmp
>  	and,COND(<>)	\tmp1,\pte,%r0
> -	STREG		\tmp,0(\ptp)
> +	stb		\tmp,__BITS_PER_LONG/8-2(\tmp)
>  	.endm
>  
>  	/* Set the dirty bit (and accessed bit).  No need to be
> @@ -1167,14 +1127,12 @@ dtlb_miss_20w:
>  
>  	L3_ptep		ptp,pte,t0,va,dtlb_check_alias_20w
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20w
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
>  	
>  	idtlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1193,14 +1151,12 @@ nadtlb_miss_20w:
>  
>  	L3_ptep		ptp,pte,t0,va,nadtlb_check_alias_20w
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20w
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
>  
>  	idtlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1221,7 +1177,6 @@ dtlb_miss_11:
>  
>  	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_11
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_11
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb_11	spc,pte,prot
> @@ -1234,7 +1189,6 @@ dtlb_miss_11:
>  
>  	mtsp		t1, %sr1	/* Restore sr1 */
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1254,7 +1208,6 @@ nadtlb_miss_11:
>  
>  	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_11
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_11
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb_11	spc,pte,prot
> @@ -1267,7 +1220,6 @@ nadtlb_miss_11:
>  
>  	mtsp		t1, %sr1	/* Restore sr1 */
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1287,7 +1239,6 @@ dtlb_miss_20:
>  
>  	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_20
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
> @@ -1296,7 +1247,6 @@ dtlb_miss_20:
>  
>  	idtlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1315,7 +1265,6 @@ nadtlb_miss_20:
>  
>  	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_20
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
> @@ -1324,7 +1273,6 @@ nadtlb_miss_20:
>  	
>  	idtlbt		pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1424,14 +1372,12 @@ itlb_miss_20w:
>  
>  	L3_ptep		ptp,pte,t0,va,itlb_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
>  	
>  	iitlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1448,14 +1394,12 @@ naitlb_miss_20w:
>  
>  	L3_ptep		ptp,pte,t0,va,naitlb_check_alias_20w
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20w
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
>  
>  	iitlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1476,7 +1420,6 @@ itlb_miss_11:
>  
>  	L2_ptep		ptp,pte,t0,va,itlb_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb_11	spc,pte,prot
> @@ -1489,7 +1432,6 @@ itlb_miss_11:
>  
>  	mtsp		t1, %sr1	/* Restore sr1 */
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1500,7 +1442,6 @@ naitlb_miss_11:
>  
>  	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_11
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_11
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb_11	spc,pte,prot
> @@ -1513,7 +1454,6 @@ naitlb_miss_11:
>  
>  	mtsp		t1, %sr1	/* Restore sr1 */
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1534,7 +1474,6 @@ itlb_miss_20:
>  
>  	L2_ptep		ptp,pte,t0,va,itlb_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
> @@ -1543,7 +1482,6 @@ itlb_miss_20:
>  
>  	iitlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1554,7 +1492,6 @@ naitlb_miss_20:
>  
>  	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_20
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20
>  	update_accessed	ptp,pte,t0,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
> @@ -1563,7 +1500,6 @@ naitlb_miss_20:
>  
>  	iitlbt          pte,prot
>  
> -	tlb_unlock1	spc,t0
>  	rfir
>  	nop
>  
> @@ -1586,14 +1522,12 @@ dbit_trap_20w:
>  
>  	L3_ptep		ptp,pte,t0,va,dbit_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
>  	update_dirty	ptp,pte,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
>  		
>  	idtlbt          pte,prot
>  
> -	tlb_unlock0	spc,t0
>  	rfir
>  	nop
>  #else
> @@ -1606,7 +1540,6 @@ dbit_trap_11:
>  
>  	L2_ptep		ptp,pte,t0,va,dbit_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
>  	update_dirty	ptp,pte,t1
>  
>  	make_insert_tlb_11	spc,pte,prot
> @@ -1619,7 +1552,6 @@ dbit_trap_11:
>  
>  	mtsp            t1, %sr1     /* Restore sr1 */
>  
> -	tlb_unlock0	spc,t0
>  	rfir
>  	nop
>  
> @@ -1630,7 +1562,6 @@ dbit_trap_20:
>  
>  	L2_ptep		ptp,pte,t0,va,dbit_fault
>  
> -	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
>  	update_dirty	ptp,pte,t1
>  
>  	make_insert_tlb	spc,pte,prot,t1
> @@ -1639,7 +1570,6 @@ dbit_trap_20:
>  	
>  	idtlbt		pte,prot
>  
> -	tlb_unlock0	spc,t0
>  	rfir
>  	nop
>  #endif
> 
> Signed-off-by: John David Anglin <dave.anglin@bell.net>
> 
>
Mikulas Patocka Sept. 20, 2018, 4:11 p.m. UTC | #2
On Thu, 20 Sep 2018, Mikulas Patocka wrote:

> But there's code like this that needs to be changed. It needs to first set 
> the PTE and then purge the tlb - otherwise the old pte value could be 
> reloaded by a different CPU after purge_tlb_entries() because the TLB 
> reload code no longer takes the lock.
> 
> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
> {
>         unsigned long flags;
>         spin_lock_irqsave(&pa_tlb_lock, flags);
>         purge_tlb_entries(mm, addr);
>         set_pte(ptep, pte_wrprotect(*ptep));
>         spin_unlock_irqrestore(&pa_tlb_lock, flags);
> }
> 
> 
> 
> The accessed bit is in the same byte as the present bit, so what will 
> happen if the TLB fault races with pte clearing?
> 
> Linux also encodes the swap location in non-present PTEs - could that race 
> with the update of the byte containing the accessed bit?
> 
> Perhaps the best solution would be to store the accessed and dirty flags 
> in the two topmost bytes of the PTE and make sure that these bytes are not 
> used for anything else.
> 
> Mikulas

And I realized that even if you don't modify the PTE, dropping the lock 
from the TLB handler is wrong.

If you drop the lock, the CPU can read valid PTE, then wait unlimited 
amount of time (due to some microarchitectural event) and then insert the 
PTE into TLB - long after the PTE was cleared and flushed by some other 
CPU.

If you drop the lock, you must stop relying on the instructions that 
broadcast TLB shootdown and you must send IPIs to flush TLBs just like on 
x86.

Mikulas
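
A rough sketch of the structure of the IPI-based approach mentioned above (all
helper names here are hypothetical, and purge_tlb_entries() merely stands in
for a purely local, non-broadcast purge; this is not existing parisc code):

#include <linux/smp.h>
#include <linux/mm.h>

struct ipi_tlb_flush_arg {		/* hypothetical helper type */
	struct mm_struct *mm;
	unsigned long addr;
};

static void ipi_purge_one(void *data)
{
	struct ipi_tlb_flush_arg *arg = data;

	/* stand-in for a local (non-broadcast) purge on this CPU */
	purge_tlb_entries(arg->mm, arg->addr);
}

static void flush_pte_via_ipi(struct mm_struct *mm, unsigned long addr)
{
	struct ipi_tlb_flush_arg arg = { .mm = mm, .addr = addr };

	/* run the purge on every CPU and wait for completion; note that
	 * on_each_cpu() must not be called with interrupts disabled */
	on_each_cpu(ipi_purge_one, &arg, 1);
}
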
John David Anglin Sept. 20, 2018, 5 p.m. UTC | #3
You ask good questions.  If the races that you describe are possible, this argues for continuing to use the lock.  It has never been clear to me whether or not the kernel changes page table entries while code is using them.

On 2018-09-20 10:42 AM, Mikulas Patocka wrote:
> But there's code like this that needs to be changed. It needs to first set
> the PTE and then purge the tlb - otherwise the old pte value could be
> reloaded by a different CPU after purge_tlb_entries() because the TLB
> reload code no longer takes the lock.
>
> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
> {
>          unsigned long flags;
>          spin_lock_irqsave(&pa_tlb_lock, flags);
>          purge_tlb_entries(mm, addr);
>          set_pte(ptep, pte_wrprotect(*ptep));
>          spin_unlock_irqrestore(&pa_tlb_lock, flags);
> }
Should be tried.
>
> The accessed bit is in the same byte as the present bit, so what will
> happen if the TLB fault races with pte clearing?
>
> Linux also encodes the swap location in non-present PTEs - could that race
> with the update of the byte containing the accessed bit?
The current code checks the present bit twice, once inside the lock.  The initial check is only required for kernel faults.
>
> Perhaps the best solution would be to store the accessed and dirty flags
> in the two topmost bytes of the PTE and make sure that these bytes are not
> used for anything else.
The dirty flag is a hardware flag and I don't think it can move.
>
> Mikulas
>
Mikulas Patocka Sept. 20, 2018, 5:40 p.m. UTC | #4
On Thu, 20 Sep 2018, John David Anglin wrote:

> You ask good questions.  If the races that you describe are possible, this
> argues for continuing
> to use the lock.  It has never been clear to me whether the kernel changes
> page table entries
> while code using the entries or not.

The kernel doesn't stop processes while changing the pagetables - so the 
change may happen simultaneously with another CPU accessing the PTE that 
is being changed.

In x86 the kernel changes the pagetable and then broadcasts an IPI to 
other processors to flush the TLB.

On parisc, the request to flush the TLB may be broadcast by the hardware 
- but we need the lock in the TLB handler to make sure that it doesn't 
load an entry after it was invalidated.

> On 2018-09-20 10:42 AM, Mikulas Patocka wrote:
> > But there's code like this that needs to be changed. It needs to first set
> > the PTE and then purge the tlb - otherwise the old pte value could be
> > reloaded by a different CPU after purge_tlb_entries() because the TLB
> > reload code no longer takes the lock.
> > 
> > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long
> > addr, pte_t *ptep)
> > {
> >          unsigned long flags;
> >          spin_lock_irqsave(&pa_tlb_lock, flags);
> >          purge_tlb_entries(mm, addr);
> >          set_pte(ptep, pte_wrprotect(*ptep));
> >          spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > }
> Should be tried.

But I think this isn't sufficient - see my other email. There's this race 
possible if we drop the lock from the fault handler:

CPU1 (executing a page fault handler):
load *pte from pagetable

CPU2 (executing ptep_set_wrprotect)
spin_lock_irqsave(&pa_tlb_lock, flags);
set_pte(ptep, pte_wrprotect(*ptep));
purge_tlb_entries(mm, addr);
spin_unlock_irqrestore(&pa_tlb_lock, flags);

CPU1 (continuing the page fault handler)
insert pte into TLB - now CPU1 is running with stale pte entry in TLB


Does anyone have an idea how HP-UX handles TLB faults?


Mikulas
John David Anglin Sept. 20, 2018, 6:30 p.m. UTC | #5
On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, Mikulas Patocka wrote:
>
>> But there's code like this that needs to be changed. It needs to first set
>> the PTE and then purge the tlb - otherwise the old pte value could be
>> reloaded by a different CPU after purge_tlb_entries() because the TLB
>> reload code no longer takes the lock.
>>
>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
>> {
>>          unsigned long flags;
>>          spin_lock_irqsave(&pa_tlb_lock, flags);
>>          purge_tlb_entries(mm, addr);
>>          set_pte(ptep, pte_wrprotect(*ptep));
>>          spin_unlock_irqrestore(&pa_tlb_lock, flags);
>> }
>>
>>
>>
>> The accessed bit is in the same byte as the present bit, so what will
>> happen if the TLB fault races with pte clearing?
>>
>> Linux also encodes the swap location in non-present PTEs - could that race
>> with the update of the byte containing the accessed bit?
>>
>> Perhaps the best solution would be to store the accessed and dirty flags
>> in the two topmost bytes of the PTE and make sure that these bytes are not
>> used for anything else.
>>
>> Mikulas
> And I realized that even if you don't modify the PTE, dropping the lock
> from the TLB handler is wrong.
>
> If you drop the lock, the CPU can read valid PTE, then wait unlimited
> amount of time (due to some microarchitectural event) and then insert the
> PTE into TLB - long after the PTE was cleared and flushed by some other
> CPU.
>
> If you drop the lock, you must stop relying on the instructions that
> broadcast TLB shootdown and you must send IPIs to flush TLBs just like on
> x86.
I don't believe the lock helps with the insert.  As far as I can tell from the architecture, TLB inserts are not considered accesses.  So, the insert can't be contained inside the lock due to instruction reordering.  We need an access dependency, but idtlbt, for example, has no output register.

After the current lock is taken, the only instruction that causes a significant microarchitectural event is the reload of the PTE.  We might get an instruction TLB fault, particularly on the insert, but that seems unlikely.  I think this suggests that the lock release should remain after the insert as it might cause an event.

The lock was originally just for the instructions that broadcast a TLB 
purge.

Using IPIs to do TLB flushes is another can of worms as they require interrupts.  Only PA 2.0 has local purges, and doing a full TLB shootdown takes a long time.
Mikulas Patocka Sept. 20, 2018, 6:39 p.m. UTC | #6
On Thu, 20 Sep 2018, John David Anglin wrote:

> On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
> > 
> > On Thu, 20 Sep 2018, Mikulas Patocka wrote:
> > 
> > > But there's code like this that needs to be changed. It needs to first set
> > > the PTE and then purge the tlb - otherwise the old pte value could be
> > > reloaded by a different CPU after purge_tlb_entries() because the TLB
> > > reload code no longer takes the lock.
> > > 
> > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long
> > > addr, pte_t *ptep)
> > > {
> > >          unsigned long flags;
> > >          spin_lock_irqsave(&pa_tlb_lock, flags);
> > >          purge_tlb_entries(mm, addr);
> > >          set_pte(ptep, pte_wrprotect(*ptep));
> > >          spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > > }
> > > 
> > > 
> > > 
> > > The accessed bit is in the same byte as the present bit, so what will
> > > happen if the TLB fault races with pte clearing?
> > > 
> > > Linux also encodes the swap location in non-present PTEs - could that race
> > > with the update of the byte containing the accessed bit?
> > > 
> > > Perhaps the best solution would be to store the accessed and dirty flags
> > > in the two topmost bytes of the PTE and make sure that these bytes are not
> > > used for anything else.
> > > 
> > > Mikulas
> > And I realized that even if you don't modify the PTE, dropping the lock
> > from the TLB handler is wrong.
> > 
> > If you drop the lock, the CPU can read valid PTE, then wait unlimited
> > amount of time (due to some microarchitectural event) and then insert the
> > PTE into TLB - long after the PTE was cleared and flushed by some other
> > CPU.
> > 
> > If you drop the lock, you must stop relying on the instructions that
> > broadcast TLB shootdown and you must send IPIs to flush TLBs just like on
> > x86.
> 
> I don't believe the lock helps with the insert.  As far as I can tell 

If you insert into the TLB without holding the lock, you may be inserting an arbitrarily old value.

Mikulas
John David Anglin Sept. 20, 2018, 6:55 p.m. UTC | #7
On 2018-09-20 1:40 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> You ask good questions.  If the races that you describe are possible, this
>> argues for continuing
>> to use the lock.  It has never been clear to me whether the kernel changes
>> page table entries
>> while code using the entries or not.
> The kernel doesn't stop processes while changing the pagetables - so the
> change may happen simultaneously with another CPU accessing the PTE that
> is being changed.
Okay.
>
> In x86 the kernel changes the pagetable and then broadcasts an IPI to
> other processors to flush the TLB.
>
> On parisc, the request to flush the TLB may be broadcasted by the hardware
> - but we need the lock in the TLB handler to make sure that it doesn't
> load an entry after it was invalidated.
>
>> On 2018-09-20 10:42 AM, Mikulas Patocka wrote:
>>> But there's code like this that needs to be changed. It needs to first set
>>> the PTE and then purge the tlb - otherwise the old pte value could be
>>> reloaded by a different CPU after purge_tlb_entries() because the TLB
>>> reload code no longer takes the lock.
>>>
>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long
>>> addr, pte_t *ptep)
>>> {
>>>           unsigned long flags;
>>>           spin_lock_irqsave(&pa_tlb_lock, flags);
>>>           purge_tlb_entries(mm, addr);
>>>           set_pte(ptep, pte_wrprotect(*ptep));
>>>           spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>> }
>> Should be tried.
> But I think this isn't sufficient - see my other email. There's this race
> possible if we drop the lock from the fault handler:
>
> CPU1 (executing a page fault handler):
> load *pte from pagetable
>
> CPU2 (executing ptep_set_wrprotect)
> spin_lock_irqsave(&pa_tlb_lock, flags);
> set_pte(ptep, pte_wrprotect(*ptep));
> purge_tlb_entries(mm, addr);
> spin_unlock_irqrestore(&pa_tlb_lock, flags);
>
> CPU1 (continuing the page fault handler)
> insert pte into TLB - now CPU1 is running with stale pte entry in TLB
See my comment about locking and TLB inserts.  I think we have fairly deterministic timing in the TLB handler, but the lock doesn't ensure that the release won't occur before the insert.  Here, I'm assuming we still lock in the TLB handler.

I added the purge_tlb_entries code and it might be redundant.  Maybe it makes the potential for a race worse?
>
>
> Does anyone have an idea how does HP-UX handle the TLB faults?
I can disassemble HP-UX but I'm not sure if it will help.  I know the standard spin lock code uses waiters.

Dave
John David Anglin Sept. 20, 2018, 7:04 p.m. UTC | #8
On 2018-09-20 2:39 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
>>> On Thu, 20 Sep 2018, Mikulas Patocka wrote:
>>>
>>>> But there's code like this that needs to be changed. It needs to first set
>>>> the PTE and then purge the tlb - otherwise the old pte value could be
>>>> reloaded by a different CPU after purge_tlb_entries() because the TLB
>>>> reload code no longer takes the lock.
>>>>
>>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long
>>>> addr, pte_t *ptep)
>>>> {
>>>>           unsigned long flags;
>>>>           spin_lock_irqsave(&pa_tlb_lock, flags);
>>>>           purge_tlb_entries(mm, addr);
>>>>           set_pte(ptep, pte_wrprotect(*ptep));
>>>>           spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>>> }
>>>>
>>>>
>>>>
>>>> The accessed bit is in the same byte as the present bit, so what will
>>>> happen if the TLB fault races with pte clearing?
>>>>
>>>> Linux also encodes the swap location in non-present PTEs - could that race
>>>> with the update of the byte containing the accessed bit?
>>>>
>>>> Perhaps the best solution would be to store the accessed and dirty flags
>>>> in the two topmost bytes of the PTE and make sure that these bytes are not
>>>> used for anything else.
>>>>
>>>> Mikulas
>>> And I realized that even if you don't modify the PTE, dropping the lock
>>> from the TLB handler is wrong.
>>>
>>> If you drop the lock, the CPU can read valid PTE, then wait unlimited
>>> amount of time (due to some microarchitectural event) and then insert the
>>> PTE into TLB - long after the PTE was cleared and flushed by some other
>>> CPU.
>>>
>>> If you drop the lock, you must stop relying on the instructions that
>>> broadcast TLB shootdown and you must send IPIs to flush TLBs just like on
>>> x86.
>> I don't believe the lock helps with the insert.  As far as I can tell
> If you insert into TLB without holding the lock, you may be inserting
> arbitrarily old value.
There is only a problem if this is done after purge.

On PA 2.0, there are two reorder buffers - one for accesses and the other for integer operations.  Some instructions go in both.  The arch does not say TLB inserts are accesses.  TLB purges are accesses.  So, integer operations can leak from a spin lock except when there is a dependency on an access.
Mikulas Patocka Sept. 20, 2018, 7:25 p.m. UTC | #9
On Thu, 20 Sep 2018, John David Anglin wrote:

> On 2018-09-20 1:40 PM, Mikulas Patocka wrote:
> > 
> > But I think this isn't sufficient - see my other email. There's this race
> > possible if we drop the lock from the fault handler:
> > 
> > CPU1 (executing a page fault handler):
> > load *pte from pagetable
> > 
> > CPU2 (executing ptep_set_wrprotect)
> > spin_lock_irqsave(&pa_tlb_lock, flags);
> > set_pte(ptep, pte_wrprotect(*ptep));
> > purge_tlb_entries(mm, addr);
> > spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > 
> > CPU1 (continuing the page fault handler)
> > insert pte into TLB - now CPU1 is running with stale pte entry in TLB
> 
> See my comment about locking and TLB inserts.  I think we have fairly 
> deterministic timing in TLB handler

Cache misses take hundreds of cycles. You'd have only deterministic 
behavior if you locked the TLB handler in the cache.

> but the lock doesn't ensure that the release won't occur occur before 
> the insert.  Here, I'm assuming we still lock in TLB handler.

In the current state (with the lock), this race condition can't happen. It 
would be:

CPU1 (executing a page fault handler):
lock pa_tlb_lock
load *pte from pagetable
insert pte into TLB
unlock pa_tlb_lock

CPU2 (executing ptep_set_wrprotect)
spin_lock_irqsave(&pa_tlb_lock, flags);
set_pte(ptep, pte_wrprotect(*ptep));
purge_tlb_entries(mm, addr);
spin_unlock_irqrestore(&pa_tlb_lock, flags);

--- the purge_tlb_entries function would purge the translation that was 
loaded by CPU1, so there will be no stale entry in the TLB. The lock is 
needed even if the TLB handler doesn't modify the pagetable.

Mikulas
Mikulas Patocka Sept. 20, 2018, 7:30 p.m. UTC | #10
On Thu, 20 Sep 2018, John David Anglin wrote:

> On 2018-09-20 2:39 PM, Mikulas Patocka wrote:
> > 
> > On Thu, 20 Sep 2018, John David Anglin wrote:
> > 
> > > On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
> > > > On Thu, 20 Sep 2018, Mikulas Patocka wrote:
> > > > 
> > > > > But there's code like this that needs to be changed. It needs to first
> > > > > set
> > > > > the PTE and then purge the tlb - otherwise the old pte value could be
> > > > > reloaded by a different CPU after purge_tlb_entries() because the TLB
> > > > > reload code no longer takes the lock.
> > > > > 
> > > > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned
> > > > > long
> > > > > addr, pte_t *ptep)
> > > > > {
> > > > >           unsigned long flags;
> > > > >           spin_lock_irqsave(&pa_tlb_lock, flags);
> > > > >           purge_tlb_entries(mm, addr);
> > > > >           set_pte(ptep, pte_wrprotect(*ptep));
> > > > >           spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > > > > }
> > > > > 
> > > > > 
> > > > > 
> > > > > The accessed bit is in the same byte as the present bit, so what will
> > > > > happen if the TLB fault races with pte clearing?
> > > > > 
> > > > > Linux also encodes the swap location in non-present PTEs - could that
> > > > > race
> > > > > with the update of the byte containing the accessed bit?
> > > > > 
> > > > > Perhaps the best solution would be to store the accessed and dirty
> > > > > flags
> > > > > in the two topmost bytes of the PTE and make sure that these bytes are
> > > > > not
> > > > > used for anything else.
> > > > > 
> > > > > Mikulas
> > > > And I realized that even if you don't modify the PTE, dropping the lock
> > > > from the TLB handler is wrong.
> > > > 
> > > > If you drop the lock, the CPU can read valid PTE, then wait unlimited
> > > > amount of time (due to some microarchitectural event) and then insert
> > > > the
> > > > PTE into TLB - long after the PTE was cleared and flushed by some other
> > > > CPU.
> > > > 
> > > > If you drop the lock, you must stop relying on the instructions that
> > > > broadcast TLB shootdown and you must send IPIs to flush TLBs just like
> > > > on
> > > > x86.
> > > I don't believe the lock helps with the insert.  As far as I can tell
> >
> > If you insert into TLB without holding the lock, you may be inserting
> > arbitrarily old value.
> 
> There is only a problem if this is done after purge.

Yes - and it may happen - I've posted an example:
https://www.spinics.net/lists/linux-parisc/msg09131.html 

> On PA 2.0, there are two reorder buffers - one for accesses and the 
> other for integer operations. Some instructions go in both.  The arch 
> does not say TLB inserts are accesses.  TLB purges are accesses.  So, 
> integer operations can leak from a spin lock except when there is a 
> dependency on an access.

This problem happens even if the CPU didn't reorder instructions at all. 
The TLB handler first reads a PTE from memory and then inserts it into the 
TLB. These two operations (load and insert) are not atomic. Between the 
load and insert, the PTE value in memory may change arbitrarily.

Mikulas
John David Anglin Sept. 20, 2018, 8:28 p.m. UTC | #11
On 2018-09-20 3:25 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> On 2018-09-20 1:40 PM, Mikulas Patocka wrote:
>>> But I think this isn't sufficient - see my other email. There's this race
>>> possible if we drop the lock from the fault handler:
>>>
>>> CPU1 (executing a page fault handler):
>>> load *pte from pagetable
>>>
>>> CPU2 (executing ptep_set_wrprotect)
>>> spin_lock_irqsave(&pa_tlb_lock, flags);
>>> set_pte(ptep, pte_wrprotect(*ptep));
>>> purge_tlb_entries(mm, addr);
>>> spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>>
>>> CPU1 (continuing the page fault handler)
>>> insert pte into TLB - now CPU1 is running with stale pte entry in TLB
>> See my comment about locking and TLB inserts.  I think we have fairly
>> deterministic timing in TLB handler
> Cache misses take hundreds of cycles. You'd have only deterministic
> behavior if you locked the TLB handler in the cache.
Maybe the handlers need to be cache line aligned.  That would probably help, but with aliasing a line could still get flushed.  I don't think there's a way to lock lines in the cache.
>> but the lock doesn't ensure that the release won't occur occur before
>> the insert.  Here, I'm assuming we still lock in TLB handler.
> In the current state (with the lock), this race condition can't happen. It
> would be:
>
> CPU1 (executing a page fault handler):
> lock pa_tlb_lock
> load *pte from pagetable
> insert pte into TLB
> unlock pa_tlb_lock
The above two operations might be interchanged when viewed from CPU2.  Is there time for CPU2 to get the lock and purge the TLB entry before the insert?

The unlock operation will execute after the last access but not be retired until after the insert.

>
> CPU2 (executing ptep_set_wrprotect)
> spin_lock_irqsave(&pa_tlb_lock, flags);
> set_pte(ptep, pte_wrprotect(*ptep));
> purge_tlb_entries(mm, addr);
> spin_unlock_irqrestore(&pa_tlb_lock, flags);
>
> --- the purge_tlb_entries function would purge the translation that was
> loaded by CPU1, so there will be no stale entry in the TLB. The lock is
> needed even if the TLB handler doesn't modify the pagetable.
I understand the issue.  I'm just not sure the lock solves the issue on PA 2.0.  Maybe that's why local purge instructions were added in PA 2.0.  If that's the case, then the purge_tlb_entries line needs to be after the unlock.  You are convincing me that using IPIs and local purges may be the only correct solution.

I'll run some tests with the locks in the TLB handler and routines in 
pgtable.h updated.
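
For reference, the variant floated above (purge moved after the unlock) would
look something like this - a sketch only; whether it is actually safe is
exactly what is being debated in this thread:

static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        unsigned long flags;
        spin_lock_irqsave(&pa_tlb_lock, flags);
        set_pte(ptep, pte_wrprotect(*ptep));
        spin_unlock_irqrestore(&pa_tlb_lock, flags);
        /* purge after releasing the lock, as suggested above */
        purge_tlb_entries(mm, addr);
}
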
John David Anglin Sept. 20, 2018, 8:37 p.m. UTC | #12
On 2018-09-20 3:30 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> On 2018-09-20 2:39 PM, Mikulas Patocka wrote:
>>> On Thu, 20 Sep 2018, John David Anglin wrote:
>>>
>>>> On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
>>>>> On Thu, 20 Sep 2018, Mikulas Patocka wrote:
>>>>>
>>>>>> But there's code like this that needs to be changed. It needs to first
>>>>>> set
>>>>>> the PTE and then purge the tlb - otherwise the old pte value could be
>>>>>> reloaded by a different CPU after purge_tlb_entries() because the TLB
>>>>>> reload code no longer takes the lock.
>>>>>>
>>>>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned
>>>>>> long
>>>>>> addr, pte_t *ptep)
>>>>>> {
>>>>>>            unsigned long flags;
>>>>>>            spin_lock_irqsave(&pa_tlb_lock, flags);
>>>>>>            purge_tlb_entries(mm, addr);
>>>>>>            set_pte(ptep, pte_wrprotect(*ptep));
>>>>>>            spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>>>>> }
>>>>>>
>>>>>>
>>>>>>
>>>>>> The accessed bit is in the same byte as the present bit, so what will
>>>>>> happen if the TLB fault races with pte clearing?
>>>>>>
>>>>>> Linux also encodes the swap location in non-present PTEs - could that
>>>>>> race
>>>>>> with the update of the byte containing the accessed bit?
>>>>>>
>>>>>> Perhaps the best solution would be to store the accessed and dirty
>>>>>> flags
>>>>>> in the two topmost bytes of the PTE and make sure that these bytes are
>>>>>> not
>>>>>> used for anything else.
>>>>>>
>>>>>> Mikulas
>>>>> And I realized that even if you don't modify the PTE, dropping the lock
>>>>> from the TLB handler is wrong.
>>>>>
>>>>> If you drop the lock, the CPU can read valid PTE, then wait unlimited
>>>>> amount of time (due to some microarchitectural event) and then insert
>>>>> the
>>>>> PTE into TLB - long after the PTE was cleared and flushed by some other
>>>>> CPU.
>>>>>
>>>>> If you drop the lock, you must stop relying on the instructions that
>>>>> broadcast TLB shootdown and you must send IPIs to flush TLBs just like
>>>>> on
>>>>> x86.
>>>> I don't believe the lock helps with the insert.  As far as I can tell
>>> If you insert into TLB without holding the lock, you may be inserting
>>> arbitrarily old value.
>> There is only a problem if this is done after purge.
> Yes - and it may happen - I've posted an example:
> https://www.spinics.net/lists/linux-parisc/msg09131.html
>
>> On PA 2.0, there are two reorder buffers - one for accesses and the
>> other for integer operations. Some instructions go in both.  The arch
>> does not say TLB inserts are accesses.  TLB purges are accesses.  So,
>> integer operations can leak from a spin lock except when there is a
>> dependency on an access.
> This problem happens even if the CPU didn't reorder instructions at all.
> The TLB handler first reads a PTE from memory and then inserts it into the
> TLB. These two operations (load and insert) are not atomic. Between the
> load and insert, the PTE value in memory may change arbitrarily.
Yes.  I believe that we probably can't serialize purge and insert 
operations with a spin lock.
That's probably why we still see some random segmentation faults.
Mikulas Patocka Sept. 20, 2018, 10:42 p.m. UTC | #13
On Thu, 20 Sep 2018, John David Anglin wrote:

> On 2018-09-20 3:30 PM, Mikulas Patocka wrote:
> > 
> > On Thu, 20 Sep 2018, John David Anglin wrote:
> > 
> > > On 2018-09-20 2:39 PM, Mikulas Patocka wrote:
> > > > On Thu, 20 Sep 2018, John David Anglin wrote:
> > > > 
> > > > > On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
> > > > > > On Thu, 20 Sep 2018, Mikulas Patocka wrote:
> > > > > > 
> > > > > > > But there's code like this that needs to be changed. It needs to
> > > > > > > first
> > > > > > > set
> > > > > > > the PTE and then purge the tlb - otherwise the old pte value could
> > > > > > > be
> > > > > > > reloaded by a different CPU after purge_tlb_entries() because the
> > > > > > > TLB
> > > > > > > reload code no longer takes the lock.
> > > > > > > 
> > > > > > > static inline void ptep_set_wrprotect(struct mm_struct *mm,
> > > > > > > unsigned
> > > > > > > long
> > > > > > > addr, pte_t *ptep)
> > > > > > > {
> > > > > > >            unsigned long flags;
> > > > > > >            spin_lock_irqsave(&pa_tlb_lock, flags);
> > > > > > >            purge_tlb_entries(mm, addr);
> > > > > > >            set_pte(ptep, pte_wrprotect(*ptep));
> > > > > > >            spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > The accessed bit is in the same byte as the present bit, so what
> > > > > > > will
> > > > > > > happen if the TLB fault races with pte clearing?
> > > > > > > 
> > > > > > > Linux also encodes the swap location in non-present PTEs - could
> > > > > > > that
> > > > > > > race
> > > > > > > with the update of the byte containing the accessed bit?
> > > > > > > 
> > > > > > > Perhaps the best solution would be to store the accessed and dirty
> > > > > > > flags
> > > > > > > in the two topmost bytes of the PTE and make sure that these bytes
> > > > > > > are
> > > > > > > not
> > > > > > > used for anything else.
> > > > > > > 
> > > > > > > Mikulas
> > > > > > And I realized that even if you don't modify the PTE, dropping the
> > > > > > lock
> > > > > > from the TLB handler is wrong.
> > > > > > 
> > > > > > If you drop the lock, the CPU can read valid PTE, then wait
> > > > > > unlimited
> > > > > > amount of time (due to some microarchitectural event) and then
> > > > > > insert
> > > > > > the
> > > > > > PTE into TLB - long after the PTE was cleared and flushed by some
> > > > > > other
> > > > > > CPU.
> > > > > > 
> > > > > > If you drop the lock, you must stop relying on the instructions that
> > > > > > broadcast TLB shootdown and you must send IPIs to flush TLBs just
> > > > > > like
> > > > > > on
> > > > > > x86.
> > > > > I don't believe the lock helps with the insert.  As far as I can tell
> > > > If you insert into TLB without holding the lock, you may be inserting
> > > > arbitrarily old value.
> > > There is only a problem if this is done after purge.
> > Yes - and it may happen - I've posted an example:
> > https://www.spinics.net/lists/linux-parisc/msg09131.html
> > 
> > > On PA 2.0, there are two reorder buffers - one for accesses and the
> > > other for integer operations. Some instructions go in both.  The arch
> > > does not say TLB inserts are accesses.  TLB purges are accesses.  So,
> > > integer operations can leak from a spin lock except when there is a
> > > dependency on an access.
> > This problem happens even if the CPU didn't reorder instructions at all.
> > The TLB handler first reads a PTE from memory and then inserts it into the
> > TLB. These two operations (load and insert) are not atomic. Between the
> > load and insert, the PTE value in memory may change arbitrarily.
>
> Yes.  I believe that we probably can't serialize purge and insert 
> operations with a spin lock. That's probably why we still see some 
> random segmentation faults.

The kmap code is clearly violating the PA-RISC specification.

If a userspace page is simultaneously accessed from a userspace process and from kernel code that called kmap on it, it violates the PA-RISC specification because the virtual addresses are not congruent. It may result in data corruption or even machine checks.

kmap should return addresses that are congruent with the current userspace 
mapping of the page.

Mikulas
Mikulas Patocka Sept. 20, 2018, 10:47 p.m. UTC | #14
On Thu, 20 Sep 2018, John David Anglin wrote:

> > In the current state (with the lock), this race condition can't happen. It
> > would be:
> > 
> > CPU1 (executing a page fault handler):
> > lock pa_tlb_lock
> > load *pte from pagetable
> > insert pte into TLB
> > unlock pa_tlb_lock
>
> The above two operations might be interchanged when viewed from CPU2.  
> Is there time for CPU2 to get lock and purge TLB entry before insert?

It's hard to say if idtlbt is a serializing instruction or not. Maybe it is, but the manual doesn't specify that.

Anyway, the current code has "sync" between the idtlbt and unlock, so they 
can't be interchanged.

> The unlock operation will execute after the last access but not be retired
> until after insert.
> 
> > 
> > CPU2 (executing ptep_set_wrprotect)
> > spin_lock_irqsave(&pa_tlb_lock, flags);
> > set_pte(ptep, pte_wrprotect(*ptep));
> > purge_tlb_entries(mm, addr);
> > spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > 
> > --- the purge_tlb_entries function would purge the translation that was
> > loaded by CPU1, so there will be no stale entry in the TLB. The lock is
> > needed even if the TLB handler doesn't modify the pagetable.
>
> I understand the issue.  I'm just not sure the lock solves the issue on 
> PA 2.0.  Maybe that's why local purge instructions were added in PA 
> 2.0.  If that's the case, then the purge_tlb_entries line needs to be 
> after the unlock.  You are convincing me that using IPI and local purges 
> may be the only correct solution.

If we use IPIs, we can drop the lock from the fault handler.

> I'll run some tests with the locks in the TLB handler and routines in
> pgtable.h updated.

Perhaps disassembling the HP-UX TLB handlers would be the best option :) 
I suppose they selected the optimal implementation.

Mikulas
John David Anglin Sept. 20, 2018, 11:49 p.m. UTC | #15
On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
> Anyway, the current code has "sync" between the idtlbt and unlock, so they
> can't be interchanged.
I'm not so sure.  The "sync" instruction enforces order on accesses and 
cache flushes.

Dave
John David Anglin Sept. 20, 2018, 11:55 p.m. UTC | #16
On 2018-09-20 6:42 PM, Mikulas Patocka wrote:
>
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> On 2018-09-20 3:30 PM, Mikulas Patocka wrote:
>>> On Thu, 20 Sep 2018, John David Anglin wrote:
>>>
>>>> On 2018-09-20 2:39 PM, Mikulas Patocka wrote:
>>>>> On Thu, 20 Sep 2018, John David Anglin wrote:
>>>>>
>>>>>> On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
>>>>>>> On Thu, 20 Sep 2018, Mikulas Patocka wrote:
>>>>>>>
>>>>>>>> But there's code like this that needs to be changed. It needs to
>>>>>>>> first
>>>>>>>> set
>>>>>>>> the PTE and then purge the tlb - otherwise the old pte value could
>>>>>>>> be
>>>>>>>> reloaded by a different CPU after purge_tlb_entries() because the
>>>>>>>> TLB
>>>>>>>> reload code no longer takes the lock.
>>>>>>>>
>>>>>>>> static inline void ptep_set_wrprotect(struct mm_struct *mm,
>>>>>>>> unsigned
>>>>>>>> long
>>>>>>>> addr, pte_t *ptep)
>>>>>>>> {
>>>>>>>>             unsigned long flags;
>>>>>>>>             spin_lock_irqsave(&pa_tlb_lock, flags);
>>>>>>>>             purge_tlb_entries(mm, addr);
>>>>>>>>             set_pte(ptep, pte_wrprotect(*ptep));
>>>>>>>>             spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>>>>>>> }
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> The accessed bit is in the same byte as the present bit, so what
>>>>>>>> will
>>>>>>>> happen if the TLB fault races with pte clearing?
>>>>>>>>
>>>>>>>> Linux also encodes the swap location in non-present PTEs - could
>>>>>>>> that
>>>>>>>> race
>>>>>>>> with the update of the byte containing the accessed bit?
>>>>>>>>
>>>>>>>> Perhaps the best solution would be to store the accessed and dirty
>>>>>>>> flags
>>>>>>>> in the two topmost bytes of the PTE and make sure that these bytes
>>>>>>>> are
>>>>>>>> not
>>>>>>>> used for anything else.
>>>>>>>>
>>>>>>>> Mikulas
>>>>>>> And I realized that even if you don't modify the PTE, dropping the
>>>>>>> lock
>>>>>>> from the TLB handler is wrong.
>>>>>>>
>>>>>>> If you drop the lock, the CPU can read valid PTE, then wait
>>>>>>> unlimited
>>>>>>> amount of time (due to some microarchitectural event) and then
>>>>>>> insert
>>>>>>> the
>>>>>>> PTE into TLB - long after the PTE was cleared and flushed by some
>>>>>>> other
>>>>>>> CPU.
>>>>>>>
>>>>>>> If you drop the lock, you must stop relying on the instructions that
>>>>>>> broadcast TLB shootdown and you must send IPIs to flush TLBs just
>>>>>>> like
>>>>>>> on
>>>>>>> x86.
>>>>>> I don't believe the lock helps with the insert.  As far as I can tell
>>>>> If you insert into TLB without holding the lock, you may be inserting
>>>>> arbitrarily old value.
>>>> There is only a problem if this is done after purge.
>>> Yes - and it may happen - I've posted an example:
>>> https://www.spinics.net/lists/linux-parisc/msg09131.html
>>>
>>>> On PA 2.0, there are two reorder buffers - one for accesses and the
>>>> other for integer operations. Some instructions go in both.  The arch
>>>> does not say TLB inserts are accesses.  TLB purges are accesses.  So,
>>>> integer operations can leak from a spin lock except when there is a
>>>> dependency on an access.
>>> This problem happens even if the CPU didn't reorder instructions at all.
>>> The TLB handler first reads a PTE from memory and then inserts it into the
>>> TLB. These two operations (load and insert) are not atomic. Between the
>>> load and insert, the PTE value in memory may change arbitrarily.
>> Yes.  I believe that we probably can't serialize purge and insert
>> operations with a spin lock. That's probably why we still see some
>> random segmentation faults.
> The kmap code is clearly violating the PA-RISC specification.
>
> If a userspace page is simultaneously accessed from userspace process and
> from kernel that called kmap on it, it violates the PA-RISC specification
> because the virtual addresses are not congruent. It may result in data
> corruption or even machine checks.
>
> kmap should return addresses that are congruent with the current userspace
> mapping of the page.
If you have thoughts on how to do that, let me know.  At the moment, we
flush the userspace page before using the kernel page, and we flush the
kernel page when we are done.  However, these flushes may leave TLB entries
which might corrupt the cache.  There are TLB purges afterwards, but there
is a window where the TLB might do something bad.  I tried using tmpalias
flushes but they caused lockups sometimes.  The tmpalias flushes suppress
move-in.
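
Schematically, the idea is something like this (a made-up example, only to
show the ordering described above, not the actual kernel code):

#include <linux/highmem.h>	/* kmap(), kunmap() */
#include <linux/string.h>	/* memcpy() */
#include <asm/cacheflush.h>	/* flush_dcache_page(), flush_kernel_dcache_page_addr() */

/* Hypothetical example: copy data into a page that also has a userspace
 * mapping. */
static void copy_to_user_mapped_page(struct page *page, const void *src,
				     size_t len)
{
	void *kaddr;

	flush_dcache_page(page);		/* flush the userspace alias first */
	kaddr = kmap(page);			/* non-congruent kernel alias */

	memcpy(kaddr, src, len);		/* kernel access through the alias */

	flush_kernel_dcache_page_addr(kaddr);	/* flush the kernel alias when done */
	kunmap(page);

	/* A stale TLB entry for either alias during this window is what can
	 * move lines back into the cache behind our back. */
}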

Dave
John David Anglin Sept. 21, 2018, 12:31 a.m. UTC | #17
On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
> Perhaps disassembling the HP-UX TLB handlers would be the best option:)  
> I suppose they selected the optimal implementation.
This is what I see.  The situation isn't very clear as I believe the code
may get patched depending on the processor.
000000000002b0a0 <tbitmss_PCXU>:
    2b0a0:       d9 19 03 e0     extrd,u,* r8,31,32,r25
    2b0a4:       0b 21 02 99     xor r1,r25,r25
    2b0a8:       f3 19 0c 0c     depd,* r25,31,20,r24

000000000002b0ac <pdir_base_patch_021>:
    2b0ac:       20 20 00 0a     ldil L%500000,r1

000000000002b0b0 <pdir_shift_patch_021>:
    2b0b0:       f0 21 00 00     depd,z,* r1,63,32,r1

000000000002b0b4 <pdir_mask_patch_021>:
    2b0b4:       f0 31 04 a8     depd,* r17,58,24,r1
    2b0b8:       0c 20 10 d1     ldd 0(r1),r17

000000000002b0bc <tbitloop_PCXU>:
    2b0bc:       bf 11 20 3a     cmpb,*<>,n r17,r24,2b0e0 
<tbit_target_miss_PCXU>
    2b0c0:       0c 30 10 c8     ldd 8(r1),r8
    2b0c4:       50 29 00 20     ldd 10(r1),r9
    2b0c8:       c4 48 60 3a     bb,*<,n r8,2,2b0ec <t_vioref_trap_PCXU>

000000000002b0cc <make_nop_if_split_TLB_2_0_10>:
    2b0cc:       f5 0e 0d 3d     depdi,* 7,22,3,r8
    2b0d0:       05 09 18 00     idtlbt r9,r8
    2b0d4:       0c 28 12 d0     std r8,8(r1)
    2b0d8:       00 00 0c a0     rfi,r
    2b0dc:       08 00 02 40     nop

000000000002b0e0 <tbit_target_miss_PCXU>:
    2b0e0:       50 21 00 30     ldd 18(r1),r1
    2b0e4:       bc 20 3f a7     cmpb,*<>,n r0,r1,2b0bc <tbitloop_PCXU>
    2b0e8:       0c 20 10 d1     ldd 0(r1),r17

000000000002b0ec <t_vioref_trap_PCXU>:
    2b0ec:       20 35 90 14     ldil L%aaa800,r1
    2b0f0:       48 21 00 28     ldw 14(r1),r1
    2b0f4:       08 01 22 40     or,= r1,r0,r0
    2b0f8:       e8 20 c0 02     bv,n r0(r1)
    2b0fc:       03 96 18 40     mtctl r22,tr4
    2b100:       03 c2 18 40     mtctl rp,tr6
    2b104:       e8 42 04 80     b,l 2f34c <$swap_shadows>,rp
    2b108:       03 5a 18 40     mtctl r26,tr2
    2b10c:       34 1a 00 02     ldi 1,r26
    2b110:       03 00 08 b6     mfctl tr0,r22
    2b114:       6a da 00 e0     stw r26,70(r22)
    2b118:       03 c0 08 a2     mfctl tr6,rp
    2b11c:       02 00 08 b6     mfctl itmr,r22
    2b120:       03 fe 18 40     mtctl sp,tr7
    2b124:       03 b6 18 40     mtctl r22,tr5
    2b128:       23 56 50 02     ldil L%16c800,r26
    2b12c:       e3 40 2c 08     be 604(sr4,r26)
    2b130:       34 1a 00 2a     ldi 15,r26
         ...
John David Anglin Sept. 23, 2018, 3:54 p.m. UTC | #18
On 2018-09-20 8:31 PM, John David Anglin wrote:
> On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
>> Perhaps disassembling the HP-UX TLB handlers would be the best 
>> option:) I suppose they selected the optimal implementation.
> This is what I see.  The situation isn't very clear as I believe the 
> code may get patched depending
> on the processor.
> 000000000002b0a0 <tbitmss_PCXU>:
>    2b0a0:       d9 19 03 e0     extrd,u,* r8,31,32,r25
>    2b0a4:       0b 21 02 99     xor r1,r25,r25
>    2b0a8:       f3 19 0c 0c     depd,* r25,31,20,r24
>
> 000000000002b0ac <pdir_base_patch_021>:
>    2b0ac:       20 20 00 0a     ldil L%500000,r1
>
> 000000000002b0b0 <pdir_shift_patch_021>:
>    2b0b0:       f0 21 00 00     depd,z,* r1,63,32,r1
>
> 000000000002b0b4 <pdir_mask_patch_021>:
>    2b0b4:       f0 31 04 a8     depd,* r17,58,24,r1
>    2b0b8:       0c 20 10 d1     ldd 0(r1),r17
>
> 000000000002b0bc <tbitloop_PCXU>:
>    2b0bc:       bf 11 20 3a     cmpb,*<>,n r17,r24,2b0e0 
> <tbit_target_miss_PCXU>
>    2b0c0:       0c 30 10 c8     ldd 8(r1),r8
>    2b0c4:       50 29 00 20     ldd 10(r1),r9
>    2b0c8:       c4 48 60 3a     bb,*<,n r8,2,2b0ec <t_vioref_trap_PCXU>
>
> 000000000002b0cc <make_nop_if_split_TLB_2_0_10>:
>    2b0cc:       f5 0e 0d 3d     depdi,* 7,22,3,r8
>    2b0d0:       05 09 18 00     idtlbt r9,r8
>    2b0d4:       0c 28 12 d0     std r8,8(r1)
>    2b0d8:       00 00 0c a0     rfi,r
>    2b0dc:       08 00 02 40     nop
I would say HP-UX has a much better page table layout and lookup.  The flags
are set by the "depdi" instruction and they are always written back.  There's
no locking, so I suspect HP-UX only changes the page directory when it's safe
to do so.
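
In C terms, the difference from the approach in this patch is roughly the
following (illustrative only, made-up helper names, and my reading of the
disassembly above):

/* This patch: touch memory conditionally and only the byte holding the
 * accessed bit, so the dirty bit can't be clobbered and the cache line
 * isn't needlessly dirtied. */
static void linux_update_accessed(volatile unsigned char *accessed_byte,
				  unsigned char accessed_bit)
{
	unsigned char new = *accessed_byte | accessed_bit;

	if (new != *accessed_byte)
		*accessed_byte = new;		/* conditional stb of one byte */
}

/* HP-UX, as I read the handler: force the flag bits on and store the
 * whole PTE back on every miss, unconditionally (depdi + std). */
static void hpux_update_flags(volatile unsigned long *pte,
			      unsigned long flag_bits)
{
	*pte |= flag_bits;
}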

Dave
diff mbox series

Patch

diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index fa6b7c78f18a..7efce946ba04 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -202,7 +202,7 @@  static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 #define _PAGE_HUGE     (1 << xlate_pabit(_PAGE_HPAGE_BIT))
 #define _PAGE_USER     (1 << xlate_pabit(_PAGE_USER_BIT))
 
-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _PAGE_KERNEL_RO	(_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define _PAGE_KERNEL_EXEC	(_PAGE_KERNEL_RO | _PAGE_EXEC)
@@ -227,22 +227,22 @@  static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 
 #ifndef __ASSEMBLY__
 
-#define PAGE_NONE	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED)
+#define PAGE_NONE	__pgprot(_PAGE_PRESENT | _PAGE_USER)
+#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE)
 /* Others seem to make this executable, I don't know if that's correct
    or not.  The stack is mapped this way though so this is necessary
    in the short term - dhd@linuxcare.com, 2000-08-08 */
-#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED)
-#define PAGE_WRITEONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE | _PAGE_ACCESSED)
-#define PAGE_EXECREAD   __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED)
+#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ)
+#define PAGE_WRITEONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE)
+#define PAGE_EXECREAD   __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC)
 #define PAGE_COPY       PAGE_EXECREAD
-#define PAGE_RWX        __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED)
+#define PAGE_RWX        __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
 #define PAGE_KERNEL	__pgprot(_PAGE_KERNEL)
 #define PAGE_KERNEL_EXEC	__pgprot(_PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RWX	__pgprot(_PAGE_KERNEL_RWX)
 #define PAGE_KERNEL_RO	__pgprot(_PAGE_KERNEL_RO)
 #define PAGE_KERNEL_UNC	__pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE)
-#define PAGE_GATEWAY    __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ)
+#define PAGE_GATEWAY    __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_GATEWAY| _PAGE_READ)
 
 
 /*
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 1b4732e20137..8039241b669b 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -46,16 +46,6 @@ 
 	.level 2.0
 #endif
 
-	.import		pa_tlb_lock,data
-	.macro  load_pa_tlb_lock reg
-#if __PA_LDCW_ALIGNMENT > 4
-	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
-	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
-#else
-	load32	PA(pa_tlb_lock), \reg
-#endif
-	.endm
-
 	/* space_to_prot macro creates a prot id from a space id */
 
 #if (SPACEID_SHIFT) == 0
@@ -462,47 +452,17 @@ 
 	L2_ptep		\pgd,\pte,\index,\va,\fault
 	.endm
 
-	/* Acquire pa_tlb_lock lock and recheck page is still present. */
-	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
-#ifdef CONFIG_SMP
-	cmpib,COND(=),n	0,\spc,2f
-	load_pa_tlb_lock \tmp
-1:	LDCW		0(\tmp),\tmp1
-	cmpib,COND(=)	0,\tmp1,1b
-	nop
-	LDREG		0(\ptp),\pte
-	bb,<,n		\pte,_PAGE_PRESENT_BIT,2f
-	b		\fault
-	stw		 \spc,0(\tmp)
-2:
-#endif
-	.endm
-
-	/* Release pa_tlb_lock lock without reloading lock address. */
-	.macro		tlb_unlock0	spc,tmp
-#ifdef CONFIG_SMP
-	or,COND(=)	%r0,\spc,%r0
-	sync
-	or,COND(=)	%r0,\spc,%r0
-	stw             \spc,0(\tmp)
-#endif
-	.endm
-
-	/* Release pa_tlb_lock lock. */
-	.macro		tlb_unlock1	spc,tmp
-#ifdef CONFIG_SMP
-	load_pa_tlb_lock \tmp
-	tlb_unlock0	\spc,\tmp
-#endif
-	.endm
-
-	/* Set the _PAGE_ACCESSED bit of the PTE.  Be clever and
-	 * don't needlessly dirty the cache line if it was already set */
+	/* Set the _PAGE_ACCESSED bit of the PTE without overwriting the
+	 * dirty bit.  Fortunately, it resides in a different byte of
+	 * the PTE than the dirty bit.  Be clever and don't needlessly
+	 * dirty the cache line if it was already set */
 	.macro		update_accessed	ptp,pte,tmp,tmp1
 	ldi		_PAGE_ACCESSED,\tmp1
 	or		\tmp1,\pte,\tmp
+	/* Extract aligned byte from PTE containing page accessed bit */
+	extru		\tmp,_PAGE_ACCESSED_BIT,8,\tmp
 	and,COND(<>)	\tmp1,\pte,%r0
-	STREG		\tmp,0(\ptp)
+	stb		\tmp,__BITS_PER_LONG/8-2(\tmp)
 	.endm
 
 	/* Set the dirty bit (and accessed bit).  No need to be
@@ -1167,14 +1127,12 @@  dtlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,dtlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 	
 	idtlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1193,14 +1151,12 @@  nadtlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,nadtlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	idtlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1221,7 +1177,6 @@  dtlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1234,7 +1189,6 @@  dtlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1254,7 +1208,6 @@  nadtlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1267,7 +1220,6 @@  nadtlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1287,7 +1239,6 @@  dtlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1296,7 +1247,6 @@  dtlb_miss_20:
 
 	idtlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1315,7 +1265,6 @@  nadtlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1324,7 +1273,6 @@  nadtlb_miss_20:
 	
 	idtlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1424,14 +1372,12 @@  itlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 	
 	iitlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1448,14 +1394,12 @@  naitlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,naitlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	iitlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1476,7 +1420,6 @@  itlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1489,7 +1432,6 @@  itlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1500,7 +1442,6 @@  naitlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1513,7 +1454,6 @@  naitlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1534,7 +1474,6 @@  itlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1543,7 +1482,6 @@  itlb_miss_20:
 
 	iitlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1554,7 +1492,6 @@  naitlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1563,7 +1500,6 @@  naitlb_miss_20:
 
 	iitlbt          pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1586,14 +1522,12 @@  dbit_trap_20w:
 
 	L3_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 		
 	idtlbt          pte,prot
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 #else
@@ -1606,7 +1540,6 @@  dbit_trap_11:
 
 	L2_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1619,7 +1552,6 @@  dbit_trap_11:
 
 	mtsp            t1, %sr1     /* Restore sr1 */
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 
@@ -1630,7 +1562,6 @@  dbit_trap_20:
 
 	L2_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1639,7 +1570,6 @@  dbit_trap_20:
 	
 	idtlbt		pte,prot
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 #endif