Message ID: 20180916233010.GA5591@mx3210.localdomain (mailing list archive)
State: Accepted, archived
Series: parisc: Remove locking from TLB handler and set page accessed
But there's code like this that needs to be changed. It needs to first set the PTE and then purge the tlb - otherwise the old pte value could be reloaded by a different CPU after purge_tlb_entries() because the TLB reload code no longer takes the lock. static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { unsigned long flags; spin_lock_irqsave(&pa_tlb_lock, flags); purge_tlb_entries(mm, addr); set_pte(ptep, pte_wrprotect(*ptep)); spin_unlock_irqrestore(&pa_tlb_lock, flags); } The accessed bit is in the same byte as the present bit, so what will happen if the TLB fault races with pte clearing? Linux also encodes the swap location in non-present PTEs - could that race with the update of the byte containing the accessed bit? Perhaps the best solution would be to store the accessed and dirty flags in the two topmost bytes of the PTE and make sure that these bytes are not used for anything else. Mikulas On Sun, 16 Sep 2018, John David Anglin wrote: > Locking was introduced into the TLB handlers primarily to ensure that updates > to the accessed and dirty bits in PTE entries were atomic. However, this > has a significant impact on performance. Also, it turned out that the user > page defines in pgtable.h always set the accessed bit, so it wasn't necessary > to update the accessed bit at all. The impact of this is that the kernel > doesn't know if a page has been accessed or not. > > This patch removes the locking from the TLB handlers. We update the page > accessed bit in the page table entries using a stb instruction that doesn't > affect the dirty bit. Thus, an access can't clobber the dirty bit. > > With this change, the accessed bit in user PTE entries is now updated when a > page is accessed and hopefully this will improve performance. > > diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h > index fa6b7c78f18a..7efce946ba04 100644 > --- a/arch/parisc/include/asm/pgtable.h > +++ b/arch/parisc/include/asm/pgtable.h > @@ -202,7 +202,7 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) > #define _PAGE_HUGE (1 << xlate_pabit(_PAGE_HPAGE_BIT)) > #define _PAGE_USER (1 << xlate_pabit(_PAGE_USER_BIT)) > > -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) > +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) > #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) > #define _PAGE_KERNEL_RO (_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED) > #define _PAGE_KERNEL_EXEC (_PAGE_KERNEL_RO | _PAGE_EXEC) > @@ -227,22 +227,22 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) > > #ifndef __ASSEMBLY__ > > -#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) > -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED) > +#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER) > +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE) > /* Others seem to make this executable, I don't know if that's correct > or not. 
The stack is mapped this way though so this is necessary > in the short term - dhd@linuxcare.com, 2000-08-08 */ > -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) > -#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE | _PAGE_ACCESSED) > -#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED) > +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ) > +#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE) > +#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC) > #define PAGE_COPY PAGE_EXECREAD > -#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED) > +#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) > #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) > #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) > #define PAGE_KERNEL_RWX __pgprot(_PAGE_KERNEL_RWX) > #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) > #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE) > -#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ) > +#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_GATEWAY| _PAGE_READ) > > > /* > diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S > index 1b4732e20137..8039241b669b 100644 > --- a/arch/parisc/kernel/entry.S > +++ b/arch/parisc/kernel/entry.S > @@ -46,16 +46,6 @@ > .level 2.0 > #endif > > - .import pa_tlb_lock,data > - .macro load_pa_tlb_lock reg > -#if __PA_LDCW_ALIGNMENT > 4 > - load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg > - depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg > -#else > - load32 PA(pa_tlb_lock), \reg > -#endif > - .endm > - > /* space_to_prot macro creates a prot id from a space id */ > > #if (SPACEID_SHIFT) == 0 > @@ -462,47 +452,17 @@ > L2_ptep \pgd,\pte,\index,\va,\fault > .endm > > - /* Acquire pa_tlb_lock lock and recheck page is still present. */ > - .macro tlb_lock spc,ptp,pte,tmp,tmp1,fault > -#ifdef CONFIG_SMP > - cmpib,COND(=),n 0,\spc,2f > - load_pa_tlb_lock \tmp > -1: LDCW 0(\tmp),\tmp1 > - cmpib,COND(=) 0,\tmp1,1b > - nop > - LDREG 0(\ptp),\pte > - bb,<,n \pte,_PAGE_PRESENT_BIT,2f > - b \fault > - stw \spc,0(\tmp) > -2: > -#endif > - .endm > - > - /* Release pa_tlb_lock lock without reloading lock address. */ > - .macro tlb_unlock0 spc,tmp > -#ifdef CONFIG_SMP > - or,COND(=) %r0,\spc,%r0 > - sync > - or,COND(=) %r0,\spc,%r0 > - stw \spc,0(\tmp) > -#endif > - .endm > - > - /* Release pa_tlb_lock lock. */ > - .macro tlb_unlock1 spc,tmp > -#ifdef CONFIG_SMP > - load_pa_tlb_lock \tmp > - tlb_unlock0 \spc,\tmp > -#endif > - .endm > - > - /* Set the _PAGE_ACCESSED bit of the PTE. Be clever and > - * don't needlessly dirty the cache line if it was already set */ > + /* Set the _PAGE_ACCESSED bit of the PTE without overwriting the > + * dirty bit. Fortunately, it resides in a different byte of > + * the PTE than the dirty bit. Be clever and don't needlessly > + * dirty the cache line if it was already set */ > .macro update_accessed ptp,pte,tmp,tmp1 > ldi _PAGE_ACCESSED,\tmp1 > or \tmp1,\pte,\tmp > + /* Extract aligned byte from PTE containing page accessed bit */ > + extru \tmp,_PAGE_ACCESSED_BIT,8,\tmp > and,COND(<>) \tmp1,\pte,%r0 > - STREG \tmp,0(\ptp) > + stb \tmp,__BITS_PER_LONG/8-2(\tmp) > .endm > > /* Set the dirty bit (and accessed bit). 
No need to be > @@ -1167,14 +1127,12 @@ dtlb_miss_20w: > > L3_ptep ptp,pte,t0,va,dtlb_check_alias_20w > > - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_20w > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > > idtlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1193,14 +1151,12 @@ nadtlb_miss_20w: > > L3_ptep ptp,pte,t0,va,nadtlb_check_alias_20w > > - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_20w > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > > idtlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1221,7 +1177,6 @@ dtlb_miss_11: > > L2_ptep ptp,pte,t0,va,dtlb_check_alias_11 > > - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_11 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb_11 spc,pte,prot > @@ -1234,7 +1189,6 @@ dtlb_miss_11: > > mtsp t1, %sr1 /* Restore sr1 */ > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1254,7 +1208,6 @@ nadtlb_miss_11: > > L2_ptep ptp,pte,t0,va,nadtlb_check_alias_11 > > - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_11 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb_11 spc,pte,prot > @@ -1267,7 +1220,6 @@ nadtlb_miss_11: > > mtsp t1, %sr1 /* Restore sr1 */ > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1287,7 +1239,6 @@ dtlb_miss_20: > > L2_ptep ptp,pte,t0,va,dtlb_check_alias_20 > > - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_20 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > @@ -1296,7 +1247,6 @@ dtlb_miss_20: > > idtlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1315,7 +1265,6 @@ nadtlb_miss_20: > > L2_ptep ptp,pte,t0,va,nadtlb_check_alias_20 > > - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_20 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > @@ -1324,7 +1273,6 @@ nadtlb_miss_20: > > idtlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1424,14 +1372,12 @@ itlb_miss_20w: > > L3_ptep ptp,pte,t0,va,itlb_fault > > - tlb_lock spc,ptp,pte,t0,t1,itlb_fault > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > > iitlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1448,14 +1394,12 @@ naitlb_miss_20w: > > L3_ptep ptp,pte,t0,va,naitlb_check_alias_20w > > - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_20w > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > > iitlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1476,7 +1420,6 @@ itlb_miss_11: > > L2_ptep ptp,pte,t0,va,itlb_fault > > - tlb_lock spc,ptp,pte,t0,t1,itlb_fault > update_accessed ptp,pte,t0,t1 > > make_insert_tlb_11 spc,pte,prot > @@ -1489,7 +1432,6 @@ itlb_miss_11: > > mtsp t1, %sr1 /* Restore sr1 */ > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1500,7 +1442,6 @@ naitlb_miss_11: > > L2_ptep ptp,pte,t0,va,naitlb_check_alias_11 > > - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_11 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb_11 spc,pte,prot > @@ -1513,7 +1454,6 @@ naitlb_miss_11: > > mtsp t1, %sr1 /* Restore sr1 */ > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1534,7 +1474,6 @@ itlb_miss_20: > > L2_ptep ptp,pte,t0,va,itlb_fault > > - tlb_lock spc,ptp,pte,t0,t1,itlb_fault > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > @@ -1543,7 +1482,6 @@ itlb_miss_20: > > iitlbt pte,prot > > - tlb_unlock1 spc,t0 > rfir > nop > > @@ -1554,7 +1492,6 @@ naitlb_miss_20: > > L2_ptep ptp,pte,t0,va,naitlb_check_alias_20 > > - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_20 > update_accessed ptp,pte,t0,t1 > > make_insert_tlb spc,pte,prot,t1 > @@ -1563,7 +1500,6 @@ naitlb_miss_20: > > iitlbt pte,prot > > - tlb_unlock1 spc,t0 
> rfir > nop > > @@ -1586,14 +1522,12 @@ dbit_trap_20w: > > L3_ptep ptp,pte,t0,va,dbit_fault > > - tlb_lock spc,ptp,pte,t0,t1,dbit_fault > update_dirty ptp,pte,t1 > > make_insert_tlb spc,pte,prot,t1 > > idtlbt pte,prot > > - tlb_unlock0 spc,t0 > rfir > nop > #else > @@ -1606,7 +1540,6 @@ dbit_trap_11: > > L2_ptep ptp,pte,t0,va,dbit_fault > > - tlb_lock spc,ptp,pte,t0,t1,dbit_fault > update_dirty ptp,pte,t1 > > make_insert_tlb_11 spc,pte,prot > @@ -1619,7 +1552,6 @@ dbit_trap_11: > > mtsp t1, %sr1 /* Restore sr1 */ > > - tlb_unlock0 spc,t0 > rfir > nop > > @@ -1630,7 +1562,6 @@ dbit_trap_20: > > L2_ptep ptp,pte,t0,va,dbit_fault > > - tlb_lock spc,ptp,pte,t0,t1,dbit_fault > update_dirty ptp,pte,t1 > > make_insert_tlb spc,pte,prot,t1 > @@ -1639,7 +1570,6 @@ dbit_trap_20: > > idtlbt pte,prot > > - tlb_unlock0 spc,t0 > rfir > nop > #endif > > Signed-off-by: John David Anglin <dave.anglin@bell.net> > >
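For reference, a minimal C sketch of the reordering Mikulas asks for above (write the PTE first, then purge), using the existing parisc helpers set_pte(), purge_tlb_entries() and the pa_tlb_lock spinlock from asm/pgtable.h. This is an illustration of the suggested ordering, not the committed fix:

/* Sketch only: the ordering suggested above, assuming a lockless TLB
 * handler.  Writing the new PTE before the purge means a concurrent
 * TLB-miss handler can at worst pick up the *new* value, never re-read
 * a stale one after purge_tlb_entries() has already run. */
static inline void ptep_set_wrprotect(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	unsigned long flags;

	spin_lock_irqsave(&pa_tlb_lock, flags);
	set_pte(ptep, pte_wrprotect(*ptep));	/* 1. update the page table */
	purge_tlb_entries(mm, addr);		/* 2. then flush the old TLB entry */
	spin_unlock_irqrestore(&pa_tlb_lock, flags);
}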
On Thu, 20 Sep 2018, Mikulas Patocka wrote: > But there's code like this that needs to be changed. It needs to first set > the PTE and then purge the tlb - otherwise the old pte value could be > reloaded by a different CPU after purge_tlb_entries() because the TLB > reload code no longer takes the lock. > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) > { > unsigned long flags; > spin_lock_irqsave(&pa_tlb_lock, flags); > purge_tlb_entries(mm, addr); > set_pte(ptep, pte_wrprotect(*ptep)); > spin_unlock_irqrestore(&pa_tlb_lock, flags); > } > > > > The accessed bit is in the same byte as the present bit, so what will > happen if the TLB fault races with pte clearing? > > Linux also encodes the swap location in non-present PTEs - could that race > with the update of the byte containing the accessed bit? > > Perhaps the best solution would be to store the accessed and dirty flags > in the two topmost bytes of the PTE and make sure that these bytes are not > used for anything else. > > Mikulas And I realized that even if you don't modify the PTE, dropping the lock from the TLB handler is wrong. If you drop the lock, the CPU can read valid PTE, then wait unlimited amount of time (due to some microarchitectural event) and then insert the PTE into TLB - long after the PTE was cleared and flushed by some other CPU. If you drop the lock, you must stop relying on the instructions that broadcast TLB shootdown and you must send IPIs to flush TLBs just like on x86. Mikulas
You ask good questions. If the races that you describe are possible, this argues for continuing to use the lock. It has never been clear to me whether the kernel changes page table entries while code using the entries or not. On 2018-09-20 10:42 AM, Mikulas Patocka wrote: > But there's code like this that needs to be changed. It needs to first set > the PTE and then purge the tlb - otherwise the old pte value could be > reloaded by a different CPU after purge_tlb_entries() because the TLB > reload code no longer takes the lock. > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) > { > unsigned long flags; > spin_lock_irqsave(&pa_tlb_lock, flags); > purge_tlb_entries(mm, addr); > set_pte(ptep, pte_wrprotect(*ptep)); > spin_unlock_irqrestore(&pa_tlb_lock, flags); > } Should be tried. > > The accessed bit is in the same byte as the present bit, so what will > happen if the TLB fault races with pte clearing? > > Linux also encodes the swap location in non-present PTEs - could that race > with the update of the byte containing the accessed bit? The current code checks the present bit twice, once inside the lock. The initial check is only required for kernel faults > > Perhaps the best solution would be to store the accessed and dirty flags > in the two topmost bytes of the PTE and make sure that these bytes are not > used for anything else. The dirty flag is a hardware flag and I don't think it can move. > > Mikulas > > > On Sun, 16 Sep 2018, John David Anglin wrote: > >> Locking was introduced into the TLB handlers primarily to ensure that updates >> to the accessed and dirty bits in PTE entries were atomic. However, this >> has a significant impact on performance. Also, it turned out that the user >> page defines in pgtable.h always set the accessed bit, so it wasn't necessary >> to update the accessed bit at all. The impact of this is that the kernel >> doesn't know if a page has been accessed or not. >> >> This patch removes the locking from the TLB handlers. We update the page >> accessed bit in the page table entries using a stb instruction that doesn't >> affect the dirty bit. Thus, an access can't clobber the dirty bit. >> >> With this change, the accessed bit in user PTE entries is now updated when a >> page is accessed and hopefully this will improve performance. 
>> >> diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h >> index fa6b7c78f18a..7efce946ba04 100644 >> --- a/arch/parisc/include/asm/pgtable.h >> +++ b/arch/parisc/include/asm/pgtable.h >> @@ -202,7 +202,7 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) >> #define _PAGE_HUGE (1 << xlate_pabit(_PAGE_HPAGE_BIT)) >> #define _PAGE_USER (1 << xlate_pabit(_PAGE_USER_BIT)) >> >> -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) >> +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) >> #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) >> #define _PAGE_KERNEL_RO (_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED) >> #define _PAGE_KERNEL_EXEC (_PAGE_KERNEL_RO | _PAGE_EXEC) >> @@ -227,22 +227,22 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) >> >> #ifndef __ASSEMBLY__ >> >> -#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) >> -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED) >> +#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER) >> +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE) >> /* Others seem to make this executable, I don't know if that's correct >> or not. The stack is mapped this way though so this is necessary >> in the short term - dhd@linuxcare.com, 2000-08-08 */ >> -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) >> -#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE | _PAGE_ACCESSED) >> -#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED) >> +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ) >> +#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE) >> +#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC) >> #define PAGE_COPY PAGE_EXECREAD >> -#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED) >> +#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) >> #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) >> #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) >> #define PAGE_KERNEL_RWX __pgprot(_PAGE_KERNEL_RWX) >> #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) >> #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE) >> -#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ) >> +#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_GATEWAY| _PAGE_READ) >> >> >> /* >> diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S >> index 1b4732e20137..8039241b669b 100644 >> --- a/arch/parisc/kernel/entry.S >> +++ b/arch/parisc/kernel/entry.S >> @@ -46,16 +46,6 @@ >> .level 2.0 >> #endif >> >> - .import pa_tlb_lock,data >> - .macro load_pa_tlb_lock reg >> -#if __PA_LDCW_ALIGNMENT > 4 >> - load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg >> - depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg >> -#else >> - load32 PA(pa_tlb_lock), \reg >> -#endif >> - .endm >> - >> /* space_to_prot macro creates a prot id from a space id */ >> >> #if (SPACEID_SHIFT) == 0 >> @@ -462,47 +452,17 @@ >> L2_ptep \pgd,\pte,\index,\va,\fault >> .endm >> >> - /* Acquire pa_tlb_lock lock and recheck page is still present. 
*/ >> - .macro tlb_lock spc,ptp,pte,tmp,tmp1,fault >> -#ifdef CONFIG_SMP >> - cmpib,COND(=),n 0,\spc,2f >> - load_pa_tlb_lock \tmp >> -1: LDCW 0(\tmp),\tmp1 >> - cmpib,COND(=) 0,\tmp1,1b >> - nop >> - LDREG 0(\ptp),\pte >> - bb,<,n \pte,_PAGE_PRESENT_BIT,2f >> - b \fault >> - stw \spc,0(\tmp) >> -2: >> -#endif >> - .endm >> - >> - /* Release pa_tlb_lock lock without reloading lock address. */ >> - .macro tlb_unlock0 spc,tmp >> -#ifdef CONFIG_SMP >> - or,COND(=) %r0,\spc,%r0 >> - sync >> - or,COND(=) %r0,\spc,%r0 >> - stw \spc,0(\tmp) >> -#endif >> - .endm >> - >> - /* Release pa_tlb_lock lock. */ >> - .macro tlb_unlock1 spc,tmp >> -#ifdef CONFIG_SMP >> - load_pa_tlb_lock \tmp >> - tlb_unlock0 \spc,\tmp >> -#endif >> - .endm >> - >> - /* Set the _PAGE_ACCESSED bit of the PTE. Be clever and >> - * don't needlessly dirty the cache line if it was already set */ >> + /* Set the _PAGE_ACCESSED bit of the PTE without overwriting the >> + * dirty bit. Fortunately, it resides in a different byte of >> + * the PTE than the dirty bit. Be clever and don't needlessly >> + * dirty the cache line if it was already set */ >> .macro update_accessed ptp,pte,tmp,tmp1 >> ldi _PAGE_ACCESSED,\tmp1 >> or \tmp1,\pte,\tmp >> + /* Extract aligned byte from PTE containing page accessed bit */ >> + extru \tmp,_PAGE_ACCESSED_BIT,8,\tmp >> and,COND(<>) \tmp1,\pte,%r0 >> - STREG \tmp,0(\ptp) >> + stb \tmp,__BITS_PER_LONG/8-2(\tmp) >> .endm >> >> /* Set the dirty bit (and accessed bit). No need to be >> @@ -1167,14 +1127,12 @@ dtlb_miss_20w: >> >> L3_ptep ptp,pte,t0,va,dtlb_check_alias_20w >> >> - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_20w >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> >> idtlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1193,14 +1151,12 @@ nadtlb_miss_20w: >> >> L3_ptep ptp,pte,t0,va,nadtlb_check_alias_20w >> >> - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_20w >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> >> idtlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1221,7 +1177,6 @@ dtlb_miss_11: >> >> L2_ptep ptp,pte,t0,va,dtlb_check_alias_11 >> >> - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_11 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb_11 spc,pte,prot >> @@ -1234,7 +1189,6 @@ dtlb_miss_11: >> >> mtsp t1, %sr1 /* Restore sr1 */ >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1254,7 +1208,6 @@ nadtlb_miss_11: >> >> L2_ptep ptp,pte,t0,va,nadtlb_check_alias_11 >> >> - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_11 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb_11 spc,pte,prot >> @@ -1267,7 +1220,6 @@ nadtlb_miss_11: >> >> mtsp t1, %sr1 /* Restore sr1 */ >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1287,7 +1239,6 @@ dtlb_miss_20: >> >> L2_ptep ptp,pte,t0,va,dtlb_check_alias_20 >> >> - tlb_lock spc,ptp,pte,t0,t1,dtlb_check_alias_20 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> @@ -1296,7 +1247,6 @@ dtlb_miss_20: >> >> idtlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1315,7 +1265,6 @@ nadtlb_miss_20: >> >> L2_ptep ptp,pte,t0,va,nadtlb_check_alias_20 >> >> - tlb_lock spc,ptp,pte,t0,t1,nadtlb_check_alias_20 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> @@ -1324,7 +1273,6 @@ nadtlb_miss_20: >> >> idtlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1424,14 +1372,12 @@ itlb_miss_20w: >> >> L3_ptep ptp,pte,t0,va,itlb_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,itlb_fault >> update_accessed ptp,pte,t0,t1 
>> >> make_insert_tlb spc,pte,prot,t1 >> >> iitlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1448,14 +1394,12 @@ naitlb_miss_20w: >> >> L3_ptep ptp,pte,t0,va,naitlb_check_alias_20w >> >> - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_20w >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> >> iitlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1476,7 +1420,6 @@ itlb_miss_11: >> >> L2_ptep ptp,pte,t0,va,itlb_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,itlb_fault >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb_11 spc,pte,prot >> @@ -1489,7 +1432,6 @@ itlb_miss_11: >> >> mtsp t1, %sr1 /* Restore sr1 */ >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1500,7 +1442,6 @@ naitlb_miss_11: >> >> L2_ptep ptp,pte,t0,va,naitlb_check_alias_11 >> >> - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_11 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb_11 spc,pte,prot >> @@ -1513,7 +1454,6 @@ naitlb_miss_11: >> >> mtsp t1, %sr1 /* Restore sr1 */ >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1534,7 +1474,6 @@ itlb_miss_20: >> >> L2_ptep ptp,pte,t0,va,itlb_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,itlb_fault >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> @@ -1543,7 +1482,6 @@ itlb_miss_20: >> >> iitlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1554,7 +1492,6 @@ naitlb_miss_20: >> >> L2_ptep ptp,pte,t0,va,naitlb_check_alias_20 >> >> - tlb_lock spc,ptp,pte,t0,t1,naitlb_check_alias_20 >> update_accessed ptp,pte,t0,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> @@ -1563,7 +1500,6 @@ naitlb_miss_20: >> >> iitlbt pte,prot >> >> - tlb_unlock1 spc,t0 >> rfir >> nop >> >> @@ -1586,14 +1522,12 @@ dbit_trap_20w: >> >> L3_ptep ptp,pte,t0,va,dbit_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,dbit_fault >> update_dirty ptp,pte,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> >> idtlbt pte,prot >> >> - tlb_unlock0 spc,t0 >> rfir >> nop >> #else >> @@ -1606,7 +1540,6 @@ dbit_trap_11: >> >> L2_ptep ptp,pte,t0,va,dbit_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,dbit_fault >> update_dirty ptp,pte,t1 >> >> make_insert_tlb_11 spc,pte,prot >> @@ -1619,7 +1552,6 @@ dbit_trap_11: >> >> mtsp t1, %sr1 /* Restore sr1 */ >> >> - tlb_unlock0 spc,t0 >> rfir >> nop >> >> @@ -1630,7 +1562,6 @@ dbit_trap_20: >> >> L2_ptep ptp,pte,t0,va,dbit_fault >> >> - tlb_lock spc,ptp,pte,t0,t1,dbit_fault >> update_dirty ptp,pte,t1 >> >> make_insert_tlb spc,pte,prot,t1 >> @@ -1639,7 +1570,6 @@ dbit_trap_20: >> >> idtlbt pte,prot >> >> - tlb_unlock0 spc,t0 >> rfir >> nop >> #endif >> >> Signed-off-by: John David Anglin <dave.anglin@bell.net> >> >>
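As an aside, the byte-store trick in the patch's update_accessed macro can be pictured in C roughly as below. The byte offset and bit mask are placeholders, not the real parisc PTE layout: the point is that a full read-modify-write of the whole PTE could silently lose a concurrent dirty-bit update, while storing only the byte that holds the accessed bit cannot clobber a dirty bit kept in a different byte - although, as Mikulas notes, it still races with any bits that share the accessed bit's byte.

#include <stdint.h>

#define ACCESSED_BYTE	6	/* assumed byte offset within the 8-byte PTE */
#define ACCESSED_MASK	0x20	/* assumed bit position within that byte */

/* Sketch, not the kernel code: mark a PTE accessed by writing only the
 * byte that contains the accessed bit. */
static inline void mark_accessed(volatile uint64_t *pte)
{
	volatile uint8_t *b = (volatile uint8_t *)pte + ACCESSED_BYTE;

	/* don't needlessly dirty the cache line if the bit is already set */
	if (!(*b & ACCESSED_MASK))
		*b |= ACCESSED_MASK;	/* touches this byte only; a dirty bit
					 * kept in a different byte can never
					 * be overwritten by this store */
}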
On Thu, 20 Sep 2018, John David Anglin wrote:

> You ask good questions.  If the races that you describe are possible, this
> argues for continuing to use the lock.  It has never been clear to me
> whether the kernel changes page table entries while code using the entries
> or not.

The kernel doesn't stop processes while changing the pagetables - so the
change may happen simultaneously with another CPU accessing the PTE that
is being changed.

On x86 the kernel changes the pagetable and then broadcasts an IPI to the
other processors to flush the TLB.

On parisc, the request to flush the TLB may be broadcast by the hardware -
but we need the lock in the TLB handler to make sure that it doesn't load
an entry after it was invalidated.

> On 2018-09-20 10:42 AM, Mikulas Patocka wrote:
> > But there's code like this that needs to be changed. It needs to first set
> > the PTE and then purge the tlb - otherwise the old pte value could be
> > reloaded by a different CPU after purge_tlb_entries() because the TLB
> > reload code no longer takes the lock.
> >
> > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
> > {
> > 	unsigned long flags;
> > 	spin_lock_irqsave(&pa_tlb_lock, flags);
> > 	purge_tlb_entries(mm, addr);
> > 	set_pte(ptep, pte_wrprotect(*ptep));
> > 	spin_unlock_irqrestore(&pa_tlb_lock, flags);
> > }
> Should be tried.

But I think this isn't sufficient - see my other email. There's this race
possible if we drop the lock from the fault handler:

CPU1 (executing a page fault handler):
	load *pte from pagetable

CPU2 (executing ptep_set_wrprotect):
	spin_lock_irqsave(&pa_tlb_lock, flags);
	set_pte(ptep, pte_wrprotect(*ptep));
	purge_tlb_entries(mm, addr);
	spin_unlock_irqrestore(&pa_tlb_lock, flags);

CPU1 (continuing the page fault handler):
	insert pte into TLB - now CPU1 is running with a stale pte entry in the TLB

Does anyone have an idea how HP-UX handles TLB faults?

Mikulas
On 2018-09-20 12:11 PM, Mikulas Patocka wrote:
> On Thu, 20 Sep 2018, Mikulas Patocka wrote:
>
>> But there's code like this that needs to be changed. It needs to first set
>> the PTE and then purge the tlb - otherwise the old pte value could be
>> reloaded by a different CPU after purge_tlb_entries() because the TLB
>> reload code no longer takes the lock.
>>
>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
>> {
>> 	unsigned long flags;
>> 	spin_lock_irqsave(&pa_tlb_lock, flags);
>> 	purge_tlb_entries(mm, addr);
>> 	set_pte(ptep, pte_wrprotect(*ptep));
>> 	spin_unlock_irqrestore(&pa_tlb_lock, flags);
>> }
>>
>> The accessed bit is in the same byte as the present bit, so what will
>> happen if the TLB fault races with pte clearing?
>>
>> Linux also encodes the swap location in non-present PTEs - could that race
>> with the update of the byte containing the accessed bit?
>>
>> Perhaps the best solution would be to store the accessed and dirty flags
>> in the two topmost bytes of the PTE and make sure that these bytes are not
>> used for anything else.
>>
>> Mikulas
> And I realized that even if you don't modify the PTE, dropping the lock
> from the TLB handler is wrong.
>
> If you drop the lock, the CPU can read valid PTE, then wait unlimited
> amount of time (due to some microarchitectural event) and then insert the
> PTE into TLB - long after the PTE was cleared and flushed by some other
> CPU.
>
> If you drop the lock, you must stop relying on the instructions that
> broadcast TLB shootdown and you must send IPIs to flush TLBs just like on
> x86.

I don't believe the lock helps with the insert.  As far as I can tell from
the arch, TLB inserts are not considered accesses.  So, the insert can't be
contained inside the lock due to instruction reordering.  We need an access
dependency, but idtlbt, for example, has no output register.

After the current lock is taken, the only instruction that can cause a
significant microarchitectural event is the reload of the PTE.  We might
get an instruction TLB fault, particularly on the insert, but that seems
unlikely.  I think this suggests that the lock release should remain after
the insert, as the release might cause an event.

The lock was originally just for the instructions that broadcast a TLB
purge.  Using IPIs to do TLB flushes is another can of worms as they
require interrupts.  Only PA 2.0 has local purges, and doing a full TLB
shootdown takes a long time.
On Thu, 20 Sep 2018, John David Anglin wrote: > On 2018-09-20 12:11 PM, Mikulas Patocka wrote: > > > > On Thu, 20 Sep 2018, Mikulas Patocka wrote: > > > > > But there's code like this that needs to be changed. It needs to first set > > > the PTE and then purge the tlb - otherwise the old pte value could be > > > reloaded by a different CPU after purge_tlb_entries() because the TLB > > > reload code no longer takes the lock. > > > > > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long > > > addr, pte_t *ptep) > > > { > > > unsigned long flags; > > > spin_lock_irqsave(&pa_tlb_lock, flags); > > > purge_tlb_entries(mm, addr); > > > set_pte(ptep, pte_wrprotect(*ptep)); > > > spin_unlock_irqrestore(&pa_tlb_lock, flags); > > > } > > > > > > > > > > > > The accessed bit is in the same byte as the present bit, so what will > > > happen if the TLB fault races with pte clearing? > > > > > > Linux also encodes the swap location in non-present PTEs - could that race > > > with the update of the byte containing the accessed bit? > > > > > > Perhaps the best solution would be to store the accessed and dirty flags > > > in the two topmost bytes of the PTE and make sure that these bytes are not > > > used for anything else. > > > > > > Mikulas > > And I realized that even if you don't modify the PTE, dropping the lock > > from the TLB handler is wrong. > > > > If you drop the lock, the CPU can read valid PTE, then wait unlimited > > amount of time (due to some microarchitectural event) and then insert the > > PTE into TLB - long after the PTE was cleared and flushed by some other > > CPU. > > > > If you drop the lock, you must stop relying on the instructions that > > broadcast TLB shootdown and you must send IPIs to flush TLBs just like on > > x86. > > I don't believe the lock helps with the insert. As far as I can tell If you insert into TLB without holding the lock, you may be inserting arbitrarily old value. Mikulas
On 2018-09-20 1:40 PM, Mikulas Patocka wrote: > > On Thu, 20 Sep 2018, John David Anglin wrote: > >> You ask good questions. If the races that you describe are possible, this >> argues for continuing >> to use the lock. It has never been clear to me whether the kernel changes >> page table entries >> while code using the entries or not. > The kernel doesn't stop processes while changing the pagetables - so the > change may happen simultaneously with another CPU accessing the PTE that > is being changed. Okay. > > In x86 the kernel changes the pagetable and then broadcasts an IPI to > other processors to flush the TLB. > > On parisc, the request to flush the TLB may be broadcasted by the hardware > - but we need the lock in the TLB handler to make sure that it doesn't > load an entry after it was invalidated. > >> On 2018-09-20 10:42 AM, Mikulas Patocka wrote: >>> But there's code like this that needs to be changed. It needs to first set >>> the PTE and then purge the tlb - otherwise the old pte value could be >>> reloaded by a different CPU after purge_tlb_entries() because the TLB >>> reload code no longer takes the lock. >>> >>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long >>> addr, pte_t *ptep) >>> { >>> unsigned long flags; >>> spin_lock_irqsave(&pa_tlb_lock, flags); >>> purge_tlb_entries(mm, addr); >>> set_pte(ptep, pte_wrprotect(*ptep)); >>> spin_unlock_irqrestore(&pa_tlb_lock, flags); >>> } >> Should be tried. > But I think this isn't sufficient - see my other email. There's this race > possible if we drop the lock from the fault handler: > > CPU1 (executing a page fault handler): > load *pte from pagetable > > CPU2 (executing ptep_set_wrprotect) > spin_lock_irqsave(&pa_tlb_lock, flags); > set_pte(ptep, pte_wrprotect(*ptep)); > purge_tlb_entries(mm, addr); > spin_unlock_irqrestore(&pa_tlb_lock, flags); > > CPU1 (continuing the page fault handler) > insert pte into TLB - now CPU1 is running with stale pte entry in TLB See my comment about locking and TLB inserts. I think we have fairly deterministic timing in TLB handler but the lock doesn't ensure that the release won't occur occur before the insert. Here, I'm assuming we still lock in TLB handler. I added the purge_tlb_entries code and it might be redundant. Maybe it makes the potential for a race worse? > > > Does anyone have an idea how does HP-UX handle the TLB faults? I can disassemble HP-UX but I'm not sure if it will help. I know the standard spin lock code uses waiters. Dave
On 2018-09-20 2:39 PM, Mikulas Patocka wrote: > > On Thu, 20 Sep 2018, John David Anglin wrote: > >> On 2018-09-20 12:11 PM, Mikulas Patocka wrote: >>> On Thu, 20 Sep 2018, Mikulas Patocka wrote: >>> >>>> But there's code like this that needs to be changed. It needs to first set >>>> the PTE and then purge the tlb - otherwise the old pte value could be >>>> reloaded by a different CPU after purge_tlb_entries() because the TLB >>>> reload code no longer takes the lock. >>>> >>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long >>>> addr, pte_t *ptep) >>>> { >>>> unsigned long flags; >>>> spin_lock_irqsave(&pa_tlb_lock, flags); >>>> purge_tlb_entries(mm, addr); >>>> set_pte(ptep, pte_wrprotect(*ptep)); >>>> spin_unlock_irqrestore(&pa_tlb_lock, flags); >>>> } >>>> >>>> >>>> >>>> The accessed bit is in the same byte as the present bit, so what will >>>> happen if the TLB fault races with pte clearing? >>>> >>>> Linux also encodes the swap location in non-present PTEs - could that race >>>> with the update of the byte containing the accessed bit? >>>> >>>> Perhaps the best solution would be to store the accessed and dirty flags >>>> in the two topmost bytes of the PTE and make sure that these bytes are not >>>> used for anything else. >>>> >>>> Mikulas >>> And I realized that even if you don't modify the PTE, dropping the lock >>> from the TLB handler is wrong. >>> >>> If you drop the lock, the CPU can read valid PTE, then wait unlimited >>> amount of time (due to some microarchitectural event) and then insert the >>> PTE into TLB - long after the PTE was cleared and flushed by some other >>> CPU. >>> >>> If you drop the lock, you must stop relying on the instructions that >>> broadcast TLB shootdown and you must send IPIs to flush TLBs just like on >>> x86. >> I don't believe the lock helps with the insert. As far as I can tell > If you insert into TLB without holding the lock, you may be inserting > arbitrarily old value. There is only a problem if this is done after purge. On PA 2.0, there are two reorder buffers - one for accesses and the other for integer operations. Some instructions go in both. The arch does not say TLB inserts are accesses. TLB purges are accesses. So, integer operations can leak from a spin lock except when there is a dependency on an access.
On Thu, 20 Sep 2018, John David Anglin wrote:

> On 2018-09-20 1:40 PM, Mikulas Patocka wrote:
> >
> > But I think this isn't sufficient - see my other email. There's this race
> > possible if we drop the lock from the fault handler:
> >
> > CPU1 (executing a page fault handler):
> > load *pte from pagetable
> >
> > CPU2 (executing ptep_set_wrprotect)
> > spin_lock_irqsave(&pa_tlb_lock, flags);
> > set_pte(ptep, pte_wrprotect(*ptep));
> > purge_tlb_entries(mm, addr);
> > spin_unlock_irqrestore(&pa_tlb_lock, flags);
> >
> > CPU1 (continuing the page fault handler)
> > insert pte into TLB - now CPU1 is running with stale pte entry in TLB
>
> See my comment about locking and TLB inserts.  I think we have fairly
> deterministic timing in TLB handler

Cache misses take hundreds of cycles. You'd only have deterministic
behavior if you locked the TLB handler in the cache.

> but the lock doesn't ensure that the release won't occur before
> the insert.  Here, I'm assuming we still lock in TLB handler.

In the current state (with the lock), this race condition can't happen. It
would be:

CPU1 (executing a page fault handler):
	lock pa_tlb_lock
	load *pte from pagetable
	insert pte into TLB
	unlock pa_tlb_lock

CPU2 (executing ptep_set_wrprotect):
	spin_lock_irqsave(&pa_tlb_lock, flags);
	set_pte(ptep, pte_wrprotect(*ptep));
	purge_tlb_entries(mm, addr);
	spin_unlock_irqrestore(&pa_tlb_lock, flags);

--- the purge_tlb_entries function would purge the translation that was
loaded by CPU1, so there will be no stale entry in the TLB. The lock is
needed even if the TLB handler doesn't modify the pagetable.

Mikulas
On Thu, 20 Sep 2018, John David Anglin wrote: > On 2018-09-20 2:39 PM, Mikulas Patocka wrote: > > > > On Thu, 20 Sep 2018, John David Anglin wrote: > > > > > On 2018-09-20 12:11 PM, Mikulas Patocka wrote: > > > > On Thu, 20 Sep 2018, Mikulas Patocka wrote: > > > > > > > > > But there's code like this that needs to be changed. It needs to first > > > > > set > > > > > the PTE and then purge the tlb - otherwise the old pte value could be > > > > > reloaded by a different CPU after purge_tlb_entries() because the TLB > > > > > reload code no longer takes the lock. > > > > > > > > > > static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned > > > > > long > > > > > addr, pte_t *ptep) > > > > > { > > > > > unsigned long flags; > > > > > spin_lock_irqsave(&pa_tlb_lock, flags); > > > > > purge_tlb_entries(mm, addr); > > > > > set_pte(ptep, pte_wrprotect(*ptep)); > > > > > spin_unlock_irqrestore(&pa_tlb_lock, flags); > > > > > } > > > > > > > > > > > > > > > > > > > > The accessed bit is in the same byte as the present bit, so what will > > > > > happen if the TLB fault races with pte clearing? > > > > > > > > > > Linux also encodes the swap location in non-present PTEs - could that > > > > > race > > > > > with the update of the byte containing the accessed bit? > > > > > > > > > > Perhaps the best solution would be to store the accessed and dirty > > > > > flags > > > > > in the two topmost bytes of the PTE and make sure that these bytes are > > > > > not > > > > > used for anything else. > > > > > > > > > > Mikulas > > > > And I realized that even if you don't modify the PTE, dropping the lock > > > > from the TLB handler is wrong. > > > > > > > > If you drop the lock, the CPU can read valid PTE, then wait unlimited > > > > amount of time (due to some microarchitectural event) and then insert > > > > the > > > > PTE into TLB - long after the PTE was cleared and flushed by some other > > > > CPU. > > > > > > > > If you drop the lock, you must stop relying on the instructions that > > > > broadcast TLB shootdown and you must send IPIs to flush TLBs just like > > > > on > > > > x86. > > > I don't believe the lock helps with the insert. As far as I can tell > > > > If you insert into TLB without holding the lock, you may be inserting > > arbitrarily old value. > > There is only a problem if this is done after purge. Yes - and it may happen - I've posted an example: https://www.spinics.net/lists/linux-parisc/msg09131.html > On PA 2.0, there are two reorder buffers - one for accesses and the > other for integer operations. Some instructions go in both. The arch > does not say TLB inserts are accesses. TLB purges are accesses. So, > integer operations can leak from a spin lock except when there is a > dependency on an access. This problem happens even if the CPU didn't reorder instructions at all. The TLB handler first reads a PTE from memory and then inserts it into the TLB. These two operations (load and insert) are not atomic. Between the load and insert, the PTE value in memory may change arbitrarily. Mikulas
On 2018-09-20 3:25 PM, Mikulas Patocka wrote:
> On Thu, 20 Sep 2018, John David Anglin wrote:
>
>> On 2018-09-20 1:40 PM, Mikulas Patocka wrote:
>>> But I think this isn't sufficient - see my other email. There's this race
>>> possible if we drop the lock from the fault handler:
>>>
>>> CPU1 (executing a page fault handler):
>>> load *pte from pagetable
>>>
>>> CPU2 (executing ptep_set_wrprotect)
>>> spin_lock_irqsave(&pa_tlb_lock, flags);
>>> set_pte(ptep, pte_wrprotect(*ptep));
>>> purge_tlb_entries(mm, addr);
>>> spin_unlock_irqrestore(&pa_tlb_lock, flags);
>>>
>>> CPU1 (continuing the page fault handler)
>>> insert pte into TLB - now CPU1 is running with stale pte entry in TLB
>> See my comment about locking and TLB inserts.  I think we have fairly
>> deterministic timing in TLB handler
> Cache misses take hundreds of cycles. You'd only have deterministic
> behavior if you locked the TLB handler in the cache.

Maybe the handlers need to be cache line aligned. That would probably help,
but with aliasing a line could still get flushed. I don't think there's a
way to lock lines in the cache.

>> but the lock doesn't ensure that the release won't occur before
>> the insert.  Here, I'm assuming we still lock in TLB handler.
> In the current state (with the lock), this race condition can't happen. It
> would be:
>
> CPU1 (executing a page fault handler):
> lock pa_tlb_lock
> load *pte from pagetable
> insert pte into TLB
> unlock pa_tlb_lock

The above two operations (the insert and the unlock) might be interchanged
when viewed from CPU2. Is there time for CPU2 to get the lock and purge the
TLB entry before the insert? The unlock operation will execute after the
last access but will not be retired until after the insert.

>
> CPU2 (executing ptep_set_wrprotect)
> spin_lock_irqsave(&pa_tlb_lock, flags);
> set_pte(ptep, pte_wrprotect(*ptep));
> purge_tlb_entries(mm, addr);
> spin_unlock_irqrestore(&pa_tlb_lock, flags);
>
> --- the purge_tlb_entries function would purge the translation that was
> loaded by CPU1, so there will be no stale entry in the TLB. The lock is
> needed even if the TLB handler doesn't modify the pagetable.

I understand the issue. I'm just not sure the lock solves the issue on
PA 2.0. Maybe that's why local purge instructions were added in PA 2.0.
If that's the case, then the purge_tlb_entries line needs to be after the
unlock. You are convincing me that using IPI and local purges may be the
only correct solution.

I'll run some tests with the locks in the TLB handler and the routines in
pgtable.h updated.
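For illustration, a rough sketch of the IPI-plus-local-purge approach being discussed. purge_tlb_entry_local() is a made-up name standing in for a non-broadcast purge of one address on the executing CPU; on_each_cpu() is the generic kernel IPI helper. This is a sketch of the idea, not a proposed patch:

#include <linux/smp.h>
#include <asm/pgtable.h>

/* Assumed helper, not in the tree: purge one address on the local CPU
 * without broadcasting the purge to other CPUs. */
void purge_tlb_entry_local(struct mm_struct *mm, unsigned long addr);

struct tlb_shootdown_args {
	struct mm_struct *mm;
	unsigned long addr;
};

static void tlb_shootdown_one(void *data)
{
	struct tlb_shootdown_args *a = data;

	purge_tlb_entry_local(a->mm, a->addr);	/* local (non-broadcast) purge */
}

/* Update the PTE first, then IPI every CPU to drop the stale translation
 * locally, so a lockless TLB handler can only ever load the new value. */
static void ptep_wrprotect_with_ipi(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	struct tlb_shootdown_args a = { .mm = mm, .addr = addr };

	set_pte(ptep, pte_wrprotect(*ptep));
	on_each_cpu(tlb_shootdown_one, &a, 1);	/* wait for all CPUs to purge */
}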
On 2018-09-20 3:30 PM, Mikulas Patocka wrote: > > On Thu, 20 Sep 2018, John David Anglin wrote: > >> On 2018-09-20 2:39 PM, Mikulas Patocka wrote: >>> On Thu, 20 Sep 2018, John David Anglin wrote: >>> >>>> On 2018-09-20 12:11 PM, Mikulas Patocka wrote: >>>>> On Thu, 20 Sep 2018, Mikulas Patocka wrote: >>>>> >>>>>> But there's code like this that needs to be changed. It needs to first >>>>>> set >>>>>> the PTE and then purge the tlb - otherwise the old pte value could be >>>>>> reloaded by a different CPU after purge_tlb_entries() because the TLB >>>>>> reload code no longer takes the lock. >>>>>> >>>>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned >>>>>> long >>>>>> addr, pte_t *ptep) >>>>>> { >>>>>> unsigned long flags; >>>>>> spin_lock_irqsave(&pa_tlb_lock, flags); >>>>>> purge_tlb_entries(mm, addr); >>>>>> set_pte(ptep, pte_wrprotect(*ptep)); >>>>>> spin_unlock_irqrestore(&pa_tlb_lock, flags); >>>>>> } >>>>>> >>>>>> >>>>>> >>>>>> The accessed bit is in the same byte as the present bit, so what will >>>>>> happen if the TLB fault races with pte clearing? >>>>>> >>>>>> Linux also encodes the swap location in non-present PTEs - could that >>>>>> race >>>>>> with the update of the byte containing the accessed bit? >>>>>> >>>>>> Perhaps the best solution would be to store the accessed and dirty >>>>>> flags >>>>>> in the two topmost bytes of the PTE and make sure that these bytes are >>>>>> not >>>>>> used for anything else. >>>>>> >>>>>> Mikulas >>>>> And I realized that even if you don't modify the PTE, dropping the lock >>>>> from the TLB handler is wrong. >>>>> >>>>> If you drop the lock, the CPU can read valid PTE, then wait unlimited >>>>> amount of time (due to some microarchitectural event) and then insert >>>>> the >>>>> PTE into TLB - long after the PTE was cleared and flushed by some other >>>>> CPU. >>>>> >>>>> If you drop the lock, you must stop relying on the instructions that >>>>> broadcast TLB shootdown and you must send IPIs to flush TLBs just like >>>>> on >>>>> x86. >>>> I don't believe the lock helps with the insert. As far as I can tell >>> If you insert into TLB without holding the lock, you may be inserting >>> arbitrarily old value. >> There is only a problem if this is done after purge. > Yes - and it may happen - I've posted an example: > https://www.spinics.net/lists/linux-parisc/msg09131.html > >> On PA 2.0, there are two reorder buffers - one for accesses and the >> other for integer operations. Some instructions go in both. The arch >> does not say TLB inserts are accesses. TLB purges are accesses. So, >> integer operations can leak from a spin lock except when there is a >> dependency on an access. > This problem happens even if the CPU didn't reorder instructions at all. > The TLB handler first reads a PTE from memory and then inserts it into the > TLB. These two operations (load and insert) are not atomic. Between the > load and insert, the PTE value in memory may change arbitrarily. Yes. I believe that we probably can't serialize purge and insert operations with a spin lock. That's probably why we still see some random segmentation faults.
On Thu, 20 Sep 2018, John David Anglin wrote: > On 2018-09-20 3:30 PM, Mikulas Patocka wrote: > > > > On Thu, 20 Sep 2018, John David Anglin wrote: > > > > > On 2018-09-20 2:39 PM, Mikulas Patocka wrote: > > > > On Thu, 20 Sep 2018, John David Anglin wrote: > > > > > > > > > On 2018-09-20 12:11 PM, Mikulas Patocka wrote: > > > > > > On Thu, 20 Sep 2018, Mikulas Patocka wrote: > > > > > > > > > > > > > But there's code like this that needs to be changed. It needs to > > > > > > > first > > > > > > > set > > > > > > > the PTE and then purge the tlb - otherwise the old pte value could > > > > > > > be > > > > > > > reloaded by a different CPU after purge_tlb_entries() because the > > > > > > > TLB > > > > > > > reload code no longer takes the lock. > > > > > > > > > > > > > > static inline void ptep_set_wrprotect(struct mm_struct *mm, > > > > > > > unsigned > > > > > > > long > > > > > > > addr, pte_t *ptep) > > > > > > > { > > > > > > > unsigned long flags; > > > > > > > spin_lock_irqsave(&pa_tlb_lock, flags); > > > > > > > purge_tlb_entries(mm, addr); > > > > > > > set_pte(ptep, pte_wrprotect(*ptep)); > > > > > > > spin_unlock_irqrestore(&pa_tlb_lock, flags); > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > The accessed bit is in the same byte as the present bit, so what > > > > > > > will > > > > > > > happen if the TLB fault races with pte clearing? > > > > > > > > > > > > > > Linux also encodes the swap location in non-present PTEs - could > > > > > > > that > > > > > > > race > > > > > > > with the update of the byte containing the accessed bit? > > > > > > > > > > > > > > Perhaps the best solution would be to store the accessed and dirty > > > > > > > flags > > > > > > > in the two topmost bytes of the PTE and make sure that these bytes > > > > > > > are > > > > > > > not > > > > > > > used for anything else. > > > > > > > > > > > > > > Mikulas > > > > > > And I realized that even if you don't modify the PTE, dropping the > > > > > > lock > > > > > > from the TLB handler is wrong. > > > > > > > > > > > > If you drop the lock, the CPU can read valid PTE, then wait > > > > > > unlimited > > > > > > amount of time (due to some microarchitectural event) and then > > > > > > insert > > > > > > the > > > > > > PTE into TLB - long after the PTE was cleared and flushed by some > > > > > > other > > > > > > CPU. > > > > > > > > > > > > If you drop the lock, you must stop relying on the instructions that > > > > > > broadcast TLB shootdown and you must send IPIs to flush TLBs just > > > > > > like > > > > > > on > > > > > > x86. > > > > > I don't believe the lock helps with the insert. As far as I can tell > > > > If you insert into TLB without holding the lock, you may be inserting > > > > arbitrarily old value. > > > There is only a problem if this is done after purge. > > Yes - and it may happen - I've posted an example: > > https://www.spinics.net/lists/linux-parisc/msg09131.html > > > > > On PA 2.0, there are two reorder buffers - one for accesses and the > > > other for integer operations. Some instructions go in both. The arch > > > does not say TLB inserts are accesses. TLB purges are accesses. So, > > > integer operations can leak from a spin lock except when there is a > > > dependency on an access. > > This problem happens even if the CPU didn't reorder instructions at all. > > The TLB handler first reads a PTE from memory and then inserts it into the > > TLB. These two operations (load and insert) are not atomic. 
Between the > > load and insert, the PTE value in memory may change arbitrarily.
>
> Yes.  I believe that we probably can't serialize purge and insert
> operations with a spin lock.  That's probably why we still see some
> random segmentation faults.

The kmap code is clearly violating the PA-RISC specification.

If a userspace page is simultaneously accessed from a userspace process and
from kernel code that called kmap on it, it violates the PA-RISC
specification because the virtual addresses are not congruent. It may
result in data corruption or even machine checks.

kmap should return addresses that are congruent with the current userspace
mapping of the page.

Mikulas
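To make the congruence requirement concrete, here is a tiny sketch. SHM_COLOUR (from asm/shmparam.h) is assumed to be the relevant cache-aliasing boundary on parisc; the check only illustrates what a congruent kmap would have to guarantee:

#include <asm/shmparam.h>

/* Sketch: two virtual mappings of the same physical page are only safe to
 * use concurrently if they fall in the same cache congruence class, i.e.
 * their virtual addresses agree modulo the aliasing boundary. */
static inline int mappings_congruent(unsigned long kernel_vaddr,
				     unsigned long user_vaddr)
{
	return ((kernel_vaddr ^ user_vaddr) & (SHM_COLOUR - 1)) == 0;
}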
On Thu, 20 Sep 2018, John David Anglin wrote:

> > In the current state (with the lock), this race condition can't happen. It
> > would be:
> >
> > CPU1 (executing a page fault handler):
> > lock pa_tlb_lock
> > load *pte from pagetable
> > insert pte into TLB
> > unlock pa_tlb_lock
>
> The above two operations (the insert and the unlock) might be interchanged
> when viewed from CPU2. Is there time for CPU2 to get the lock and purge the
> TLB entry before the insert?

It's hard to say whether idtlbt is a serializing instruction or not. Maybe
it is, but the manual doesn't specify that.

Anyway, the current code has "sync" between the idtlbt and the unlock, so
they can't be interchanged.

> The unlock operation will execute after the last access but will not be
> retired until after the insert.
>
> > CPU2 (executing ptep_set_wrprotect)
> > spin_lock_irqsave(&pa_tlb_lock, flags);
> > set_pte(ptep, pte_wrprotect(*ptep));
> > purge_tlb_entries(mm, addr);
> > spin_unlock_irqrestore(&pa_tlb_lock, flags);
> >
> > --- the purge_tlb_entries function would purge the translation that was
> > loaded by CPU1, so there will be no stale entry in the TLB. The lock is
> > needed even if the TLB handler doesn't modify the pagetable.
>
> I understand the issue. I'm just not sure the lock solves the issue on
> PA 2.0. Maybe that's why local purge instructions were added in PA 2.0.
> If that's the case, then the purge_tlb_entries line needs to be after the
> unlock. You are convincing me that using IPI and local purges may be the
> only correct solution.

If we use IPIs, we can drop the lock from the fault handler.

> I'll run some tests with the locks in the TLB handler and the routines in
> pgtable.h updated.

Perhaps disassembling the HP-UX TLB handlers would be the best option :)
I suppose they selected the optimal implementation.

Mikulas
On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
> Anyway, the current code has "sync" between the idtlbt and the unlock, so
> they can't be interchanged.

I'm not so sure. The "sync" instruction enforces order on accesses and
cache flushes.

Dave
On 2018-09-20 6:42 PM, Mikulas Patocka wrote: > > On Thu, 20 Sep 2018, John David Anglin wrote: > >> On 2018-09-20 3:30 PM, Mikulas Patocka wrote: >>> On Thu, 20 Sep 2018, John David Anglin wrote: >>> >>>> On 2018-09-20 2:39 PM, Mikulas Patocka wrote: >>>>> On Thu, 20 Sep 2018, John David Anglin wrote: >>>>> >>>>>> On 2018-09-20 12:11 PM, Mikulas Patocka wrote: >>>>>>> On Thu, 20 Sep 2018, Mikulas Patocka wrote: >>>>>>> >>>>>>>> But there's code like this that needs to be changed. It needs to >>>>>>>> first >>>>>>>> set >>>>>>>> the PTE and then purge the tlb - otherwise the old pte value could >>>>>>>> be >>>>>>>> reloaded by a different CPU after purge_tlb_entries() because the >>>>>>>> TLB >>>>>>>> reload code no longer takes the lock. >>>>>>>> >>>>>>>> static inline void ptep_set_wrprotect(struct mm_struct *mm, >>>>>>>> unsigned >>>>>>>> long >>>>>>>> addr, pte_t *ptep) >>>>>>>> { >>>>>>>> unsigned long flags; >>>>>>>> spin_lock_irqsave(&pa_tlb_lock, flags); >>>>>>>> purge_tlb_entries(mm, addr); >>>>>>>> set_pte(ptep, pte_wrprotect(*ptep)); >>>>>>>> spin_unlock_irqrestore(&pa_tlb_lock, flags); >>>>>>>> } >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> The accessed bit is in the same byte as the present bit, so what >>>>>>>> will >>>>>>>> happen if the TLB fault races with pte clearing? >>>>>>>> >>>>>>>> Linux also encodes the swap location in non-present PTEs - could >>>>>>>> that >>>>>>>> race >>>>>>>> with the update of the byte containing the accessed bit? >>>>>>>> >>>>>>>> Perhaps the best solution would be to store the accessed and dirty >>>>>>>> flags >>>>>>>> in the two topmost bytes of the PTE and make sure that these bytes >>>>>>>> are >>>>>>>> not >>>>>>>> used for anything else. >>>>>>>> >>>>>>>> Mikulas >>>>>>> And I realized that even if you don't modify the PTE, dropping the >>>>>>> lock >>>>>>> from the TLB handler is wrong. >>>>>>> >>>>>>> If you drop the lock, the CPU can read valid PTE, then wait >>>>>>> unlimited >>>>>>> amount of time (due to some microarchitectural event) and then >>>>>>> insert >>>>>>> the >>>>>>> PTE into TLB - long after the PTE was cleared and flushed by some >>>>>>> other >>>>>>> CPU. >>>>>>> >>>>>>> If you drop the lock, you must stop relying on the instructions that >>>>>>> broadcast TLB shootdown and you must send IPIs to flush TLBs just >>>>>>> like >>>>>>> on >>>>>>> x86. >>>>>> I don't believe the lock helps with the insert. As far as I can tell >>>>> If you insert into TLB without holding the lock, you may be inserting >>>>> arbitrarily old value. >>>> There is only a problem if this is done after purge. >>> Yes - and it may happen - I've posted an example: >>> https://www.spinics.net/lists/linux-parisc/msg09131.html >>> >>>> On PA 2.0, there are two reorder buffers - one for accesses and the >>>> other for integer operations. Some instructions go in both. The arch >>>> does not say TLB inserts are accesses. TLB purges are accesses. So, >>>> integer operations can leak from a spin lock except when there is a >>>> dependency on an access. >>> This problem happens even if the CPU didn't reorder instructions at all. >>> The TLB handler first reads a PTE from memory and then inserts it into the >>> TLB. These two operations (load and insert) are not atomic. Between the >>> load and insert, the PTE value in memory may change arbitrarily. >> Yes. I believe that we probably can't serialize purge and insert >> operations with a spin lock. That's probably why we still see some >> random segmentation faults. 
> The kmap code is clearly violating the PA-RISC specification.
>
> If a userspace page is simultaneously accessed from a userspace process and
> from kernel code that called kmap on it, it violates the PA-RISC
> specification because the virtual addresses are not congruent. It may
> result in data corruption or even machine checks.
>
> kmap should return addresses that are congruent with the current userspace
> mapping of the page.

If you have thoughts on how to do that, let me know.

At the moment, we flush the userspace page before using the kernel page,
and we flush the kernel page when we are done. However, these flushes may
leave TLB entries which might corrupt the cache. There are TLB purges
afterwards, but there is a window where the TLB might do something bad.

I tried using tmpalias flushes but they caused lockups sometimes. The
tmpalias flushes suppress move-in.

Dave
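A sketch of the flush-around-kmap sequence described above, using the generic cache-flushing hooks; whether these helpers map one-to-one onto the parisc code paths in question is an assumption:

#include <linux/highmem.h>
#include <asm/cacheflush.h>

/* Sketch: flush the user-space alias before touching the page through the
 * kernel mapping, and flush the kernel alias when done. */
static void *map_user_page(struct page *page)
{
	flush_dcache_page(page);		/* flush the userspace alias first */
	return kmap(page);			/* then use the kernel mapping */
}

static void unmap_user_page(struct page *page)
{
	flush_kernel_dcache_page(page);		/* flush the kernel alias when done */
	kunmap(page);
}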
On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
> Perhaps disassembling the HP-UX TLB handlers would be the best option :)
> I suppose they selected the optimal implementation.
This is what I see. The situation isn't very clear, as I believe the code may
get patched depending on the processor.

000000000002b0a0 <tbitmss_PCXU>:
   2b0a0: d9 19 03 e0   extrd,u,* r8,31,32,r25
   2b0a4: 0b 21 02 99   xor r1,r25,r25
   2b0a8: f3 19 0c 0c   depd,* r25,31,20,r24

000000000002b0ac <pdir_base_patch_021>:
   2b0ac: 20 20 00 0a   ldil L%500000,r1

000000000002b0b0 <pdir_shift_patch_021>:
   2b0b0: f0 21 00 00   depd,z,* r1,63,32,r1

000000000002b0b4 <pdir_mask_patch_021>:
   2b0b4: f0 31 04 a8   depd,* r17,58,24,r1
   2b0b8: 0c 20 10 d1   ldd 0(r1),r17

000000000002b0bc <tbitloop_PCXU>:
   2b0bc: bf 11 20 3a   cmpb,*<>,n r17,r24,2b0e0 <tbit_target_miss_PCXU>
   2b0c0: 0c 30 10 c8   ldd 8(r1),r8
   2b0c4: 50 29 00 20   ldd 10(r1),r9
   2b0c8: c4 48 60 3a   bb,*<,n r8,2,2b0ec <t_vioref_trap_PCXU>

000000000002b0cc <make_nop_if_split_TLB_2_0_10>:
   2b0cc: f5 0e 0d 3d   depdi,* 7,22,3,r8
   2b0d0: 05 09 18 00   idtlbt r9,r8
   2b0d4: 0c 28 12 d0   std r8,8(r1)
   2b0d8: 00 00 0c a0   rfi,r
   2b0dc: 08 00 02 40   nop

000000000002b0e0 <tbit_target_miss_PCXU>:
   2b0e0: 50 21 00 30   ldd 18(r1),r1
   2b0e4: bc 20 3f a7   cmpb,*<>,n r0,r1,2b0bc <tbitloop_PCXU>
   2b0e8: 0c 20 10 d1   ldd 0(r1),r17

000000000002b0ec <t_vioref_trap_PCXU>:
   2b0ec: 20 35 90 14   ldil L%aaa800,r1
   2b0f0: 48 21 00 28   ldw 14(r1),r1
   2b0f4: 08 01 22 40   or,= r1,r0,r0
   2b0f8: e8 20 c0 02   bv,n r0(r1)
   2b0fc: 03 96 18 40   mtctl r22,tr4
   2b100: 03 c2 18 40   mtctl rp,tr6
   2b104: e8 42 04 80   b,l 2f34c <$swap_shadows>,rp
   2b108: 03 5a 18 40   mtctl r26,tr2
   2b10c: 34 1a 00 02   ldi 1,r26
   2b110: 03 00 08 b6   mfctl tr0,r22
   2b114: 6a da 00 e0   stw r26,70(r22)
   2b118: 03 c0 08 a2   mfctl tr6,rp
   2b11c: 02 00 08 b6   mfctl itmr,r22
   2b120: 03 fe 18 40   mtctl sp,tr7
   2b124: 03 b6 18 40   mtctl r22,tr5
   2b128: 23 56 50 02   ldil L%16c800,r26
   2b12c: e3 40 2c 08   be 604(sr4,r26)
   2b130: 34 1a 00 2a   ldi 15,r26
   ...
On 2018-09-20 8:31 PM, John David Anglin wrote:
> On 2018-09-20 6:47 PM, Mikulas Patocka wrote:
>> Perhaps disassembling the HP-UX TLB handlers would be the best option :)
>> I suppose they selected the optimal implementation.
> This is what I see. The situation isn't very clear, as I believe the code
> may get patched depending on the processor.
>
> 000000000002b0a0 <tbitmss_PCXU>:
>    2b0a0: d9 19 03 e0   extrd,u,* r8,31,32,r25
>    2b0a4: 0b 21 02 99   xor r1,r25,r25
>    2b0a8: f3 19 0c 0c   depd,* r25,31,20,r24
>
> 000000000002b0ac <pdir_base_patch_021>:
>    2b0ac: 20 20 00 0a   ldil L%500000,r1
>
> 000000000002b0b0 <pdir_shift_patch_021>:
>    2b0b0: f0 21 00 00   depd,z,* r1,63,32,r1
>
> 000000000002b0b4 <pdir_mask_patch_021>:
>    2b0b4: f0 31 04 a8   depd,* r17,58,24,r1
>    2b0b8: 0c 20 10 d1   ldd 0(r1),r17
>
> 000000000002b0bc <tbitloop_PCXU>:
>    2b0bc: bf 11 20 3a   cmpb,*<>,n r17,r24,2b0e0 <tbit_target_miss_PCXU>
>    2b0c0: 0c 30 10 c8   ldd 8(r1),r8
>    2b0c4: 50 29 00 20   ldd 10(r1),r9
>    2b0c8: c4 48 60 3a   bb,*<,n r8,2,2b0ec <t_vioref_trap_PCXU>
>
> 000000000002b0cc <make_nop_if_split_TLB_2_0_10>:
>    2b0cc: f5 0e 0d 3d   depdi,* 7,22,3,r8
>    2b0d0: 05 09 18 00   idtlbt r9,r8
>    2b0d4: 0c 28 12 d0   std r8,8(r1)
>    2b0d8: 00 00 0c a0   rfi,r
>    2b0dc: 08 00 02 40   nop
I would say HP-UX has a much better page table layout and lookup. The flags
are set by the "depdi" instruction and they are always written back. There's
no locking so I suspect HP-UX only changes the page directory when it's safe
to do so.

Dave
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index fa6b7c78f18a..7efce946ba04 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -202,7 +202,7 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 #define _PAGE_HUGE (1 << xlate_pabit(_PAGE_HPAGE_BIT))
 #define _PAGE_USER (1 << xlate_pabit(_PAGE_USER_BIT))
 
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _PAGE_KERNEL_RO (_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define _PAGE_KERNEL_EXEC (_PAGE_KERNEL_RO | _PAGE_EXEC)
@@ -227,22 +227,22 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 
 #ifndef __ASSEMBLY__
 
-#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED)
+#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE)
 /* Others seem to make this executable, I don't know if that's correct
    or not.  The stack is mapped this way though so this is necessary
    in the short term - dhd@linuxcare.com, 2000-08-08 */
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED)
-#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE | _PAGE_ACCESSED)
-#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED)
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ)
+#define PAGE_WRITEONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITE)
+#define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC)
 #define PAGE_COPY PAGE_EXECREAD
-#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED)
+#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
 #define PAGE_KERNEL __pgprot(_PAGE_KERNEL)
 #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RWX __pgprot(_PAGE_KERNEL_RWX)
 #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO)
 #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE)
-#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ)
+#define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_GATEWAY| _PAGE_READ)
 
 
 /*
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 1b4732e20137..8039241b669b 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -46,16 +46,6 @@
 	.level 2.0
 #endif
 
-	.import		pa_tlb_lock,data
-	.macro	load_pa_tlb_lock reg
-#if __PA_LDCW_ALIGNMENT > 4
-	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
-	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
-#else
-	load32	PA(pa_tlb_lock), \reg
-#endif
-	.endm
-
 	/* space_to_prot macro creates a prot id from a space id */
 
 #if (SPACEID_SHIFT) == 0
@@ -462,47 +452,17 @@
 	L2_ptep		\pgd,\pte,\index,\va,\fault
 	.endm
 
-	/* Acquire pa_tlb_lock lock and recheck page is still present. */
-	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
-#ifdef CONFIG_SMP
-	cmpib,COND(=),n	0,\spc,2f
-	load_pa_tlb_lock \tmp
-1:	LDCW		0(\tmp),\tmp1
-	cmpib,COND(=)	0,\tmp1,1b
-	nop
-	LDREG		0(\ptp),\pte
-	bb,<,n		\pte,_PAGE_PRESENT_BIT,2f
-	b		\fault
-	stw		\spc,0(\tmp)
-2:
-#endif
-	.endm
-
-	/* Release pa_tlb_lock lock without reloading lock address. */
-	.macro		tlb_unlock0	spc,tmp
-#ifdef CONFIG_SMP
-	or,COND(=)	%r0,\spc,%r0
-	sync
-	or,COND(=)	%r0,\spc,%r0
-	stw		\spc,0(\tmp)
-#endif
-	.endm
-
-	/* Release pa_tlb_lock lock. */
-	.macro		tlb_unlock1	spc,tmp
-#ifdef CONFIG_SMP
-	load_pa_tlb_lock \tmp
-	tlb_unlock0	\spc,\tmp
-#endif
-	.endm
-
-	/* Set the _PAGE_ACCESSED bit of the PTE.  Be clever and
-	 * don't needlessly dirty the cache line if it was already set */
+	/* Set the _PAGE_ACCESSED bit of the PTE without overwriting the
+	 * dirty bit.  Fortunately, it resides in a different byte of
+	 * the PTE than the dirty bit.  Be clever and don't needlessly
+	 * dirty the cache line if it was already set */
 	.macro		update_accessed	ptp,pte,tmp,tmp1
 	ldi		_PAGE_ACCESSED,\tmp1
 	or		\tmp1,\pte,\tmp
+	/* Extract aligned byte from PTE containing page accessed bit */
+	extru		\tmp,_PAGE_ACCESSED_BIT,8,\tmp
 	and,COND(<>)	\tmp1,\pte,%r0
-	STREG		\tmp,0(\ptp)
+	stb		\tmp,__BITS_PER_LONG/8-2(\tmp)
 	.endm
 
 	/* Set the dirty bit (and accessed bit).  No need to be
@@ -1167,14 +1127,12 @@ dtlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,dtlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	idtlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1193,14 +1151,12 @@ nadtlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,nadtlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	idtlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1221,7 +1177,6 @@ dtlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1234,7 +1189,6 @@ dtlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1254,7 +1208,6 @@ nadtlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1267,7 +1220,6 @@ nadtlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1287,7 +1239,6 @@ dtlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,dtlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,dtlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1296,7 +1247,6 @@ dtlb_miss_20:
 
 	idtlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1315,7 +1265,6 @@ nadtlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,nadtlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,nadtlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1324,7 +1273,6 @@ nadtlb_miss_20:
 
 	idtlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1424,14 +1372,12 @@ itlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	iitlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1448,14 +1394,12 @@ naitlb_miss_20w:
 
 	L3_ptep		ptp,pte,t0,va,naitlb_check_alias_20w
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20w
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	iitlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1476,7 +1420,6 @@ itlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1489,7 +1432,6 @@ itlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1500,7 +1442,6 @@ naitlb_miss_11:
 
 	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_11
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_11
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1513,7 +1454,6 @@ naitlb_miss_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1534,7 +1474,6 @@ itlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,itlb_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,itlb_fault
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1543,7 +1482,6 @@ itlb_miss_20:
 
 	iitlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1554,7 +1492,6 @@ naitlb_miss_20:
 
 	L2_ptep		ptp,pte,t0,va,naitlb_check_alias_20
 
-	tlb_lock	spc,ptp,pte,t0,t1,naitlb_check_alias_20
 	update_accessed	ptp,pte,t0,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1563,7 +1500,6 @@ naitlb_miss_20:
 
 	iitlbt		pte,prot
 
-	tlb_unlock1	spc,t0
 	rfir
 	nop
 
@@ -1586,14 +1522,12 @@ dbit_trap_20w:
 
 	L3_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb	spc,pte,prot,t1
 
 	idtlbt		pte,prot
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 #else
@@ -1606,7 +1540,6 @@ dbit_trap_11:
 
 	L2_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb_11	spc,pte,prot
@@ -1619,7 +1552,6 @@ dbit_trap_11:
 
 	mtsp		t1, %sr1	/* Restore sr1 */
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 
@@ -1630,7 +1562,6 @@ dbit_trap_20:
 
 	L2_ptep		ptp,pte,t0,va,dbit_fault
 
-	tlb_lock	spc,ptp,pte,t0,t1,dbit_fault
 	update_dirty	ptp,pte,t1
 
 	make_insert_tlb	spc,pte,prot,t1
@@ -1639,7 +1570,6 @@ dbit_trap_20:
 
 	idtlbt		pte,prot
 
-	tlb_unlock0	spc,t0
 	rfir
 	nop
 #endif
Locking was introduced into the TLB handlers primarily to ensure that updates
to the accessed and dirty bits in PTE entries were atomic. However, this has
a significant impact on performance. Also, it turned out that the user page
defines in pgtable.h always set the accessed bit, so it wasn't necessary to
update the accessed bit at all. The impact of this is that the kernel doesn't
know if a page has been accessed or not.

This patch removes the locking from the TLB handlers. We update the page
accessed bit in the page table entries using a stb instruction that doesn't
affect the dirty bit. Thus, an access can't clobber the dirty bit.

With this change, the accessed bit in user PTE entries is now updated when a
page is accessed and hopefully this will improve performance.

Signed-off-by: John David Anglin <dave.anglin@bell.net>
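
The central claim - that a byte-wide store to the byte holding the accessed bit cannot clobber the dirty bit - can be sketched in C; the 8-byte PTE and the byte offset below are illustrative assumptions (mirroring the __BITS_PER_LONG/8-2 offset in the patch), not the exact parisc layout:

    #include <stdint.h>

    #define ACCESSED_BYTE   6       /* assumed: __BITS_PER_LONG/8 - 2 on 64-bit */

    /* Sketch of the effect of the handler's stb: only one byte of the PTE
     * is written back, so a concurrent update to the dirty bit, which lives
     * in a different byte, can never be lost by this store. */
    static void mark_accessed(uint64_t *pte, uint8_t byte_with_accessed_bit)
    {
            ((uint8_t *)pte)[ACCESSED_BYTE] = byte_with_accessed_bit;
    }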