Message ID:    20230309180213.180263-2-hbathini@linux.ibm.com (mailing list archive)
State:         Not Applicable
Delegated to:  BPF
Series:        enable bpf_prog_pack allocator for powerpc
On 09/03/2023 at 19:02, Hari Bathini wrote:
> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce function patch_instructions()
> that patches multiple instructions at one go while setting up the pte,
> clearing the pte and flushing the tlb only once per page range of
> instructions. Observed ~5X improvement in speed of execution using
> patch_instructions() over patch_instruction(), when more instructions
> are to be patched.

I get a 13% degradation in the time needed to activate ftrace on a
powerpc 8xx.

Before your patch, activating ftrace takes 550k timebase ticks. After
your patch it takes 620k timebase ticks.

Christophe
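For readers unfamiliar with the metric: "timebase ticks" are values read from
the powerpc timebase register, so a measurement of this kind can be taken with
a pair of get_tb() calls around the operation. Below is a minimal sketch only;
the measured operation and the helper name are placeholders, not the
instrumentation Christophe actually used.

#include <linux/kernel.h>
#include <asm/time.h>		/* get_tb(): read the timebase register */

/* Placeholder for whatever operation is being measured (hypothetical). */
extern void do_measured_operation(void);

static void __maybe_unused time_in_tb_ticks(void)
{
	u64 start, stop;

	start = get_tb();		/* timebase before */
	do_measured_operation();	/* e.g. switching ftrace on */
	stop = get_tb();		/* timebase after */

	pr_info("operation took %llu timebase ticks\n",
		(unsigned long long)(stop - start));
}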
Le 10/03/2023 à 19:26, Christophe Leroy a écrit : > > > Le 09/03/2023 à 19:02, Hari Bathini a écrit : >> patch_instruction() entails setting up pte, patching the instruction, >> clearing the pte and flushing the tlb. If multiple instructions need >> to be patched, every instruction would have to go through the above >> drill unnecessarily. Instead, introduce function patch_instructions() >> that patches multiple instructions at one go while setting up the pte, >> clearing the pte and flushing the tlb only once per page range of >> instructions. Observed ~5X improvement in speed of execution using >> patch_instructions() over patch_instructions(), when more instructions >> are to be patched. > > I get a 13% degradation on the time needed to activate ftrace on a > powerpc 8xx. > > Before your patch, activation ftrace takes 550k timebase ticks. After > your patch it takes 620k timebase ticks. > More details about the problem: Before your patch, patch_instruction() is a simple, stackless function (Note that the first branch is noped out after startup). 00000254 <patch_instruction>: 254: 48 00 00 6c b 2c0 <patch_instruction+0x6c> 258: 7c e0 00 a6 mfmsr r7 25c: 7c 51 13 a6 mtspr 81,r2 260: 3d 40 00 00 lis r10,0 262: R_PPC_ADDR16_HA .data 264: 39 4a 00 00 addi r10,r10,0 266: R_PPC_ADDR16_LO .data 268: 7c 69 1b 78 mr r9,r3 26c: 3d 29 40 00 addis r9,r9,16384 270: 81 0a 00 08 lwz r8,8(r10) 274: 55 29 00 26 rlwinm r9,r9,0,0,19 278: 81 4a 00 04 lwz r10,4(r10) 27c: 61 29 01 25 ori r9,r9,293 280: 91 28 00 00 stw r9,0(r8) 284: 55 49 00 26 rlwinm r9,r10,0,0,19 288: 50 6a 05 3e rlwimi r10,r3,0,20,31 28c: 90 8a 00 00 stw r4,0(r10) 290: 7c 00 50 6c dcbst 0,r10 294: 7c 00 04 ac hwsync 298: 7c 00 1f ac icbi 0,r3 29c: 7c 00 04 ac hwsync 2a0: 4c 00 01 2c isync 2a4: 38 60 00 00 li r3,0 2a8: 39 40 00 00 li r10,0 2ac: 91 48 00 00 stw r10,0(r8) 2b0: 7c 00 4a 64 tlbie r9,r0 2b4: 7c 00 04 ac hwsync 2b8: 7c e0 01 24 mtmsr r7 2bc: 4e 80 00 20 blr 2c0: 90 83 00 00 stw r4,0(r3) 2c4: 7c 00 18 6c dcbst 0,r3 2c8: 7c 00 04 ac hwsync 2cc: 7c 00 1f ac icbi 0,r3 2d0: 7c 00 04 ac hwsync 2d4: 4c 00 01 2c isync 2d8: 38 60 00 00 li r3,0 2dc: 4e 80 00 20 blr 2e0: 38 60 ff ff li r3,-1 2e4: 4b ff ff c4 b 2a8 <patch_instruction+0x54> 2e8: 38 60 ff ff li r3,-1 2ec: 4e 80 00 20 blr Once your patch is there, patch_instruction() becomes a function that has to step up a stack in order to call __do_patch_instructions(). And __do_patch_instructions() is quite a big function. 
0000022c <__do_patch_instructions>: 22c: 3d 20 00 00 lis r9,0 22e: R_PPC_ADDR16_HA .data 230: 39 29 00 00 addi r9,r9,0 232: R_PPC_ADDR16_LO .data 234: 81 69 00 04 lwz r11,4(r9) 238: 2c 05 00 00 cmpwi r5,0 23c: 81 89 00 08 lwz r12,8(r9) 240: 7c c3 32 14 add r6,r3,r6 244: 55 6b 00 26 rlwinm r11,r11,0,0,19 248: 38 00 00 00 li r0,0 24c: 54 6a 05 3e clrlwi r10,r3,20 250: 21 0a 10 00 subfic r8,r10,4096 254: 7d 03 42 14 add r8,r3,r8 258: 7c 69 1b 78 mr r9,r3 25c: 7f 88 30 40 cmplw cr7,r8,r6 260: 3d 29 40 00 addis r9,r9,16384 264: 55 29 00 26 rlwinm r9,r9,0,0,19 268: 61 29 01 25 ori r9,r9,293 26c: 91 2c 00 00 stw r9,0(r12) 270: 7d 4a 5b 78 or r10,r10,r11 274: 40 9d 00 08 ble cr7,27c <__do_patch_instructions+0x50> 278: 7c c8 33 78 mr r8,r6 27c: 7f 83 40 40 cmplw cr7,r3,r8 280: 40 9c 01 2c bge cr7,3ac <__do_patch_instructions+0x180> 284: 7c 69 18 f8 not r9,r3 288: 7d 28 4a 14 add r9,r8,r9 28c: 55 29 f7 fe rlwinm r9,r9,30,31,31 290: 7c e3 50 50 subf r7,r3,r10 294: 80 a4 00 00 lwz r5,0(r4) 298: 90 aa 00 00 stw r5,0(r10) 29c: 7c 00 50 6c dcbst 0,r10 2a0: 7c 00 04 ac hwsync 2a4: 7c 00 1f ac icbi 0,r3 2a8: 7c 00 04 ac hwsync 2ac: 4c 00 01 2c isync 2b0: 38 63 00 04 addi r3,r3,4 2b4: 40 82 00 08 bne 2bc <__do_patch_instructions+0x90> 2b8: 38 84 00 04 addi r4,r4,4 2bc: 7f 83 40 40 cmplw cr7,r3,r8 2c0: 40 9c 00 a4 bge cr7,364 <__do_patch_instructions+0x138> 2c4: 2f 89 00 00 cmpwi cr7,r9,0 2c8: 41 9e 00 38 beq cr7,300 <__do_patch_instructions+0xd4> 2cc: 7d 23 3a 14 add r9,r3,r7 2d0: 81 44 00 00 lwz r10,0(r4) 2d4: 91 49 00 00 stw r10,0(r9) 2d8: 7c 00 48 6c dcbst 0,r9 2dc: 7c 00 04 ac hwsync 2e0: 7c 00 1f ac icbi 0,r3 2e4: 7c 00 04 ac hwsync 2e8: 4c 00 01 2c isync 2ec: 38 63 00 04 addi r3,r3,4 2f0: 40 82 00 08 bne 2f8 <__do_patch_instructions+0xcc> 2f4: 38 84 00 04 addi r4,r4,4 2f8: 7f 83 40 40 cmplw cr7,r3,r8 2fc: 40 9c 00 68 bge cr7,364 <__do_patch_instructions+0x138> 300: 7d 23 3a 14 add r9,r3,r7 304: 81 44 00 00 lwz r10,0(r4) 308: 91 49 00 00 stw r10,0(r9) 30c: 7c 00 48 6c dcbst 0,r9 310: 7c 00 04 ac hwsync 314: 7c 00 1f ac icbi 0,r3 318: 7c 00 04 ac hwsync 31c: 4c 00 01 2c isync 320: 38 63 00 04 addi r3,r3,4 324: 7c 69 1b 78 mr r9,r3 328: 40 82 00 08 bne 330 <__do_patch_instructions+0x104> 32c: 38 84 00 04 addi r4,r4,4 330: 7d 49 3a 14 add r10,r9,r7 334: 80 a4 00 00 lwz r5,0(r4) 338: 90 aa 00 00 stw r5,0(r10) 33c: 7c 00 50 6c dcbst 0,r10 340: 7c 00 04 ac hwsync 344: 7c 00 4f ac icbi 0,r9 348: 7c 00 04 ac hwsync 34c: 4c 00 01 2c isync 350: 38 69 00 04 addi r3,r9,4 354: 7f 83 40 40 cmplw cr7,r3,r8 358: 40 82 00 08 bne 360 <__do_patch_instructions+0x134> 35c: 38 84 00 04 addi r4,r4,4 360: 41 9c ff a0 blt cr7,300 <__do_patch_instructions+0xd4> 364: 90 0c 00 00 stw r0,0(r12) 368: 39 20 00 00 li r9,0 36c: 7c 00 5a 64 tlbie r11,r0 370: 7c 00 04 ac hwsync 374: 2f 89 00 00 cmpwi cr7,r9,0 378: 40 9e 00 2c bne cr7,3a4 <__do_patch_instructions+0x178> 37c: 7f 86 18 40 cmplw cr7,r6,r3 380: 41 9d fe cc bgt cr7,24c <__do_patch_instructions+0x20> 384: 38 60 00 00 li r3,0 388: 4e 80 00 20 blr 38c: 90 0c 00 00 stw r0,0(r12) 390: 39 20 ff ff li r9,-1 394: 7c 00 5a 64 tlbie r11,r0 398: 7c 00 04 ac hwsync 39c: 2f 89 00 00 cmpwi cr7,r9,0 3a0: 41 9e ff dc beq cr7,37c <__do_patch_instructions+0x150> 3a4: 38 60 ff ff li r3,-1 3a8: 4e 80 00 20 blr 3ac: 39 20 00 00 li r9,0 3b0: 91 2c 00 00 stw r9,0(r12) 3b4: 7c 00 5a 64 tlbie r11,r0 3b8: 7c 00 04 ac hwsync 3bc: 4b ff ff c0 b 37c <__do_patch_instructions+0x150> 000003e8 <patch_instruction>: 3e8: 94 21 ff e0 stwu r1,-32(r1) 3ec: 90 81 00 08 stw r4,8(r1) 3f0: 48 00 00 40 b 430 
<patch_instruction+0x48> 3f4: 7c 08 02 a6 mflr r0 3f8: 90 01 00 24 stw r0,36(r1) 3fc: 93 e1 00 1c stw r31,28(r1) 400: 7f e0 00 a6 mfmsr r31 404: 7c 51 13 a6 mtspr 81,r2 408: 38 c0 00 04 li r6,4 40c: 38 81 00 08 addi r4,r1,8 410: 38 a0 00 00 li r5,0 414: 4b ff fe 19 bl 22c <__do_patch_instructions> 418: 7f e0 01 24 mtmsr r31 41c: 80 01 00 24 lwz r0,36(r1) 420: 83 e1 00 1c lwz r31,28(r1) 424: 7c 08 03 a6 mtlr r0 428: 38 21 00 20 addi r1,r1,32 42c: 4e 80 00 20 blr 430: 81 21 00 08 lwz r9,8(r1) 434: 91 23 00 00 stw r9,0(r3) 438: 7c 00 18 6c dcbst 0,r3 43c: 7c 00 04 ac hwsync 440: 7c 00 1f ac icbi 0,r3 444: 7c 00 04 ac hwsync 448: 4c 00 01 2c isync 44c: 38 60 00 00 li r3,0 450: 4b ff ff d8 b 428 <patch_instruction+0x40> 454: 38 60 ff ff li r3,-1 458: 4b ff ff d0 b 428 <patch_instruction+0x40> Christophe
Le 09/03/2023 à 19:02, Hari Bathini a écrit : > patch_instruction() entails setting up pte, patching the instruction, > clearing the pte and flushing the tlb. If multiple instructions need > to be patched, every instruction would have to go through the above > drill unnecessarily. Instead, introduce function patch_instructions() > that patches multiple instructions at one go while setting up the pte, > clearing the pte and flushing the tlb only once per page range of > instructions. Observed ~5X improvement in speed of execution using > patch_instructions() over patch_instructions(), when more instructions > are to be patched. > > Signed-off-by: Hari Bathini <hbathini@linux.ibm.com> > --- > arch/powerpc/include/asm/code-patching.h | 1 + > arch/powerpc/lib/code-patching.c | 151 ++++++++++++++++------- > 2 files changed, 106 insertions(+), 46 deletions(-) > > diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h > index 3f881548fb61..059fc4fe700e 100644 > --- a/arch/powerpc/include/asm/code-patching.h > +++ b/arch/powerpc/include/asm/code-patching.h > @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, > int patch_branch(u32 *addr, unsigned long target, int flags); > int patch_instruction(u32 *addr, ppc_inst_t instr); > int raw_patch_instruction(u32 *addr, ppc_inst_t instr); > +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len); > > static inline unsigned long patch_site_addr(s32 *site) > { > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c > index b00112d7ad46..33857b9b53de 100644 > --- a/arch/powerpc/lib/code-patching.c > +++ b/arch/powerpc/lib/code-patching.c > @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr) > flush_tlb_kernel_range(addr, addr + PAGE_SIZE); > } > > -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) > +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, size_t len) Some time ago we did a huge work to clean up use of u32 as code versus ppc_inst_t. Please carefully audit all places where you use u32 instead of ppc_inst_t. If you do so, don't use anymore the word 'instruction' in the function name. > { > - int err; > - u32 *patch_addr; > - unsigned long text_poke_addr; > - pte_t *pte; > - unsigned long pfn = get_patch_pfn(addr); > - struct mm_struct *patching_mm; > - struct mm_struct *orig_mm; > + struct mm_struct *patching_mm, *orig_mm; This change is cosmetic and not functionnaly liked to the patch. > + unsigned long text_poke_addr, pfn; > + u32 *patch_addr, *end, *pend; > + ppc_inst_t instr; > spinlock_t *ptl; > + int ilen, err; > + pte_t *pte; Why move this declaration ? 
> > patching_mm = __this_cpu_read(cpu_patching_context.mm); > text_poke_addr = __this_cpu_read(cpu_patching_context.addr); > - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > > pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); > if (!pte) > return -ENOMEM; > > - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); > + end = (void *)addr + len; > + do { > + pfn = get_patch_pfn(addr); > + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); > > - /* order PTE update before use, also serves as the hwsync */ > - asm volatile("ptesync": : :"memory"); > - > - /* order context switch after arbitrary prior code */ > - isync(); > - > - orig_mm = start_using_temp_mm(patching_mm); > - > - err = __patch_instruction(addr, instr, patch_addr); > + /* order PTE update before use, also serves as the hwsync */ > + asm volatile("ptesync": : :"memory"); > > - /* hwsync performed by __patch_instruction (sync) if successful */ > - if (err) > - mb(); /* sync */ > + /* order context switch after arbitrary prior code */ > + isync(); > + > + orig_mm = start_using_temp_mm(patching_mm); > + > + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); > + if (end < pend) > + pend = end; > + > + while (addr < pend) { > + instr = ppc_inst_read(code); > + ilen = ppc_inst_len(instr); > + err = __patch_instruction(addr, instr, patch_addr); > + /* hwsync performed by __patch_instruction (sync) if successful */ As you aim at optimising the loops, you should before cache flushed outside this loop, with flush_dcache_range() and invalidate_dcache_range(). > + if (err) { > + mb(); /* sync */ > + break; > + } > + > + patch_addr = (void *)patch_addr + ilen; > + addr = (void *)addr + ilen; So patch_addr and addr are u32*, ilen is either 4 or 8, and you cast to void to do the math ? That looks odd and error prone. 
> + if (!fill_inst) > + code = (void *)code + ilen; > + } > > - /* context synchronisation performed by __patch_instruction (isync or exception) */ > - stop_using_temp_mm(patching_mm, orig_mm); > + /* context synchronisation performed by __patch_instruction (isync or exception) */ > + stop_using_temp_mm(patching_mm, orig_mm); > > - pte_clear(patching_mm, text_poke_addr, pte); > - /* > - * ptesync to order PTE update before TLB invalidation done > - * by radix__local_flush_tlb_page_psize (in _tlbiel_va) > - */ > - local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); > + pte_clear(patching_mm, text_poke_addr, pte); > + /* > + * ptesync to order PTE update before TLB invalidation done > + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) > + */ > + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); > + if (err) > + break; > + } while (addr < end); > > pte_unmap_unlock(pte, ptl); > > return err; > } > > -static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) > +static int __do_patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len) > { > - int err; > - u32 *patch_addr; > - unsigned long text_poke_addr; > + unsigned long text_poke_addr, pfn; > + u32 *patch_addr, *end, *pend; > + ppc_inst_t instr; > + int ilen, err; > pte_t *pte; > - unsigned long pfn = get_patch_pfn(addr); > > text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; > - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > - > pte = __this_cpu_read(cpu_patching_context.pte); > - __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); > - /* See ptesync comment in radix__set_pte_at() */ > - if (radix_enabled()) > - asm volatile("ptesync": : :"memory"); > > - err = __patch_instruction(addr, instr, patch_addr); > + end = (void *)addr + len; > + do { > + pfn = get_patch_pfn(addr); > + __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); > + /* See ptesync comment in radix__set_pte_at() */ > + if (radix_enabled()) > + asm volatile("ptesync": : :"memory"); > + > + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); > + if (end < pend) > + pend = end; > + > + while (addr < pend) { > + instr = ppc_inst_read(code); > + ilen = ppc_inst_len(instr); > + err = __patch_instruction(addr, instr, patch_addr); > + if (err) > + break; > + > + patch_addr = (void *)patch_addr + ilen; > + addr = (void *)addr + ilen; > + if (!fill_inst) > + code = (void *)code + ilen; > + } > > - pte_clear(&init_mm, text_poke_addr, pte); > - flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); > + pte_clear(&init_mm, text_poke_addr, pte); > + flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); > + if (err) > + break; > + } while (addr < end); > > return err; > } > @@ -369,15 +409,34 @@ int patch_instruction(u32 *addr, ppc_inst_t instr) > > local_irq_save(flags); > if (mm_patch_enabled()) > - err = __do_patch_instruction_mm(addr, instr); > + err = __do_patch_instructions_mm(addr, (u32 *)&instr, false, ppc_inst_len(instr)); Do not mix-up u32* and ppc_inst_t, they are not equivalent, do not cast one to the other. > else > - err = __do_patch_instruction(addr, instr); > + err = __do_patch_instructions(addr, (u32 *)&instr, false, ppc_inst_len(instr)); Same. > local_irq_restore(flags); > > return err; > } > NOKPROBE_SYMBOL(patch_instruction); > > +/* > + * Patch 'addr' with 'len' bytes of instructions from 'code'. 
> + */ > +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len) > +{ > + unsigned long flags; > + int err; > + > + local_irq_save(flags); Won't the irq lock be a bit long ? > + if (mm_patch_enabled()) > + err = __do_patch_instructions_mm(addr, code, fill_inst, len); > + else > + err = __do_patch_instructions(addr, code, fill_inst, len); > + local_irq_restore(flags); > + > + return err; > +} > +NOKPROBE_SYMBOL(patch_instructions); > + > int patch_branch(u32 *addr, unsigned long target, int flags) > { > ppc_inst_t instr;
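To make the review point above about hoisting the cache maintenance more
concrete, here is a rough sketch of what the inner loop could do instead:
plain stores for the whole chunk, then a single data-cache flush of the
written alias and a single instruction-cache flush of the target range. This
only illustrates Christophe's suggestion and is not code from the series; the
helper name is made up, fill_inst/prefixed-instruction handling and error
paths are omitted, and whether flush_icache_range() or invalidate_dcache_range()
is the right second step is exactly the kind of detail a respin would have to
settle.

#include <linux/string.h>
#include <asm/cacheflush.h>

/*
 * Hypothetical helper: write all instructions with ordinary stores through
 * the temporary writable mapping, then do the cache maintenance once for
 * the whole range instead of a sync/icbi sequence per instruction.
 */
static int patch_u32s_then_flush(u32 *exec_addr, u32 *patch_addr,
				 const u32 *code, size_t len)
{
	/* Plain stores, no per-instruction barriers. */
	memcpy(patch_addr, code, len);

	/* Push the new text out of the data cache (written via the alias)... */
	flush_dcache_range((unsigned long)patch_addr,
			   (unsigned long)patch_addr + len);

	/* ...and invalidate stale instruction-cache lines at the address
	 * the kernel will actually execute from. */
	flush_icache_range((unsigned long)exec_addr,
			   (unsigned long)exec_addr + len);

	return 0;
}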
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 3f881548fb61..059fc4fe700e 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, int patch_branch(u32 *addr, unsigned long target, int flags); int patch_instruction(u32 *addr, ppc_inst_t instr); int raw_patch_instruction(u32 *addr, ppc_inst_t instr); +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len); static inline unsigned long patch_site_addr(s32 *site) { diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index b00112d7ad46..33857b9b53de 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr) flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, size_t len) { - int err; - u32 *patch_addr; - unsigned long text_poke_addr; - pte_t *pte; - unsigned long pfn = get_patch_pfn(addr); - struct mm_struct *patching_mm; - struct mm_struct *orig_mm; + struct mm_struct *patching_mm, *orig_mm; + unsigned long text_poke_addr, pfn; + u32 *patch_addr, *end, *pend; + ppc_inst_t instr; spinlock_t *ptl; + int ilen, err; + pte_t *pte; patching_mm = __this_cpu_read(cpu_patching_context.mm); text_poke_addr = __this_cpu_read(cpu_patching_context.addr); - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); if (!pte) return -ENOMEM; - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + end = (void *)addr + len; + do { + pfn = get_patch_pfn(addr); + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); - /* order PTE update before use, also serves as the hwsync */ - asm volatile("ptesync": : :"memory"); - - /* order context switch after arbitrary prior code */ - isync(); - - orig_mm = start_using_temp_mm(patching_mm); - - err = __patch_instruction(addr, instr, patch_addr); + /* order PTE update before use, also serves as the hwsync */ + asm volatile("ptesync": : :"memory"); - /* hwsync performed by __patch_instruction (sync) if successful */ - if (err) - mb(); /* sync */ + /* order context switch after arbitrary prior code */ + isync(); + + orig_mm = start_using_temp_mm(patching_mm); + + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); + if (end < pend) + pend = end; + + while (addr < pend) { + instr = ppc_inst_read(code); + ilen = ppc_inst_len(instr); + err = __patch_instruction(addr, instr, patch_addr); + /* hwsync performed by __patch_instruction (sync) if successful */ + if (err) { + mb(); /* sync */ + break; + } + + patch_addr = (void *)patch_addr + ilen; + addr = (void *)addr + ilen; + if (!fill_inst) + code = (void *)code + ilen; + } - /* context synchronisation performed by __patch_instruction (isync or exception) */ - stop_using_temp_mm(patching_mm, orig_mm); + /* context synchronisation performed by __patch_instruction (isync or exception) */ + stop_using_temp_mm(patching_mm, orig_mm); - pte_clear(patching_mm, text_poke_addr, pte); - /* - * ptesync to order PTE update before TLB invalidation done - * by radix__local_flush_tlb_page_psize (in _tlbiel_va) - */ - local_flush_tlb_page_psize(patching_mm, 
text_poke_addr, mmu_virtual_psize); + pte_clear(patching_mm, text_poke_addr, pte); + /* + * ptesync to order PTE update before TLB invalidation done + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) + */ + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + if (err) + break; + } while (addr < end); pte_unmap_unlock(pte, ptl); return err; } -static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) +static int __do_patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len) { - int err; - u32 *patch_addr; - unsigned long text_poke_addr; + unsigned long text_poke_addr, pfn; + u32 *patch_addr, *end, *pend; + ppc_inst_t instr; + int ilen, err; pte_t *pte; - unsigned long pfn = get_patch_pfn(addr); text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); - pte = __this_cpu_read(cpu_patching_context.pte); - __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); - /* See ptesync comment in radix__set_pte_at() */ - if (radix_enabled()) - asm volatile("ptesync": : :"memory"); - err = __patch_instruction(addr, instr, patch_addr); + end = (void *)addr + len; + do { + pfn = get_patch_pfn(addr); + __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + /* See ptesync comment in radix__set_pte_at() */ + if (radix_enabled()) + asm volatile("ptesync": : :"memory"); + + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); + if (end < pend) + pend = end; + + while (addr < pend) { + instr = ppc_inst_read(code); + ilen = ppc_inst_len(instr); + err = __patch_instruction(addr, instr, patch_addr); + if (err) + break; + + patch_addr = (void *)patch_addr + ilen; + addr = (void *)addr + ilen; + if (!fill_inst) + code = (void *)code + ilen; + } - pte_clear(&init_mm, text_poke_addr, pte); - flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); + pte_clear(&init_mm, text_poke_addr, pte); + flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); + if (err) + break; + } while (addr < end); return err; } @@ -369,15 +409,34 @@ int patch_instruction(u32 *addr, ppc_inst_t instr) local_irq_save(flags); if (mm_patch_enabled()) - err = __do_patch_instruction_mm(addr, instr); + err = __do_patch_instructions_mm(addr, (u32 *)&instr, false, ppc_inst_len(instr)); else - err = __do_patch_instruction(addr, instr); + err = __do_patch_instructions(addr, (u32 *)&instr, false, ppc_inst_len(instr)); local_irq_restore(flags); return err; } NOKPROBE_SYMBOL(patch_instruction); +/* + * Patch 'addr' with 'len' bytes of instructions from 'code'. + */ +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len) +{ + unsigned long flags; + int err; + + local_irq_save(flags); + if (mm_patch_enabled()) + err = __do_patch_instructions_mm(addr, code, fill_inst, len); + else + err = __do_patch_instructions(addr, code, fill_inst, len); + local_irq_restore(flags); + + return err; +} +NOKPROBE_SYMBOL(patch_instructions); + int patch_branch(u32 *addr, unsigned long target, int flags) { ppc_inst_t instr;
patch_instruction() entails setting up pte, patching the instruction,
clearing the pte and flushing the tlb. If multiple instructions need
to be patched, every instruction would have to go through the above
drill unnecessarily. Instead, introduce function patch_instructions()
that patches multiple instructions at one go while setting up the pte,
clearing the pte and flushing the tlb only once per page range of
instructions. Observed ~5X improvement in speed of execution using
patch_instructions() over patch_instruction(), when more instructions
are to be patched.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c         | 151 ++++++++++++++++-------
 2 files changed, 106 insertions(+), 46 deletions(-)
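For context on how the new interface is meant to be used, here is a
hypothetical caller. Only the patch_instructions() prototype and the
fill_inst semantics come from the patch above; the helper, its parameters and
the use of PPC_RAW_TRAP() as the padding instruction are illustrative
assumptions, loosely modelled on what a bpf_prog_pack user would need: copy a
block of instructions, then pad the rest of the area with one repeated trap.

#include <asm/code-patching.h>
#include <asm/ppc-opcode.h>

/*
 * Hypothetical caller: 'dst' points into an RX text area (e.g. a
 * bpf_prog_pack chunk), 'src' is the plain-memory image.  'size' and
 * 'pad' are multiples of the instruction size.
 */
static int copy_and_pad_text(u32 *dst, u32 *src, size_t size, size_t pad)
{
	u32 trap = PPC_RAW_TRAP();
	int err;

	/* fill_inst == false: 'code' advances as instructions are written,
	 * i.e. copy 'size' bytes of code from 'src' to 'dst'. */
	err = patch_instructions(dst, src, false, size);
	if (err)
		return err;

	/* fill_inst == true: 'code' is not advanced, so the same trap
	 * instruction is written over the remaining 'pad' bytes. */
	return patch_instructions((void *)dst + size, &trap, true, pad);
}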