diff mbox series

[RFC] arm64: no need to make pmr visible immediately

Message ID 20190415042434.46465-1-liwei391@huawei.com (mailing list archive)
State RFC
Headers show
Series [RFC] arm64: no need to make pmr visible immediately | expand

Commit Message

liwei (GF) April 15, 2019, 4:24 a.m. UTC
We don't need to make pmr visible immediately when enabling irq.
After removing the "dsb", i did a benchmark and the result shows there
is nearly no performance degradation (<1%) when enabling pseudo nmi:

	Processor: ARMv8 Cortex-A72 (64 Cores), Motherboard: Huawei BC11SPCD (1.58 BIOS), Memory: 16 x 32 GB DDR4-2400MT/s Samsung M393A4K40BB1-CRC, Disk: 4 x 300GB Seagate ST300MM0048 + 2 x 4001GB HUS726040ALA610, Graphics: Huawei Hi1710 [iBMC Intelligent Management chip w/VGA support]

	OS: EulerOS 2.0, Kernel: 4.19.33-test+ (aarch64), Compiler: GCC 7.3.0, File-System: ext4, Screen Resolution: 800x600


    Stream 2013-01-17
    Type: Copy
    MB/s > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 87220 |=====================================
    4.19.33-test+ no nmi #1 ..... 88203 |=====================================
    4.19.33-test+ no nmi #2 ..... 87590 |=====================================
    4.19.33-test+ nmi no dsb #2 . 87003 |====================================


    Stream 2013-01-17
    Type: Scale
    MB/s > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 89157 |====================================
    4.19.33-test+ no nmi #1 ..... 90970 |=====================================
    4.19.33-test+ no nmi #2 ..... 90089 |=====================================
    4.19.33-test+ nmi no dsb #2 . 89601 |====================================


    Stream 2013-01-17
    Type: Triad
    MB/s > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 89969 |====================================
    4.19.33-test+ no nmi #1 ..... 92012 |=====================================
    4.19.33-test+ no nmi #2 ..... 91287 |=====================================
    4.19.33-test+ nmi no dsb #2 . 90475 |====================================


    Stream 2013-01-17
    Type: Add
    MB/s > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 89882 |====================================
    4.19.33-test+ no nmi #1 ..... 91343 |=====================================
    4.19.33-test+ no nmi #2 ..... 90837 |=====================================
    4.19.33-test+ nmi no dsb #2 . 90371 |=====================================


    7-Zip Compression 16.02
    Compress Speed Test
    MIPS > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 56894 |=====================================
    4.19.33-test+ no nmi #1 ..... 56558 |=====================================
    4.19.33-test+ no nmi #2 ..... 54814 |====================================
    4.19.33-test+ nmi no dsb #2 . 55703 |====================================


    C-Ray 1.1
    Total Time - 4K, 16 Rays Per Pixel
    Seconds < Lower Is Better
    4.19.33-test+ nmi no dsb #1 . 29.19 |===================================
    4.19.33-test+ no nmi #1 ..... 30.70 |=====================================
    4.19.33-test+ no nmi #2 ..... 29.73 |====================================
    4.19.33-test+ nmi no dsb #2 . 29.55 |====================================


    XZ Compression 5.2.4
    Compressing ubuntu-16.04.3-server-i386.img, Compression Level 9
    Seconds < Lower Is Better
    4.19.33-test+ nmi no dsb #1 . 159 |=======================================
    4.19.33-test+ no nmi #1 ..... 158 |=======================================
    4.19.33-test+ no nmi #2 ..... 158 |=======================================
    4.19.33-test+ nmi no dsb #2 . 158 |=======================================


    Redis 4.0.8
    Test: LPOP
    Requests Per Second > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 360331 |====================================
    4.19.33-test+ no nmi #1 ..... 352945 |===================================
    4.19.33-test+ no nmi #2 ..... 357336 |====================================
    4.19.33-test+ nmi no dsb #2 . 358403 |====================================


    Redis 4.0.8
    Test: SADD
    Requests Per Second > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 444682 |====================================
    4.19.33-test+ no nmi #1 ..... 420772 |==================================
    4.19.33-test+ no nmi #2 ..... 433746 |===================================
    4.19.33-test+ nmi no dsb #2 . 420890 |==================================


    Redis 4.0.8
    Test: LPUSH
    Requests Per Second > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 361561 |====================================
    4.19.33-test+ no nmi #1 ..... 361402 |====================================
    4.19.33-test+ no nmi #2 ..... 347875 |===================================
    4.19.33-test+ nmi no dsb #2 . 355313 |===================================


    Redis 4.0.8
    Test: GET
    Requests Per Second > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 490271 |====================================
    4.19.33-test+ no nmi #1 ..... 479466 |===================================
    4.19.33-test+ no nmi #2 ..... 481711 |===================================
    4.19.33-test+ nmi no dsb #2 . 495635 |====================================


    Redis 4.0.8
    Test: SET
    Requests Per Second > Higher Is Better
    4.19.33-test+ nmi no dsb #1 . 393653 |===================================
    4.19.33-test+ no nmi #1 ..... 392973 |===================================
    4.19.33-test+ no nmi #2 ..... 391346 |===================================
    4.19.33-test+ nmi no dsb #2 . 404225 |====================================


Signed-off-by: Wei Li <liwei391@huawei.com>
Link: https://www.spinics.net/lists/arm-kernel/msg719005.html
---
 arch/arm64/include/asm/irqflags.h | 12 ++++--------
 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kernel/entry.S         |  2 --
 arch/arm64/kvm/hyp/switch.c       |  1 -
 4 files changed, 4 insertions(+), 12 deletions(-)

Comments

Marc Zyngier April 15, 2019, 8:14 a.m. UTC | #1
Hi Wei,

On 15/04/2019 05:24, Wei Li wrote:
> We don't need to make pmr visible immediately when enabling irq.
> After removing the "dsb", i did a benchmark and the result shows there
> is nearly no performance degradation (<1%) when enabling pseudo nmi:

[...]

Almost no performance degradation, and also almost no architectural
compliance. Please look at what the architecture says, quoted here for
your convenience:

<quote>
9.1.6 Observability of the effects of accesses to the GIC registers

[...]
• Architectural execution of a DSB instruction guarantees that
— The last value written to ICC_PMR_EL1 or GICC_PMR is observed by the
associated Redistributor.
</quote>

Of course, removing the DSB removes the overhead. But it also
potentially removes any effect of the write to PMR. It happens to work
on your particular combination of CPU interface and redistributor, but
it doesn't mean it works in the general case.

A valid implementation could leave the PMR value programmed in the CPU
interface, and delay signaling the redistributor until a DSB occurs for
another reason. The net effect would be that interrupts would be delayed
by an arbitrary amount of time, and I'm sure someone will shout at us
for doing so because their machine's behaviour has become erratic or
even deadlock.

At the end of the day, the pseudo-NMI stuff is a debug feature. If what
you care about is speed, disable it.

Thanks,

	M.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index 7bc6a79f31c4..a670996a6710 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -41,10 +41,8 @@ 
 static inline void arch_local_irq_enable(void)
 {
 	asm volatile(ALTERNATIVE(
-		"msr	daifclr, #2		// arch_local_irq_enable\n"
-		"nop",
-		"msr_s  " __stringify(SYS_ICC_PMR_EL1) ",%0\n"
-		"dsb	sy",
+		"msr	daifclr, #2		// arch_local_irq_enable",
+		"msr_s  " __stringify(SYS_ICC_PMR_EL1) ",%0",
 		ARM64_HAS_IRQ_PRIO_MASKING)
 		:
 		: "r" ((unsigned long) GIC_PRIO_IRQON)
@@ -113,10 +111,8 @@  static inline unsigned long arch_local_irq_save(void)
 static inline void arch_local_irq_restore(unsigned long flags)
 {
 	asm volatile(ALTERNATIVE(
-			"msr	daif, %0\n"
-			"nop",
-			"msr_s	" __stringify(SYS_ICC_PMR_EL1) ", %0\n"
-			"dsb	sy",
+			"msr	daif, %0",
+			"msr_s	" __stringify(SYS_ICC_PMR_EL1) ", %0",
 			ARM64_HAS_IRQ_PRIO_MASKING)
 		:
 		: "r" ((int)flags)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 17541276fcc7..8b912b372378 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -485,7 +485,6 @@  static inline void kvm_arm_vhe_guest_enter(void)
 	 */
 	if (system_uses_irq_prio_masking()) {
 		gic_write_pmr(GIC_PRIO_IRQON);
-		dsb(sy);
 	}
 }
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 42ca7d43e040..45eeaf237383 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -279,8 +279,6 @@  alternative_else_nop_endif
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
 	msr_s	SYS_ICC_PMR_EL1, x20
-	/* Ensure priority change is seen by redistributor */
-	dsb	sy
 alternative_else_nop_endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index c6def1fd7e91..9990aca5191e 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -550,7 +550,6 @@  int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
 	 */
 	if (system_uses_irq_prio_masking()) {
 		gic_write_pmr(GIC_PRIO_IRQON);
-		dsb(sy);
 	}
 
 	vcpu = kern_hyp_va(vcpu);