From patchwork Thu Sep 16 06:31:43 2010
X-Patchwork-Submitter: Miao Xie <miaox@cn.fujitsu.com>
X-Patchwork-Id: 184602
Message-ID: <4C91B9CF.2020401@cn.fujitsu.com>
Date: Thu, 16 Sep 2010 14:31:43 +0800
From: Miao Xie <miaox@cn.fujitsu.com>
Reply-To: miaox@cn.fujitsu.com
To: Andi Kleen, Andrew Morton, Ingo Molnar, "Theodore Ts'o", Chris Mason
CC: Linux Kernel, Linux Btrfs, Linux Ext4
Subject: [PATCH] x86_64/lib: improve the performance of memmove
X-Mailing-List: linux-btrfs@vger.kernel.org

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..4e64a87 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -55,6 +55,7 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 void *memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
+extern void *__memcpy_bwd(void *dest, const void *src, size_t count);
 void *memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf07..ab241df 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_bwd_$(BITS).o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
diff --git a/arch/x86/lib/memcpy_bwd_64.S b/arch/x86/lib/memcpy_bwd_64.S
new file mode 100644
index 0000000..ca894e3
--- /dev/null
+++ b/arch/x86/lib/memcpy_bwd_64.S
@@ -0,0 +1,137 @@
+/* Copyright 2010 Miao Xie */
+
+#include <linux/linkage.h>
+
+#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+
+/*
+ * __memcpy_bwd - Copy a memory block from the end to the beginning
+ *
+ * Input:
+ *	rdi destination
+ *	rsi source
+ *	rdx count
+ *
+ * Output:
+ *	rax original destination
+ */
+
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_bwd_c:
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	leaq -8(%rdi), %rdi
+	leaq -8(%rsi), %rsi
+
+	std
+
+	movq %rdx, %rcx
+	shrq $3, %rcx
+	andq $7, %rdx
+	rep movsq
+
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
+	decq %rsi
+	decq %rdi
+	movq %rdx, %rcx
+	rep movsb
+
+	cld
+	ret
+.Lmemcpy_bwd_e:
+	.previous
+
+ENTRY(__memcpy_bwd)
+	CFI_STARTPROC
+
+	movq %rdi, %rax
+
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+
+	movq %rdx, %rcx
+	shrq $6, %rcx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq %rcx
+
+	leaq -64(%rdi), %rdi
+	leaq -64(%rsi), %rsi
+
+	movq 7*8(%rsi), %r11
+	movq 6*8(%rsi), %r8
+	movq %r11, 7*8(%rdi)
+	movq %r8, 6*8(%rdi)
+
+	movq 5*8(%rsi), %r9
+	movq 4*8(%rsi), %r10
+	movq %r9, 5*8(%rdi)
+	movq %r10, 4*8(%rdi)
+
+	movq 3*8(%rsi), %r11
+	movq 2*8(%rsi), %r8
+	movq %r11, 3*8(%rdi)
+	movq %r8, 2*8(%rdi)
+
+	movq 1*8(%rsi), %r9
+	movq 0*8(%rsi), %r10
+	movq %r9, 1*8(%rdi)
+	movq %r10, 0*8(%rdi)
+
+	jnz .Lloop_64
+
+.Lhandle_tail:
+	movq %rdx, %rcx
+	andq $63, %rcx
+	shrq $3, %rcx
+	jz .Lhandle_7
+
+	.p2align 4
+.Lloop_8:
+	decq %rcx
+
+	leaq -8(%rsi), %rsi
+	leaq -8(%rdi), %rdi
+
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movq %rdx, %rcx
+	andq $7, %rcx
+	jz .Lend
+
+	.p2align 4
+.Lloop_1:
+	decq %rcx
+
+	decq %rsi
+	decq %rdi
+
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
+
+	jnz .Lloop_1
+
+.Lend:
+	ret
+	CFI_ENDPROC
+ENDPROC(__memcpy_bwd)
+
+	.section .altinstructions, "a"
+	.align 8
+	.quad __memcpy_bwd
+	.quad .Lmemcpy_bwd_c
+	.word X86_FEATURE_REP_GOOD
+
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+	.previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..bd4cbcc 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,16 @@
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
-	if (dest < src) {
+	if (dest < src || dest - src >= count)
 		return memcpy(dest, src, count);
-	} else {
+	else if (count <= 64) {
 		char *p = dest + count;
 		const char *s = src + count;
 		while (count--)
 			*--p = *--s;
-	}
-	return dest;
+
+		return dest;
+	} else
+		return __memcpy_bwd(dest, src, count);
 }
 EXPORT_SYMBOL(memmove);
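
For reference, the overlap rule that the new memmove() test encodes can be
exercised from userspace. The sketch below is not part of the patch:
ref_memmove() is an illustrative name, and it only dispatches on the same
"dest < src || dest - src >= count" condition as above (the kernel version
additionally splits the backward path into a byte loop for count <= 64 and
__memcpy_bwd for larger copies). It assumes nothing beyond a hosted C
environment.

/* Userspace illustration only; ref_memmove() is a made-up helper name. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static void *ref_memmove(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if (d < s || (size_t)(d - s) >= count) {
		/* dest is below src, or the regions do not overlap in the
		 * direction that matters: a plain forward copy is safe */
		memcpy(d, s, count);
	} else {
		/* dest overlaps the tail of src: copy from the end back */
		while (count--)
			d[count] = s[count];
	}
	return dest;
}

int main(void)
{
	char buf[16] = "0123456789";

	/* overlapping move toward higher addresses (dest > src):
	 * only a backward copy preserves the source bytes */
	ref_memmove(buf + 2, buf, 8);
	assert(memcmp(buf, "0101234567", 10) == 0);

	/* overlapping move toward lower addresses (dest < src):
	 * the forward memcpy() path is sufficient */
	ref_memmove(buf, buf + 2, 8);
	assert(memcmp(buf, "01234567", 8) == 0);

	puts("ok");
	return 0;
}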
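
The new memcpy_bwd_64.S follows the usual backward-copy pattern: advance both
pointers to the end of the buffers, copy 64-byte blocks toward lower
addresses, then 8-byte words, then the remaining 0-7 bytes; on
X86_FEATURE_REP_GOOD CPUs the .altinstructions entry substitutes the
std + rep movsq/movsb variant instead. Below is a rough portable-C rendering
of that strategy, for illustration only: memcpy_bwd_sketch() is a made-up
name, it does not unroll the word loop or use string instructions, and it is
not the code the patch adds.

/* Userspace illustration only; memcpy_bwd_sketch() is a made-up name. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void *memcpy_bwd_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = (unsigned char *)dest + count;
	const unsigned char *s = (const unsigned char *)src + count;
	uint64_t w;

	/* 8-byte backward copies: load the whole word before storing it,
	 * just as the assembly does with movq through a scratch register */
	while (count >= 8) {
		d -= 8;
		s -= 8;
		count -= 8;
		memcpy(&w, s, 8);	/* unaligned-safe load  */
		memcpy(d, &w, 8);	/* unaligned-safe store */
	}

	/* trailing 0..7 bytes, one at a time */
	while (count--)
		*--d = *--s;

	return dest;
}

int main(void)
{
	char buf[32] = "abcdefghijklmnopqrstuvwxyz";

	/* shift 20 bytes up by 3 within the same buffer (dest > src) */
	memcpy_bwd_sketch(buf + 3, buf, 20);
	assert(memcmp(buf + 3, "abcdefghijklmnopqrst", 20) == 0);

	puts("ok");
	return 0;
}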