From patchwork Fri Jan 12 11:53:16 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: chenzhou X-Patchwork-Id: 10160345 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id 70CCF602A7 for ; Fri, 12 Jan 2018 11:53:59 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 532F0289A1 for ; Fri, 12 Jan 2018 11:53:59 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 45B9C289A7; Fri, 12 Jan 2018 11:53:59 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-4.2 required=2.0 tests=BAYES_00,DKIM_SIGNED, DKIM_VALID,RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from bombadil.infradead.org (bombadil.infradead.org [65.50.211.133]) (using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 840E6289A1 for ; Fri, 12 Jan 2018 11:53:58 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=lists.infradead.org; s=bombadil.20170209; h=Sender: Content-Transfer-Encoding:Content-Type:Cc:List-Subscribe:List-Help:List-Post: List-Archive:List-Unsubscribe:List-Id:MIME-Version:Message-ID:Date:Subject:To :From:Reply-To:Content-ID:Content-Description:Resent-Date:Resent-From: Resent-Sender:Resent-To:Resent-Cc:Resent-Message-ID:In-Reply-To:References: List-Owner; bh=jZOwYTaO1oOrKquFsxkMWTze76pD2hFYRmsV2gNCjo0=; b=ntt/0KWmbtbU25 wEGJ/UowyfvcmYCLi4GkHcPEIqxPye45H+FhXQ1+Z7a9edR6wwhSZqdj4qcwDtoRqLjI13i9+55Yn u9WIa5AcYwWV/zw3bRF1SBli5p8eXviXKvNlDUBgB5+akFy26UcCaqlLryxIKKdbxNK+gndcK1+t3 7Q2LJIcInS1IotIN7JWm4av2YA8kHiAuz2jJugV0rm5WlmyswvGK020rckhz6Dn+J0+hWsRrDQysh 
xRgCKPxBAoALYLaEhx7GWhi8f0Jhy4elzuYSltv7+7mf1yjtY+ItlSlhkFHqo7Qeg0VrZ0m7qNqu8 kRtQs8SDjWGDWWIHus+A==; Received: from localhost ([127.0.0.1] helo=bombadil.infradead.org) by bombadil.infradead.org with esmtp (Exim 4.89 #1 (Red Hat Linux)) id 1eZxuH-0000Xy-FF; Fri, 12 Jan 2018 11:53:57 +0000 Received: from [45.249.212.32] (helo=huawei.com) by bombadil.infradead.org with esmtps (Exim 4.89 #1 (Red Hat Linux)) id 1eZxuE-0000Vp-Gp for linux-arm-kernel@lists.infradead.org; Fri, 12 Jan 2018 11:53:56 +0000 Received: from DGGEMS414-HUB.china.huawei.com (unknown [172.30.72.60]) by Forcepoint Email with ESMTP id 15286D52E549B; Fri, 12 Jan 2018 19:53:21 +0800 (CST) Received: from localhost.localdomain (10.175.101.84) by DGGEMS414-HUB.china.huawei.com (10.3.19.214) with Microsoft SMTP Server id 14.3.361.1; Fri, 12 Jan 2018 19:53:21 +0800 From: Chen Zhou To: , Subject: [PATCH] arm64: support do_csum with neon Date: Fri, 12 Jan 2018 19:53:16 +0800 Message-ID: <1515757996-4675-1-git-send-email-chenzhou10@huawei.com> X-Mailer: git-send-email 1.6.0.2 MIME-Version: 1.0 X-Originating-IP: [10.175.101.84] X-CFilter-Loop: Reflected X-CRM114-Version: 20100106-BlameMichelson ( TRE 0.8.0 (BSD) ) MR-646709E3 X-CRM114-CacheID: sfid-20180112_035354_774070_1874D05A X-CRM114-Status: GOOD ( 12.41 ) X-BeenThere: linux-arm-kernel@lists.infradead.org X-Mailman-Version: 2.1.21 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: linux-arm-kernel@lists.infradead.org Sender: "linux-arm-kernel" Errors-To: linux-arm-kernel-bounces+patchwork-linux-arm=patchwork.kernel.org@lists.infradead.org X-Virus-Scanned: ClamAV using ClamSMTP On arm64 little endian such as the Cortex-A57, the neon based implementation performance increases by about 70% when len is greater than 512. 
Signed-off-by: Chen Zhou
---
Review changes relative to the posted version of do_csum.S:
 - Load the leading odd byte and the trailing byte with ldrb. The posted
   code used an 8-byte "ldr x6", which reads up to 7 bytes beyond the
   buffer and, on big-endian, masks out the wrong byte.
 - Stop using v8-v15 in the 96-byte loop; they are callee-saved under
   AAPCS64 and were overwritten without being saved. v16-v31 are used
   instead.
 - Apply the end-around carry after the vector sum with adds/cinc rather
   than a signed cmp/b.ge.
 - Drop masks that repeat the zero-extension already done by ldr w/ldrh,
   replace the literal-pool load of 0xffff with mov, and use rev16 for
   the final byte swap.
 - Fix the "Optmized" typo and the bit-range comment on the lsr #48 line.
 - The two #include operands were missing from the copy under review;
   <linux/linkage.h> and <asm/assembler.h> are assumed. Please confirm.
 - The blob id on the do_csum.S "index" line below is the one from the
   posted version and is stale; regenerate with git format-patch.
 - NOTE(review): nothing visible in this patch saves or restores the
   FP/SIMD register state around the NEON code. Please confirm how
   callers are expected to handle that before this is merged.

 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   1 +
 arch/arm64/lib/do_csum.S          | 168 ++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 arch/arm64/lib/do_csum.S

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 09f6533..e300782 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
 }
 #define csum_fold csum_fold
 
+#define do_csum do_csum
+extern unsigned int do_csum(const unsigned char *, size_t);
+
 static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 {
 	__uint128_t tmp;
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 1a811ec..5b6aa34 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,6 +3,7 @@ lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 		   strchr.o strrchr.o
+lib-y += do_csum.o
 
 # Tell the compiler to treat all general purpose registers as
 # callee-saved, which allows for efficient runtime patching of the bl
diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
new file mode 100644
index 0000000..8e7b486
--- /dev/null
+++ b/arch/arm64/lib/do_csum.S
@@ -0,0 +1,168 @@
+/*
+ * Optimized version of the standard do_csum() function
+ *
+ * Parameters:
+ *	x0 - address of buffer to checksum (const unsigned char *)
+ *	x1 - length of the buffer in bytes; if its low 32 bits are zero or
+ *	     negative (as a signed int), nothing is summed and 0 is returned
+ * Returns:
+ *	x0 - the 16-bit folded checksum of the buffer
+ *
+ * Register usage:
+ *	x4  - set to 1 when the buffer starts on an odd address
+ *	x5  - running sum
+ *	x13 - 0xffff, used as a 16-bit mask
+ *	v0-v7, v16-v31 - vector accumulators and scratch; v8-v15, which
+ *	     AAPCS64 makes callee-saved, are left untouched
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+	mov	x13, #0xffff
+	mov	x4, xzr
+	mov	x5, xzr
+	movi	v0.16b, #0
+
+	// len is zero or negative
+	cmp	w1, #0
+	b.le	out
+
+	tbz	x0, #0, addr_not_odd
+
+	// addr is odd: sum one byte so the rest starts on an even address
+	mov	x4, #1
+	ldrb	w6, [x0], #1
+#ifndef __AARCH64EB__
+	lsl	x6, x6, #8
+#endif
+	add	x5, x5, x6
+	sub	x1, x1, #1
+
+addr_not_odd:
+	cmp	x1, #32
+	b.lt	len_4
+	cmp	x1, #192
+	b.lt	do_loop_16
+
+	// len >= 192: preload six vectors, then sum 96 bytes per iteration
+	ldp	q1, q0, [x0], #32
+	ldp	q3, q2, [x0], #32
+	ldp	q5, q4, [x0], #32
+	sub	x1, x1, #96
+
+do_loop_96:
+	ldp	q7, q6, [x0], #32
+	ldp	q17, q16, [x0], #32
+	ldp	q19, q18, [x0], #32
+
+	// The accumulators are re-read as 16-bit lanes on every pass:
+	// splitting a 32-bit lane into its two halfwords and adding them
+	// leaves the sum unchanged modulo 0xffff, so lanes never overflow
+	uaddl	v20.4s, v0.4h, v6.4h
+	uaddl2	v21.4s, v0.8h, v6.8h
+
+	uaddl	v22.4s, v1.4h, v7.4h
+	uaddl2	v23.4s, v1.8h, v7.8h
+
+	uaddl	v24.4s, v2.4h, v16.4h
+	uaddl2	v25.4s, v2.8h, v16.8h
+
+	uaddl	v26.4s, v3.4h, v17.4h
+	uaddl2	v27.4s, v3.8h, v17.8h
+
+	uaddl	v28.4s, v4.4h, v18.4h
+	uaddl2	v29.4s, v4.8h, v18.8h
+
+	uaddl	v30.4s, v5.4h, v19.4h
+	uaddl2	v31.4s, v5.8h, v19.8h
+
+	add	v0.4s, v20.4s, v21.4s
+	add	v1.4s, v22.4s, v23.4s
+	add	v2.4s, v24.4s, v25.4s
+	add	v3.4s, v26.4s, v27.4s
+	add	v4.4s, v28.4s, v29.4s
+	add	v5.4s, v30.4s, v31.4s
+
+	sub	x1, x1, #96
+	cmp	x1, #96
+	b.ge	do_loop_96
+
+	// collapse the six accumulators into v0
+	add	v0.4s, v0.4s, v1.4s
+	add	v2.4s, v2.4s, v3.4s
+	add	v4.4s, v4.4s, v5.4s
+	add	v0.4s, v0.4s, v2.4s
+	add	v0.4s, v0.4s, v4.4s
+
+	cmp	x1, #16
+	b.lt	get_64
+
+do_loop_16:
+	ldr	q6, [x0], #16
+	uaddl	v24.4s, v0.4h, v6.4h
+	uaddl2	v25.4s, v0.8h, v6.8h
+	add	v0.4s, v24.4s, v25.4s
+	sub	x1, x1, #16
+	cmp	x1, #16
+	b.ge	do_loop_16
+
+get_64:
+	// add both halves of v0 to the scalar sum with end-around carry
+	mov	x6, v0.d[0]
+	mov	x7, v0.d[1]
+	adds	x5, x5, x6
+	cinc	x5, x5, cs
+	adds	x5, x5, x7
+	cinc	x5, x5, cs
+
+len_4:
+	cmp	x1, #4
+	b.lt	len_2
+
+	sub	x1, x1, #4
+	ldr	w6, [x0], #4		// writing w6 zero-extends into x6
+	add	x5, x5, x6
+	b	len_4
+
+len_2:
+	cmp	x1, #2
+	b.lt	len_1
+	sub	x1, x1, #2
+	ldrh	w6, [x0], #2
+	add	x5, x5, x6
+
+len_1:
+	cmp	x1, #1
+	b.lt	fold_32
+	ldrb	w6, [x0]
+#ifdef __AARCH64EB__
+	lsl	x6, x6, #8
+#endif
+	add	x5, x5, x6
+
+fold_32:
+	and	x9, x5, x13		// [15:0]
+	and	x10, x13, x5, lsr #16	// [31:16]
+	and	x11, x13, x5, lsr #32	// [47:32]
+	lsr	x12, x5, #48		// [63:48]
+
+	add	x9, x9, x10
+	add	x11, x11, x12
+	add	x9, x9, x11
+
+	// fold the carry above bit 15 back in, twice
+	and	x10, x9, x13
+	add	x5, x10, x9, lsr #16
+	and	x10, x5, x13
+	add	x5, x10, x5, lsr #16
+
+	cbz	x4, out			// addr isn't odd
+
+	rev16	w5, w5			// swap the two bytes of the result
+
+out:
+	mov	x0, x5
+	ret
+ENDPROC(do_csum)