From patchwork Mon Dec 13 18:00:56 2021
X-Patchwork-Submitter: David Laight
X-Patchwork-Id: 12674461
From: David Laight
To: Noah Goldstein, Eric Dumazet
CC: tglx@linutronix.de, mingo@redhat.com, Borislav Petkov,
    dave.hansen@linux.intel.com, X86 ML, hpa@zytor.com, peterz@infradead.org,
    alexanderduyck@fb.com, open list, netdev
Subject: [PATCH] lib/x86: Optimise csum_partial of buffers that are not
    multiples of 8 bytes.
Date: Mon, 13 Dec 2021 18:00:56 +0000
X-Mailing-List: netdev@vger.kernel.org

Add in the trailing bytes first so that there is no need to worry
about the sum exceeding 64 bits.

Signed-off-by: David Laight <david.laight@aculab.com>
---
This ought to be faster - because of all the removed 'adc $0'.
Guessing how fast x86 code will run is hard!
There are other ways of handling buffers that are shorter than 8 bytes,
but I'd rather hope they don't happen in any hot paths.

Note - I've not even compile tested it.
(But have tested an equivalent change before.)
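Relocating the tail is safe because the trailing bytes start at an
8-byte aligned offset, so adding them at the bottom of the 64-bit
accumulator moves their contribution by a whole number of 16-bit words,
and the folded checksum does not care which 16-bit lane a value lands
in.  A quick user-space illustration of that property (illustrative
code only, not from the kernel tree):

#include <stdint.h>
#include <stdio.h>

/* fold a 64-bit accumulator down to the 16-bit ones' complement sum */
static uint16_t fold64(uint64_t sum)
{
	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	uint64_t v = 0xbeef;
	int lane;

	/* same folded result whichever 16-bit lane the value occupies */
	for (lane = 0; lane < 4; lane++)
		printf("lane %d: %#x\n", lane, (unsigned)fold64(v << (16 * lane)));
	return 0;
}

All four lines print 0xbeef, which is why shifting the tail by a
multiple of 16 bits does not change the final checksum.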
 arch/x86/lib/csum-partial_64.c | 55 ++++++++++++----------------------
 1 file changed, 19 insertions(+), 36 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index abf819dd8525..fbcc073fc2b5 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -37,6 +37,24 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	u64 temp64 = (__force u64)sum;
 	unsigned result;
 
+	if (len & 7) {
+		if (unlikely(len < 8)) {
+			/* Avoid falling off the start of the buffer */
+			if (len & 4) {
+				temp64 += *(u32 *)buff;
+				buff += 4;
+			}
+			if (len & 2) {
+				temp64 += *(u16 *)buff;
+				buff += 2;
+			}
+			if (len & 1)
+				temp64 += *(u8 *)buff;
+			goto reduce_to32;
+		}
+		temp64 += *(u64 *)(buff + len - 8) >> (8 - (len & 7)) * 8;
+	}
+
 	while (unlikely(len >= 64)) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq 1*8(%[src]),%[res]\n\t"
@@ -82,43 +100,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 			: "memory");
 		buff += 8;
 	}
-	if (len & 7) {
-#ifdef CONFIG_DCACHE_WORD_ACCESS
-		unsigned int shift = (8 - (len & 7)) * 8;
-		unsigned long trail;
-
-		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
-		asm("addq %[trail],%[res]\n\t"
-		    "adcq $0,%[res]"
-			: [res] "+r" (temp64)
-			: [trail] "r" (trail));
-#else
-		if (len & 4) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u32 *)buff)
-				: "memory");
-			buff += 4;
-		}
-		if (len & 2) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u16 *)buff)
-				: "memory");
-			buff += 2;
-		}
-		if (len & 1) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u8 *)buff)
-				: "memory");
-		}
-#endif
-	}
+reduce_to32:
 	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	return (__force __wsum)result;
 }
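For anyone who wants to sanity check the tail-first flow without
building a kernel, below is a rough user-space model (my own names,
plain C in place of the asm carry chains, and a simple 16-bit word
reference; a sketch of the idea, not the kernel function):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t fold64(uint64_t sum)
{
	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* reference: little-endian 16-bit words, odd trailing byte zero padded */
static uint16_t ref_sum(const uint8_t *p, int len)
{
	uint64_t sum = 0;
	int i;

	for (i = 0; i + 1 < len; i += 2)
		sum += p[i] | (uint32_t)p[i + 1] << 8;
	if (len & 1)
		sum += p[len - 1];
	return fold64(sum);
}

/* model of the patched flow: fold the tail in first, then 8-byte chunks */
static uint16_t tail_first_sum(const uint8_t *buff, int len)
{
	uint64_t temp64 = 0, w, prev;
	int done;

	if (len & 7) {
		if (len < 8) {
			uint32_t v32;
			uint16_t v16;

			if (len & 4) {
				memcpy(&v32, buff, 4);
				temp64 += v32;
				buff += 4;
			}
			if (len & 2) {
				memcpy(&v16, buff, 2);
				temp64 += v16;
				buff += 2;
			}
			if (len & 1)
				temp64 += *buff;
			return fold64(temp64);
		}
		/* last 8 bytes; shift the already-counted low bytes away */
		memcpy(&w, buff + len - 8, 8);
		temp64 += w >> (8 - (len & 7)) * 8;
	}
	for (done = 0; done + 8 <= len; done += 8) {
		memcpy(&w, buff + done, 8);
		prev = temp64;
		temp64 += w;
		if (temp64 < prev)	/* emulate the adc end-around carry */
			temp64++;
	}
	return fold64(temp64);
}

int main(void)
{
	uint8_t buf[64];
	int i, len;

	for (i = 0; i < 64; i++)
		buf[i] = (uint8_t)(i * 37 + 11);
	for (len = 1; len <= 64; len++)
		if (tail_first_sum(buf, len) != ref_sum(buf, len))
			printf("mismatch at len %d\n", len);
	printf("done\n");
	return 0;
}

This assumes a little-endian host (as the kernel code does) and only
checks the arithmetic, not the asm or the page-crossing behaviour.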