From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from wout1-smtp.messagingengine.com (wout1-smtp.messagingengine.com [64.147.123.24]) by sourceware.org (Postfix) with ESMTPS id A39863973049 for ; Fri, 15 Jan 2021 11:31:36 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org A39863973049 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=danielengel.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gnu@danielengel.com Received: from compute4.internal (compute4.nyi.internal [10.202.2.44]) by mailout.west.internal (Postfix) with ESMTP id BDA11F71; Fri, 15 Jan 2021 06:31:35 -0500 (EST) Received: from mailfrontend2 ([10.202.2.163]) by compute4.internal (MEProxy); Fri, 15 Jan 2021 06:31:35 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=danielengel.com; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; s=fm1; bh=gj+mpZxcaF0D5 4eHPYFGN2K7fPdVtbGLjxfIhLhsfJQ=; b=E/ai1EnJqIJpoJh11zkDylar07gUO BVupk+BJcayWApd836H79PlFSnwZO06SCV6vltP9Rmbt5w0qmlSNA16mWsqQ6c7J u3bWB6h0EthLELIqeUOt5OI7b17iyH+N2aF3evOX6U3ITog4f0OLfxSUiwOUnWem VfsIy1m+HMauJuoBwX1iLMwdbFsicfKKT0kBDEOjixN6Ail1lBnBNtPAysjxztFr PmZD8J/eoz5YuYmt+cLNoyGlig7YL39UHzTFl95SU9BRhXPKccMpF6J/V44YCaUq Ajt2WxMk6t1Y98XauT9WiLnviveli+ZHb0loknpKvKslZqfkz6An79WVg== DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d= messagingengine.com; h=cc:content-transfer-encoding:date:from :in-reply-to:message-id:mime-version:references:subject:to :x-me-proxy:x-me-proxy:x-me-sender:x-me-sender:x-sasl-enc; s= fm1; bh=gj+mpZxcaF0D54eHPYFGN2K7fPdVtbGLjxfIhLhsfJQ=; b=GKlkGjAh AVlu79qdv8kyy9fmJ72F/dwokeAxXWAVO6mf7aaaREzjekvF0tVBet/uSnSjs4wI IIdQod+OsrCw0XA/62FR6QWuNHkAVJg2bIIxnWRIIcQW/S7DknNrPdgYUWiK5ytP OiNnHeup3ELHm2S7BdaRkaB0hFwzKutcBjEO9YllavWgobdMIiLO1NdYRGPFcrG7 I1QMotTcnKVRUOU7xC25AItGmWpFsT5xD4YgDzL7pr+pWgxR/uqUbyv5c1DDxCKJ /SvFL2DYDl1mhvE+rocNHpBf/sPvalfxTrOV6grm7hqj5jThq2T96DdbIydEfSMb RP/21y+QGo9peA== X-ME-Sender: X-ME-Proxy-Cause: gggruggvucftvghtrhhoucdtuddrgeduledrtddvgddtgecutefuodetggdotefrodftvf curfhrohhfihhlvgemucfhrghsthforghilhdpqfgfvfdpuffrtefokffrpgfnqfghnecu uegrihhlohhuthemuceftddtnecunecujfgurhephffvufffkffojghfggfgsedtkeertd ertddtnecuhfhrohhmpeffrghnihgvlhcugfhnghgvlhcuoehgnhhusegurghnihgvlhgv nhhgvghlrdgtohhmqeenucggtffrrghtthgvrhhnpeetkeeikeevieegjeetheejueffte dugeevfedvuefhfeduhfffgfefleegvdfhgeenucffohhmrghinheplhhisgdufhhunhgt shdrshgspdhpohhptghnthdrshgspdhgnhhurdhorhhgnecukfhppeejuddrfeeirddutd dtrddvvddtnecuvehluhhsthgvrhfuihiivgeptdenucfrrghrrghmpehmrghilhhfrhho mhepghhnuhesuggrnhhivghlvghnghgvlhdrtghomh X-ME-Proxy: Received: from sendmail.lorien.danielengel.com (71-36-100-220.ptld.qwest.net [71.36.100.220]) by mail.messagingengine.com (Postfix) with ESMTPA id 001B21080064; Fri, 15 Jan 2021 06:31:34 -0500 (EST) Received: from ubuntu.lorien.danielengel.com (ubuntu.lorien.danielengel.com [10.0.0.96]) by sendmail.lorien.danielengel.com (8.15.2/8.15.2) with ESMTP id 10FBVXHR023736; Fri, 15 Jan 2021 03:31:33 -0800 (PST) (envelope-from gnu@danielengel.com) From: Daniel Engel To: gcc-patches@gcc.gnu.org Cc: Richard.Earnshaw@foss.arm.com, christophe.lyon@linaro.org Subject: [PATCH v5 15/33] Import 'popcnt' functions from the CM0 library Date: Fri, 15 Jan 2021 03:30:43 -0800 Message-Id: X-Mailer: git-send-email 2.25.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-13.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, JMQ_SPF_NEUTRAL, KAM_SHORT, RCVD_IN_DNSWL_LOW, RCVD_IN_MSPIKE_H4, RCVD_IN_MSPIKE_WL, SCC_5_SHORT_WORD_LINES, SPF_HELO_PASS, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 15 Jan 2021 11:31:38 -0000 The functional overlap between the single- and double-word functions makes this implementation about 30% smaller than the C functions if both functions are linked together in the same appliation. gcc/libgcc/ChangeLog: 2021-01-13 Daniel Engel * config/arm/popcnt.S (__popcountsi, __popcountdi2): New file. * config/arm/lib1funcs.S: #include bit/popcnt.S * config/arm/t-elf (LIB1ASMFUNCS): Add _popcountsi2/di2. --- libgcc/config/arm/lib1funcs.S | 1 + libgcc/config/arm/popcnt.S | 189 ++++++++++++++++++++++++++++++++++ libgcc/config/arm/t-elf | 2 + 3 files changed, 192 insertions(+) create mode 100644 libgcc/config/arm/popcnt.S diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S index 600ea2dfdc9..bd84a3e4281 100644 --- a/libgcc/config/arm/lib1funcs.S +++ b/libgcc/config/arm/lib1funcs.S @@ -1705,6 +1705,7 @@ LSYM(Lover12): #include "clz2.S" #include "ctz2.S" #include "parity.S" +#include "popcnt.S" /* ------------------------------------------------------------------------ */ /* These next two sections are here despite the fact that they contain Thumb diff --git a/libgcc/config/arm/popcnt.S b/libgcc/config/arm/popcnt.S new file mode 100644 index 00000000000..51b1ed745ee --- /dev/null +++ b/libgcc/config/arm/popcnt.S @@ -0,0 +1,189 @@ +/* popcnt.S: ARM optimized popcount functions + + Copyright (C) 2020-2021 Free Software Foundation, Inc. + Contributed by Daniel Engel (gnu@danielengel.com) + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + + +#ifdef L_popcountdi2 + +// int __popcountdi2(int) +// Returns the number of bits set in $r1:$r0. +// Returns the result in $r0. +FUNC_START_SECTION popcountdi2 .text.sorted.libgcc.popcountdi2 + CFI_START_FUNCTION + + #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__ + // Initialize the result. + // Compensate for the two extra loop (one for each word) + // required to detect zero arguments. + movs r2, #2 + + LLSYM(__popcountd_loop): + // Same as __popcounts_loop below, except for $r1. + subs r2, #1 + subs r3, r1, #1 + ands r1, r3 + bcs LLSYM(__popcountd_loop) + + // Repeat the operation for the second word. + b LLSYM(__popcounts_loop) + + #else /* !__OPTIMIZE_SIZE__ */ + // Load the one-bit alternating mask. + ldr r3, =0x55555555 + + // Reduce the second word. + lsrs r2, r1, #1 + ands r2, r3 + subs r1, r2 + + // Reduce the first word. + lsrs r2, r0, #1 + ands r2, r3 + subs r0, r2 + + // Load the two-bit alternating mask. + ldr r3, =0x33333333 + + // Reduce the second word. + lsrs r2, r1, #2 + ands r2, r3 + ands r1, r3 + adds r1, r2 + + // Reduce the first word. + lsrs r2, r0, #2 + ands r2, r3 + ands r0, r3 + adds r0, r2 + + // There will be a maximum of 8 bits in each 4-bit field. + // Jump into the single word flow to combine and complete. + b LLSYM(__popcounts_merge) + + #endif /* !__OPTIMIZE_SIZE__ */ +#endif /* L_popcountdi2 */ + + +// The implementation of __popcountdi2() tightly couples with __popcountsi2(), +// such that instructions must appear consecutively in the same memory +// section for proper flow control. However, this construction inhibits +// the ability to discard __popcountdi2() when only using __popcountsi2(). +// Therefore, this block configures __popcountsi2() for compilation twice. +// The first version is a minimal standalone implementation, and the second +// version is the continuation of __popcountdi2(). The standalone version must +// be declared WEAK, so that the combined version can supersede it and +// provide both symbols when required. +// '_popcountsi2' should appear before '_popcountdi2' in LIB1ASMFUNCS. +#if defined(L_popcountsi2) || defined(L_popcountdi2) + +#ifdef L_popcountsi2 +// int __popcountsi2(int) +// Returns '0' if the number of bits set in $r0 is even, and '1' otherwise. +// Returns the result in $r0. +// Uses $r2 as scratch space. +WEAK_START_SECTION popcountsi2 .text.sorted.libgcc.popcountsi2 + CFI_START_FUNCTION + +#else /* L_popcountdi2 */ +FUNC_ENTRY popcountsi2 + +#endif + + #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__ + // Initialize the result. + // Compensate for the extra loop required to detect zero. + movs r2, #1 + + // Kernighan's algorithm for __popcount(x): + // for (c = 0; x; c++) + // x &= x - 1; + + LLSYM(__popcounts_loop): + // Every loop counts for a '1' set in the argument. + // Count down since it's easier to initialize positive compensation, + // and the negation before function return is free. + subs r2, #1 + + // Clear one bit per loop. + subs r3, r0, #1 + ands r0, r3 + + // If this is a test for zero, it will be impossible to distinguish + // between zero and one bits set: both terminate after one loop. + // Instead, subtraction underflow flags when zero entered the loop. + bcs LLSYM(__popcounts_loop) + + // Invert the result, since we have been counting negative. + rsbs r0, r2, #0 + RET + + #else /* !__OPTIMIZE_SIZE__ */ + + // Load the one-bit alternating mask. + ldr r3, =0x55555555 + + // Reduce the word. + lsrs r1, r0, #1 + ands r1, r3 + subs r0, r1 + + // Load the two-bit alternating mask. + ldr r3, =0x33333333 + + // Reduce the word. + lsrs r1, r0, #2 + ands r0, r3 + ands r1, r3 + LLSYM(__popcounts_merge): + adds r0, r1 + + // Load the four-bit alternating mask. + ldr r3, =0x0F0F0F0F + + // Reduce the word. + lsrs r1, r0, #4 + ands r0, r3 + ands r1, r3 + adds r0, r1 + + // Accumulate individual byte sums into the MSB. + lsls r1, r0, #8 + adds r0, r1 + lsls r1, r0, #16 + adds r0, r1 + + // Isolate the cumulative sum. + lsrs r0, #24 + RET + + #endif /* !__OPTIMIZE_SIZE__ */ + + CFI_END_FUNCTION +FUNC_END popcountsi2 + +#ifdef L_popcountdi2 +FUNC_END popcountdi2 +#endif + +#endif /* L_popcountsi2 || L_popcountdi2 */ + diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf index 0e9b9ce21af..2e3f04aa2f0 100644 --- a/libgcc/config/arm/t-elf +++ b/libgcc/config/arm/t-elf @@ -25,6 +25,7 @@ LIB1ASMFUNCS += \ _clzsi2 \ _ctzsi2 \ _paritysi2 \ + _popcountsi2 \ # Group 1: Integer function objects. @@ -39,6 +40,7 @@ LIB1ASMFUNCS += \ _ffssi2 \ _ffsdi2 \ _paritydi2 \ + _popcountdi2 \ _dvmd_tls \ _divsi3 \ _modsi3 \ -- 2.25.1