From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 82685 invoked by alias); 6 Sep 2019 16:51:42 -0000 Mailing-List: contact libc-stable-help@sourceware.org; run by ezmlm Precedence: bulk List-Post: List-Help: List-Subscribe: List-Archive: Sender: libc-stable-owner@sourceware.org Received: (qmail 82672 invoked by uid 89); 6 Sep 2019 16:51:41 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Checked: by ClamAV 0.100.3 on sourceware.org X-Virus-Found: No X-Spam-SWARE-Status: No, score=-19.6 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,KAM_LOTSOFHASH,RCVD_IN_DNSWL_NONE,SPF_HELO_PASS,SPF_PASS autolearn=ham version=3.3.1 spammy= X-Spam-Status: No, score=-19.6 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,KAM_LOTSOFHASH,RCVD_IN_DNSWL_NONE,SPF_HELO_PASS,SPF_PASS autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on sourceware.org X-Spam-Level: X-HELO: EUR01-VE1-obe.outbound.protection.outlook.com Received: from mail-eopbgr140044.outbound.protection.outlook.com (HELO EUR01-VE1-obe.outbound.protection.outlook.com) (40.107.14.44) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 06 Sep 2019 16:51:38 +0000 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=+bYAbOuHbwgkyzHOXPe7m/Cm6ujLSc7xocTn5CvEdQY=; b=Yrd+cULF77f3BqKhu51RvAqEXpoSaN8zJgLDIqh7yfCLEKOGGXWT/+9YPu3OiJ4r3pOQzbEWtvidOF4x0lr9ckwd1VN6GUA3Ii5dwg0jBe1O76Pal142ci5TskXcqz2sbNt2f9NtoxDgmACsOWugan+943FvhdplS8AAtpoqBhk= Received: from VI1PR08CA0112.eurprd08.prod.outlook.com (2603:10a6:800:d4::14) by AM6PR08MB3816.eurprd08.prod.outlook.com (2603:10a6:20b:8c::26) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2241.15; Fri, 6 Sep 2019 16:51:33 +0000 Received: from 
AM5EUR03FT017.eop-EUR03.prod.protection.outlook.com (2a01:111:f400:7e08::207) by VI1PR08CA0112.outlook.office365.com (2603:10a6:800:d4::14) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2241.13 via Frontend Transport; Fri, 6 Sep 2019 16:51:33 +0000 Authentication-Results: spf=temperror (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) header.d=armh.onmicrosoft.com;sourceware.org; dmarc=temperror action=none header.from=arm.com; Received-SPF: TempError (protection.outlook.com: error in processing during lookup of arm.com: DNS Timeout) Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by AM5EUR03FT017.mail.protection.outlook.com (10.152.16.89) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2241.14 via Frontend Transport; Fri, 6 Sep 2019 16:51:32 +0000 Received: ("Tessian outbound f83cc93ed55d:v27"); Fri, 06 Sep 2019 16:51:32 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: 64b4298c31e81f4f X-CR-MTA-TID: 64aa7808 Received: from cf22ec604285.1 (cr-mta-lb-1.cr-mta-net [104.47.12.53]) by 64aa7808-outbound-1.mta.getcheckrecipient.com id AB908E9C-CB5F-44DD-B2B8-143E56FE5CA7.1; Fri, 06 Sep 2019 16:51:26 +0000 Received: from EUR04-DB3-obe.outbound.protection.outlook.com (mail-db3eur04lp2053.outbound.protection.outlook.com [104.47.12.53]) by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id cf22ec604285.1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384); Fri, 06 Sep 2019 16:51:26 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; 
b=DYbsGVcjzqKjVmacu+1f9xgJmhsJ7Q2Ai13zAhWRn6NNRIyZJhJVo4yzJeRxTTUoUC+8iowHwS6R8ocMjUeyEHkoSLpQDDoysOewRi4zOW7BpDo19fMj2kAidZ4Xr5w/p1Z84H18vgX7eJgrPI21jQk0DCZz5toVAEQUjAsr6NLHiZOGcaCC7El/j2tFItL/LsftFrqHyOrjAW42N4RIsYJ4qWczmYk979D6fxQEdLDQdIEaVzJIainQ/r8eHP4jIl7myT0fM/Zc5BAULhWw7Sz9iHzgBpKgxZWCJLYUEHJWG1lrhkgZECZFp+rf1Xgeafe4CLEBUsb0RfFxdmt04A== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=+bYAbOuHbwgkyzHOXPe7m/Cm6ujLSc7xocTn5CvEdQY=; b=n/ir5qMj8zOwNHlwu4IUioAFu+5suF6GLfM+94vq2PpUe89CrzgW4tNtOiMxdFFMWY/9e5AbGWWBKwtbKEc8bk5RgiVqaXQTwo2ahdMox0G9vdBTuOJctobcbTKJhWSM6fzsCT9+twKIe+cU0MFevqoq79cckhwTxQtYbjyid/FQ2WCEpdNsoIxY0FJXACSQdywrSCUTkDUz2EWyIFQMOsGNZjVdQdml16/Z6dEwa2NtXIv8HNKbRzv6dTutpjUFbGZMvAUQ+xm/221pkTXKIiQ4js7bhpzvjrZhYI8rgxLqzaQ5EbUNocpojGaBuMU0h63NtADBpDEL/+r2K5SKOg== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=+bYAbOuHbwgkyzHOXPe7m/Cm6ujLSc7xocTn5CvEdQY=; b=Yrd+cULF77f3BqKhu51RvAqEXpoSaN8zJgLDIqh7yfCLEKOGGXWT/+9YPu3OiJ4r3pOQzbEWtvidOF4x0lr9ckwd1VN6GUA3Ii5dwg0jBe1O76Pal142ci5TskXcqz2sbNt2f9NtoxDgmACsOWugan+943FvhdplS8AAtpoqBhk= Received: from VI1PR0801MB2127.eurprd08.prod.outlook.com (10.168.62.22) by VI1PR0801MB2109.eurprd08.prod.outlook.com (10.173.74.12) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2220.19; Fri, 6 Sep 2019 16:51:25 +0000 Received: from VI1PR0801MB2127.eurprd08.prod.outlook.com ([fe80::7c75:98da:fbc1:da02]) by VI1PR0801MB2127.eurprd08.prod.outlook.com ([fe80::7c75:98da:fbc1:da02%11]) with mapi id 15.20.2241.018; Fri, 6 Sep 2019 
16:51:25 +0000 From: Wilco Dijkstra To: "libc-stable@sourceware.org" CC: nd Subject: [2.26 COMMITTED][AArch64] Backport memcmp improvements Thread-Topic: [2.26 COMMITTED][AArch64] Backport memcmp improvements Thread-Index: AQHVZNMMk5Oxtqqu4EOhtXwH+NRKFg== Date: Tue, 01 Jan 2019 00:00:00 -0000 Message-ID: Accept-Language: en-GB, en-US Content-Language: en-GB X-MS-Has-Attach: X-MS-TNEF-Correlator: Authentication-Results-Original: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; x-originating-ip: [217.140.106.54] x-ms-publictraffictype: Email X-MS-Office365-Filtering-Correlation-Id: 66fd0af8-bb90-4c8d-22e7-08d732ea78ad X-MS-Office365-Filtering-HT: Tenant X-Microsoft-Antispam-Untrusted: BCL:0;PCL:0;RULEID:(2390118)(7020095)(4652040)(8989299)(4534185)(4627221)(201703031133081)(201702281549075)(8990200)(5600166)(711020)(4605104)(1401327)(4618075)(2017052603328)(7193020);SRVR:VI1PR0801MB2109; X-MS-TrafficTypeDiagnostic: VI1PR0801MB2109:|AM6PR08MB3816: X-Microsoft-Antispam-PRVS: x-checkrecipientrouted: true x-ms-oob-tlc-oobclassifiers: OLM:5236;OLM:5236; x-forefront-prvs: 0152EBA40F X-Forefront-Antispam-Report-Untrusted: SFV:NSPM;SFS:(10009020)(4636009)(376002)(366004)(136003)(39860400002)(396003)(346002)(54534003)(189003)(199004)(186003)(26005)(316002)(7696005)(102836004)(6506007)(74316002)(305945005)(7736002)(2501003)(8676002)(4326008)(99286004)(81156014)(25786009)(81166006)(8936002)(5640700003)(256004)(14444005)(55016002)(66066001)(6436002)(71190400001)(71200400001)(486006)(476003)(9686003)(53936002)(14454004)(6916009)(33656002)(6116002)(3846002)(2906002)(86362001)(66556008)(66476007)(66946007)(2351001)(478600001)(66446008)(76116006)(52536014)(5660300002)(64756008)(357404004);DIR:OUT;SFP:1101;SCL:1;SRVR:VI1PR0801MB2109;H:VI1PR0801MB2127.eurprd08.prod.outlook.com;FPR:;SPF:None;LANG:en;PTR:InfoNoRecords;MX:1;A:1; received-spf: None (protection.outlook.com: arm.com does not designate permitted sender hosts) X-MS-Exchange-SenderADCheck: 1 
X-Microsoft-Antispam-Message-Info-Original: 50rc3BCp5hRMhkCZrINaxosg1RA/pIeo3HEhZfyRxMf3ruFceaKcatXU16X/QA3pl4bd8cpdJAczfCSGsFt64cxqcREJxubek7BkqrkkFb8/pSTASny1VDGMJvHEuazpinsViW3g7jLvuiVXnTpjYoI6wvI3B5Pd6NTC8ZRN/18zhcxY1/AsnJMHA1aJFD2h0jSVsRmoRmPbAN1KrfF3oIrqYeQdltYZafwZdLr4ISDM1UK+CZliDLiT6YyQ3wqHJhn2UoxwjZa4WBWux++ERczweuU7feQ0jfn4wth2ijflZXWKzhmjAFlXnieHt/xNGegr7E5p2MBuAI/Y04qZf+5DA2MMEhAOJFRSxB/nMPEVuskr//JMlxn7A7WgdAQoChhyl7kObMq1ac4T7dd/xi5Z+x97I9aXHW6QrOe2RRM= x-ms-exchange-transport-forked: True Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR0801MB2109 Original-Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: AM5EUR03FT017.eop-EUR03.prod.protection.outlook.com X-Forefront-Antispam-Report: CIP:63.35.35.123;IPV:CAL;SCL:-1;CTRY:IE;EFV:NLI;SFV:NSPM;SFS:(10009020)(4636009)(39860400002)(136003)(376002)(396003)(346002)(2980300002)(54534003)(189003)(199004)(55016002)(14444005)(186003)(70206006)(70586007)(7696005)(74316002)(102836004)(33656002)(305945005)(6506007)(26005)(25786009)(486006)(5640700003)(9686003)(8936002)(7736002)(99286004)(8746002)(86362001)(316002)(8676002)(356004)(81166006)(3846002)(2351001)(6116002)(52536014)(81156014)(26826003)(6916009)(50466002)(5660300002)(14454004)(22756006)(4326008)(2906002)(478600001)(23756003)(63370400001)(47776003)(126002)(476003)(76130400001)(2501003)(66066001)(336012)(36906005)(63350400001)(357404004);DIR:OUT;SFP:1101;SCL:1;SRVR:AM6PR08MB3816;H:64aa7808-outbound-1.mta.getcheckrecipient.com;FPR:;SPF:TempError;LANG:en;PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com;MX:1;A:1; X-MS-Office365-Filtering-Correlation-Id-Prvs: 3b53de25-7137-4f27-62a6-08d732ea743f X-Microsoft-Antispam: 
BCL:0;PCL:0;RULEID:(2390118)(7020095)(4652040)(8989299)(4534185)(4627221)(201703031133081)(201702281549075)(8990200)(5600166)(710020)(711020)(4605104)(1401327)(4618075)(2017052603328)(7193020);SRVR:AM6PR08MB3816; NoDisclaimer: True X-Forefront-PRVS: 0152EBA40F X-Microsoft-Antispam-Message-Info: gheVW8Bji8GLZBOBtWqR3W7atzH19Q3tjyCns5t6K3Tvr76k9OEz+3j8dSvY9mE92517BSeDXwZ1pSdffEh1Mr/fs2bqpRu0mW+XhYQmuqoN1H//lUyojfmwxGc56emn6aMkWhotQXwNo6ViErx4vqUcH+AGC+vdMOs7dwcezbWME2dqC3SHXpEeK5iOo2tQ/o044LcdW5B0D67xyYVBo2h4vZvO/Dw4B7aaRXtnWKGkPjzFizsROA2mBz2P4kxh/DzurIGJ+FVBsgc6MoFZnv2gIF6eBajdkyZmcCaR/7ca3h2PBlCDlSsDqKOv+sc0PPsi6uVg5gqTvprm/LCb4cRdagSqnTydrKhFdXhzhYg35AEqUHskbc/0HlmG4pBgDZLYyH0r3jHD2blKh2BEOhiRB5RqKYin8DHUjqp63kU= X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 06 Sep 2019 16:51:32.3112 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: 66fd0af8-bb90-4c8d-22e7-08d732ea78ad X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d;Ip=[63.35.35.123];Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM6PR08MB3816 X-SW-Source: 2019-09/txt/msg00007.txt.bz2 commit ec4512194f035856b8a231476c9139d72f47c58f Author: Siddhesh Poyarekar Date: Tue Mar 6 19:22:39 2018 +0530 aarch64: Optimized memcmp for medium to large sizes =20=20=20=20 This improved memcmp provides a fast path for compares up to 16 bytes and then compares 16 bytes at a time, thus optimizing loads from both sources. The glibc memcmp microbenchmark retains performance (with an error of ~1ns) for smaller compare sizes and reduces up to 31% of execution time for compares up to 4K on the APM Mustang. On Qualcomm Falkor this improves to almost 48%, i.e. it is almost 2x improvement for sizes of 2K and above. 
=20=20=20=20 * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time. =20=20=20=20 (cherry picked from commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585) commit 600e4e866c4de0cc0b16aec482c65da732960367 Author: Siddhesh Poyarekar Date: Fri Feb 2 10:15:20 2018 +0530 aarch64: Use the L() macro for labels in memcmp =20=20=20=20 The L() macro makes the assembly a bit more readable. =20=20=20=20 * sysdeps/aarch64/memcmp.S: Use L() macro for labels. =20=20=20=20 (cherry picked from commit 84c94d2fd90d84ae7e67657ee8e22c2d1b796f63) commit 1896de3d926d299a1ed5c9f0a4f03f5a81969200 Author: Wilco Dijkstra Date: Thu Aug 10 17:00:38 2017 +0100 [AArch64] Optimized memcmp. =20=20=20=20 This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. =20=20=20=20 This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. =20=20=20=20 This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. =20=20=20=20 * sysdeps/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. =20=20=20=20 (cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5) diff --git a/ChangeLog b/ChangeLog index 4cd747a..204d047 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a + time. + +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Use L() macro for labels. 
+ +2019-09-06 Wilco Dijkstra + + * sysdeps/aarch64/memcmp.S (memcmp): + Rewrite of optimized memcmp. + 2019-07-12 Adhemerval Zanella =20 [BZ #24699] diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 4cfcb89..d074c98 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -22,132 +22,132 @@ =20 /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ =20 /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 =20 /* Internal variables. */ #define data1 x3 #define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 =20 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) DELOUSE (1) DELOUSE (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. = */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 =3D=3D 0 =3D> all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): =20 -#ifndef __AARCH64EB__ - rev diff, diff + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or les= s) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when = we + try to align, so limit it only to strings larger than 128 bytes.= */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <=3D 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. 
*/ +L(return): +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. = */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. = */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV =3D 0b0000. */ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. 
*/ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV =3D 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp)