From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 69949 invoked by alias); 6 Sep 2019 18:41:20 -0000 Mailing-List: contact libc-stable-help@sourceware.org; run by ezmlm Precedence: bulk List-Post: List-Help: List-Subscribe: List-Archive: Sender: libc-stable-owner@sourceware.org Received: (qmail 69934 invoked by uid 89); 6 Sep 2019 18:41:20 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Checked: by ClamAV 0.100.3 on sourceware.org X-Virus-Found: No X-Spam-SWARE-Status: No, score=-19.8 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,RCVD_IN_DNSWL_NONE,SPF_HELO_PASS,SPF_PASS autolearn=ham version=3.3.1 spammy= X-Spam-Status: No, score=-19.8 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_0,GIT_PATCH_1,GIT_PATCH_2,GIT_PATCH_3,RCVD_IN_DNSWL_NONE,SPF_HELO_PASS,SPF_PASS autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on sourceware.org X-Spam-Level: X-HELO: EUR02-VE1-obe.outbound.protection.outlook.com Received: from mail-eopbgr20080.outbound.protection.outlook.com (HELO EUR02-VE1-obe.outbound.protection.outlook.com) (40.107.2.80) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 06 Sep 2019 18:41:17 +0000 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=AScBmVCOCSt4J/Lrm9zqIN2jZREhrwN4RyvIvqGUx74=; b=qR3/64mJQESFa+GvtnCftKro8MOprTHWNrcjnQBCcRy8mdB6NAzV3McEHjjYUG5zkIru/qkCcHN8F1jSleJfMEM4Jd8/UDt9UmqMnqRQ3Rgtd+AwZ+llEoeda5rPnQmDz6yIqNMZ7MuUzKEnCnvI6FUd7y4OwSuhUSUpEwYXEx0= Received: from AM6PR08CA0001.eurprd08.prod.outlook.com (2603:10a6:20b:b2::13) by VI1PR08MB3102.eurprd08.prod.outlook.com (2603:10a6:803:46::19) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2220.18; Fri, 6 Sep 2019 18:41:11 +0000 Received: from 
DB5EUR03FT010.eop-EUR03.prod.protection.outlook.com (2a01:111:f400:7e0a::203) by AM6PR08CA0001.outlook.office365.com (2603:10a6:20b:b2::13) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2241.14 via Frontend Transport; Fri, 6 Sep 2019 18:41:11 +0000 Authentication-Results: spf=temperror (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) header.d=armh.onmicrosoft.com;sourceware.org; dmarc=temperror action=none header.from=arm.com; Received-SPF: TempError (protection.outlook.com: error in processing during lookup of arm.com: DNS Timeout) Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by DB5EUR03FT010.mail.protection.outlook.com (10.152.20.96) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2241.14 via Frontend Transport; Fri, 6 Sep 2019 18:41:10 +0000 Received: ("Tessian outbound ea3fc1501f20:v27"); Fri, 06 Sep 2019 18:41:10 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: d8d767459feb6049 X-CR-MTA-TID: 64aa7808 Received: from 00ca00c78f7a.1 (cr-mta-lb-1.cr-mta-net [104.47.9.52]) by 64aa7808-outbound-1.mta.getcheckrecipient.com id 713930D7-4DC9-43B6-927B-01990172D50F.1; Fri, 06 Sep 2019 18:41:04 +0000 Received: from EUR03-VE1-obe.outbound.protection.outlook.com (mail-ve1eur03lp2052.outbound.protection.outlook.com [104.47.9.52]) by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id 00ca00c78f7a.1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384); Fri, 06 Sep 2019 18:41:04 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; 
b=jSwvmeZtOtBWOe0DmqKL8PXk7AipW7uSmNd3Kih+/S0W382mtTF3+9ZXNLf3jn43vYMGLJZHZcI4nVZrYRlC6r/m7w0hyguojPHPTQrrP0dXDHQIwiJZevZUGHCsYjGwoMtmYfvvH5RSpiWqJ28p4RY3Q8tHJehH6QPQAP2/IYVYrTBpey0zhgvoqOd2gBonXIdkjRFLpJ0n7c774yNhfTxbuSm6gW4+cnST11iPz+T87CYYbZPkaWMkCGwvV7tN4sVtG30aq0Ru/dC9qIoI27LnZdnObfhoaNmvZvHak7wqXLbwy5Z2hhwDGXKKK5CL/1/BSwdIiRRKu9tiBrzQLQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=AScBmVCOCSt4J/Lrm9zqIN2jZREhrwN4RyvIvqGUx74=; b=WiDmoec+tcoYGaS9mWU31KeWr1q5wr8MvVvEypYsbyk5FX46BWaChPQkjeqPu3t5YwbRWyQj8esDXQ03wTvKIuGLkt8KHVYfl+s8Z2X/iHGDs1Oe26nLx3SiwaEHC8A08wgRYXt1NDll8lYnVGB6evO01ZaKc3Jk7myyaRBzanze77Z0kb6YUnySdyAJMtQx3ySKaAN+tU0QGQHQr6GAnelw+KjhOX7NZvCtYiP8IhOr9ZKbuGd0yX/dXFjIVZjji9jgOavRGszJtj73RO+C1qctgXmzHzaODwF6q4g9znIny0M9ErYNRnVzBTiDjGpMX/ReLtgTxnPQqbiKbWAEGA== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=AScBmVCOCSt4J/Lrm9zqIN2jZREhrwN4RyvIvqGUx74=; b=qR3/64mJQESFa+GvtnCftKro8MOprTHWNrcjnQBCcRy8mdB6NAzV3McEHjjYUG5zkIru/qkCcHN8F1jSleJfMEM4Jd8/UDt9UmqMnqRQ3Rgtd+AwZ+llEoeda5rPnQmDz6yIqNMZ7MuUzKEnCnvI6FUd7y4OwSuhUSUpEwYXEx0= Received: from VI1PR0801MB2127.eurprd08.prod.outlook.com (10.168.62.22) by VI1PR0801MB1872.eurprd08.prod.outlook.com (10.173.72.12) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.2220.20; Fri, 6 Sep 2019 18:41:00 +0000 Received: from VI1PR0801MB2127.eurprd08.prod.outlook.com ([fe80::7c75:98da:fbc1:da02]) by VI1PR0801MB2127.eurprd08.prod.outlook.com ([fe80::7c75:98da:fbc1:da02%11]) with mapi id 15.20.2241.018; Fri, 6 Sep 2019 
18:41:00 +0000 From: Wilco Dijkstra To: "libc-stable@sourceware.org" CC: nd Subject: [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements Thread-Topic: [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements Thread-Index: AQHVZOJvV2AWTqR9DE+uZLP4oYu9Nw== Date: Tue, 01 Jan 2019 00:00:00 -0000 Message-ID: Accept-Language: en-GB, en-US Content-Language: en-GB X-MS-Has-Attach: X-MS-TNEF-Correlator: Authentication-Results-Original: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; x-originating-ip: [217.140.106.54] x-ms-publictraffictype: Email X-MS-Office365-Filtering-Correlation-Id: 02194f21-5985-49cc-9293-08d732f9c949 X-MS-Office365-Filtering-HT: Tenant X-Microsoft-Antispam-Untrusted: BCL:0;PCL:0;RULEID:(2390118)(7020095)(4652040)(8989299)(4534185)(4627221)(201703031133081)(201702281549075)(8990200)(5600166)(711020)(4605104)(1401327)(4618075)(2017052603328)(7193020);SRVR:VI1PR0801MB1872; X-MS-TrafficTypeDiagnostic: VI1PR0801MB1872:|VI1PR08MB3102: X-Microsoft-Antispam-PRVS: x-checkrecipientrouted: true x-ms-oob-tlc-oobclassifiers: OLM:1265;OLM:1265; x-forefront-prvs: 0152EBA40F X-Forefront-Antispam-Report-Untrusted: SFV:NSPM;SFS:(10009020)(4636009)(39860400002)(136003)(346002)(396003)(366004)(376002)(189003)(54534003)(199004)(7736002)(33656002)(256004)(8676002)(86362001)(71200400001)(71190400001)(76116006)(66946007)(66476007)(66556008)(66446008)(81166006)(81156014)(52536014)(486006)(5660300002)(8936002)(186003)(53936002)(4326008)(476003)(66066001)(6436002)(2906002)(64756008)(6506007)(25786009)(5640700003)(2501003)(316002)(102836004)(2351001)(6916009)(26005)(55016002)(99286004)(9686003)(6116002)(305945005)(478600001)(3846002)(74316002)(7696005)(14454004);DIR:OUT;SFP:1101;SCL:1;SRVR:VI1PR0801MB1872;H:VI1PR0801MB2127.eurprd08.prod.outlook.com;FPR:;SPF:None;LANG:en;PTR:InfoNoRecords;MX:1;A:1; received-spf: None (protection.outlook.com: arm.com does not designate permitted sender hosts) X-MS-Exchange-SenderADCheck: 1 
X-Microsoft-Antispam-Message-Info-Original: yFdmDIr6jUx5OSHoOR0yXKzqQXV6TKLVT5uFPFVPwmMcC1oi29x9stjZNENYZmZU9mf3U0qZD2CUQxxzDrJZnF+X5F13rie/7p5Cu0alQ86bU9J6cQRfDI6+XNhXcdN9DPWVDYtKHU+tPqt/gMWLIvPlau4z0uAjeZ4LaJVnAm1aH5d5H2z+2BGiRxz0BxLIhN4qPLub9mzHXfa1PdsQ+oPjBGVkiKtxghzp+In+2+a7Z3RJnJ1uAhN+F2MqLzqn78jGKtG3LwtnYytVnWBojQBjV65FdATUXIaOmkyFWTIFflU8M6kmrXWbaK8NmRkrn/dHfhzWc0UsiaLCf+KPOuPsLOqVQWp3vLQ7alLz3LzJsnP5XNcSbjgJu43Ls3gNLahoYuFoojuyi0Hr51Bjk96YxuaNkG5swNVuHYfFLAE= x-ms-exchange-transport-forked: True Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR0801MB1872 Original-Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: DB5EUR03FT010.eop-EUR03.prod.protection.outlook.com X-Forefront-Antispam-Report: CIP:63.35.35.123;IPV:CAL;SCL:-1;CTRY:IE;EFV:NLI;SFV:NSPM;SFS:(10009020)(4636009)(39860400002)(136003)(346002)(376002)(396003)(2980300002)(54534003)(189003)(199004)(476003)(356004)(126002)(486006)(316002)(5660300002)(186003)(81156014)(4326008)(8936002)(8746002)(70206006)(8676002)(81166006)(50466002)(6916009)(33656002)(336012)(7696005)(22756006)(63350400001)(63370400001)(86362001)(23756003)(9686003)(2501003)(99286004)(76130400001)(102836004)(55016002)(52536014)(26826003)(6506007)(2351001)(26005)(74316002)(2906002)(47776003)(14454004)(70586007)(7736002)(305945005)(3846002)(6116002)(5640700003)(478600001)(66066001)(25786009);DIR:OUT;SFP:1101;SCL:1;SRVR:VI1PR08MB3102;H:64aa7808-outbound-1.mta.getcheckrecipient.com;FPR:;SPF:TempError;LANG:en;PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com;MX:1;A:1; X-MS-Office365-Filtering-Correlation-Id-Prvs: aa3278ee-d3fe-495d-36ee-08d732f9c3aa X-Microsoft-Antispam: 
BCL:0;PCL:0;RULEID:(2390118)(7020095)(4652040)(8989299)(4534185)(4627221)(201703031133081)(201702281549075)(8990200)(5600166)(710020)(711020)(4605104)(1401327)(4618075)(2017052603328)(7193020);SRVR:VI1PR08MB3102; NoDisclaimer: True X-Forefront-PRVS: 0152EBA40F X-Microsoft-Antispam-Message-Info: b0eCFwj35YgI9lK8O9oN9VuVsRLTeGeQ/17QyMnHM/fa7hFcUxVnQvoT2xx1da/85ZdP9gXTBwZoxqelkJ75ZZal33o8voTWhZ6gpCgn54ICrklExm/mcQ/e4wnGlrWlOee/Jm7k13s5lmj6HlVtZHmUZIDpDkbluMZHrw/OKJHll9ksBOOfrWmhHQhuP5V0vF3OylBQX8LtT8WX7lFk6KCP/U55818fCNcyWeh9Bf5WUWsnSCgVnKuZDR6LeJpr/bKbfVlfi29lQ41KA3Zp3uy1C7NIMOSKR7TxP/dTMmWmz4YTwrx5iVmiwEtOqKU/16WWMaRvrPh8MrBqrcJDCFewWTnLSy+UcFpQzuEDSzfkuOzTZCTKSMX4NGXZjDoHULRPKd4Tixsm1rxqYNwslsZnzoa9zCICpTH9G2xo6u0= X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 06 Sep 2019 18:41:10.0541 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: 02194f21-5985-49cc-9293-08d732f9c949 X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d;Ip=[63.35.35.123];Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR08MB3102 X-SW-Source: 2019-09/txt/msg00010.txt.bz2 commit e6b7252040755cc965e71622084b9b5ee05345ff Author: Siddhesh Poyarekar Date: Fri Jun 29 22:45:59 2018 +0530 aarch64,falkor: Use vector registers for memcpy =20=20=20=20 Vector registers perform better than scalar register pairs for copying data so prefer them instead. This results in a time reduction of over 50% (i.e. 2x speed improvement) for some smaller sizes for memcpy-walk. Larger sizes show improvements of around 1% to 2%. memcpy-random shows a very small improvement, in the range of 1-2%. =20=20=20=20 * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use vector registers. 
=20=20=20=20 (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be) commit c74b884f705aa54998c4b94ac8b098b3ac40e465 Author: Siddhesh Poyarekar Date: Fri May 11 00:11:52 2018 +0530 aarch64,falkor: Ignore prefetcher tagging for smaller copies =20=20=20=20 For smaller and medium-sized copies, the effects of hardware prefetching are not as dominant as instruction-level parallelism. Hence it makes more sense to load data into multiple registers than to try and route them to the same prefetch unit. This is also the case for the loop exit where we are unable to latch on to the same prefetch unit anyway so it makes more sense to have data loaded in parallel. =20=20=20=20 The performance results are a bit mixed with memcpy-random, with numbers jumping between -1% and +3%, i.e. the numbers don't seem repeatable. memcpy-walk sees a 70% improvement (i.e. > 2x) for 128 bytes and that improvement reduces down as the impact of the tail copy decreases in comparison to the loop. =20=20=20=20 * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use multiple registers to copy data in loop tail. =20=20=20=20 (cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06) diff --git a/ChangeLog b/ChangeLog index 99b6180..dd2106c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,15 @@ 2019-09-06 Siddhesh Poyarekar =20 + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use vector registers. + +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use multiple registers to copy data in loop tail. + +2019-09-06 Siddhesh Poyarekar + * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of mov + lsr. 
=20 diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/mu= ltiarch/memcpy_falkor.S index 8dd8c1e..cdc2de4 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -29,11 +29,19 @@ #define dst x3 #define srcend x4 #define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 #define tmp1 x14 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 =20 /* Copies are split into 3 main cases: =20 @@ -53,9 +61,9 @@ bumping up the small copies up to 32 bytes allows us to do that without cost and also allows us to reduce the size of the prep code before loop= 64. =20 - All copies are done only via two registers r6 and r7. This is to ensure - that all loads hit a single hardware prefetcher which can get correctly - trained to prefetch a single stream. + The copy loop uses only one register q0. This is to ensure that all lo= ads + hit a single hardware prefetcher which can get correctly trained to pre= fetch + a single stream. =20 The non-temporal stores help optimize cache utilization. */ =20 @@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6) add srcend, src, count add dstend, dstin, count b.ls L(copy32) - ldp A_l, A_h, [src] + ldr A_q, [src] cmp count, 128 - stp A_l, A_h, [dstin] + str A_q, [dstin] b.hi L(copy_long) =20 /* Medium copies: 33..128 bytes. 
*/ sub tmp1, count, 1 - ldp A_l, A_h, [src, 16] - stp A_l, A_h, [dstin, 16] + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] tbz tmp1, 6, 1f - ldp A_l, A_h, [src, 32] - stp A_l, A_h, [dstin, 32] - ldp A_l, A_h, [src, 48] - stp A_l, A_h, [dstin, 48] - ldp A_l, A_h, [srcend, -64] - stp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stp A_l, A_h, [dstend, -48] + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] 1: - ldp A_l, A_h, [srcend, -32] - stp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] ret =20 .p2align 4 @@ -97,44 +105,44 @@ L(copy32): /* 16-32 */ cmp count, 16 b.lo 1f - ldp A_l, A_h, [src] - stp A_l, A_h, [dstin] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f - ldr A_l, [src] - str A_l, [dstin] - ldr A_l, [srcend, -8] - str A_l, [dstend, -8] + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f - ldr A_lw, [src] - str A_lw, [dstin] - ldr A_lw, [srcend, -4] - str A_lw, [dstend, -4] + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f - ldrh A_lw, [src] - strh A_lw, [dstin] - ldrh A_lw, [srcend, -2] - strh A_lw, [dstend, -2] + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] ret .p2align 4 1: /* 0-1 */ tbz count, 0, 1f - ldrb A_lw, [src] - strb A_lw, [dstin] + ldrb A_w, [src] + strb A_w, [dstin] 1: ret =20 @@ -153,30 +161,29 @@ L(copy_long): add count, count, tmp1 =20 L(loop64): - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16]! 
+ ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! subs count, count, 64 - stnp A_l, A_h, [dst, 32] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 48] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 64] - add dst, dst, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! b.hi L(loop64) =20 /* Write the last full set of 64 bytes. The remainder is at most 64 bytes, so it is safe to always copy 64 bytes from the end even if there is just 1 byte left. */ L(last64): - ldp A_l, A_h, [srcend, -64] - stnp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stnp A_l, A_h, [dstend, -48] - ldp A_l, A_h, [srcend, -32] - stnp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stnp A_l, A_h, [dstend, -16] + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] ret =20 END (__memcpy_falkor)