From: Wilco Dijkstra
To: "naohirot@fujitsu.com"
CC: 'GNU C Library'
Subject: [PATCH v2] AArch64: Improve A64FX memcpy
Date: Thu, 14 Oct 2021 15:53:54 +0000

Hi Naohiro,

This is v2 of the A64FX memcpy - in the end I decided on a complete rewrite.
Performance is improved by streamlining the code, aligning to vector size in
large copies and using a single unrolled loop for all sizes. The code size for
memcpy and memmove goes down from 1796 bytes to 868 bytes (only 70% larger
than memcpy_advsimd.S).
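
Roughly, the new large-copy path looks like this in C (an illustrative sketch
only: VL is fixed at 64 here, and the memcpy calls stand in for the predicated
SVE ld1b/st1b sequences, so none of these names come from the patch itself):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define VL 64   /* Placeholder; the real code reads it via cntb.  */

    /* Sketch of L(copy_large): entered only when n > 8 * VL.  */
    static void
    copy_large (char *dst, const char *src, size_t n)
    {
      /* One partial vector copy aligns dst to the vector length
         (a full vector if dst is already aligned).  */
      size_t head = VL - ((uintptr_t) dst % VL);
      memcpy (dst, src, head);
      dst += head; src += head; n -= head;

      /* A single 8x unrolled loop then covers all large sizes.  */
      while (n > 8 * VL)
        {
          memcpy (dst, src, 8 * VL);
          dst += 8 * VL; src += 8 * VL; n -= 8 * VL;
        }

      /* Last 0-8 vectors, done with predicated loads/stores.  */
      memcpy (dst, src, n);
    }

The alignment is done on the destination because, as the comment in the patch
puts it, that gives higher and consistent write performance.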

Performance is better in all cases: bench-memcpy-random is 2.3% faster overall,
bench-memcpy-large is 33% faster for large sizes, and bench-memcpy-walk is 25%
faster for small sizes and 20% faster for the largest sizes. The geomean of all
tests in bench-memcpy is 5.1% better, and total time is reduced by 4%.
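
The memmove dispatch reduces to the following C logic (again a hedged sketch:
the 56-bit mask mirrors the tag-clearing "ands tmp, tmp, 0xffffffffffffff" in
the patch, while the bounce buffer and byte loops merely stand in for the
overlap-safe copy_small path, the memcpy path and the backward loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define VL 64   /* Placeholder; the real code reads it via cntb.  */

    void *
    memmove_sketch (void *dstv, const void *srcv, size_t n)
    {
      char *dst = dstv;
      const char *src = srcv;
      /* Pointer difference with MTE tag bits cleared.  */
      uint64_t diff = ((uintptr_t) dst - (uintptr_t) src) & 0xffffffffffffff;

      if (diff == 0)
        return dst;                     /* Full overlap: nothing to do.  */

      if (n <= 8 * VL)
        {
          /* copy_small loads all 2-8 vectors before storing any, so it
             is overlap-safe in both directions; emulate with a buffer.  */
          char tmp[8 * VL];
          memcpy (tmp, src, n);
          memcpy (dst, tmp, n);
        }
      else if (diff >= n)
        for (size_t i = 0; i < n; i++)  /* No harmful overlap: memcpy path.  */
          dst[i] = src[i];
      else
        for (size_t i = n; i-- > 0;)    /* dst > src, overlapping: backward.  */
          dst[i] = src[i];
      return dst;
    }

The small-move path can be shared between memcpy and memmove precisely because
it never stores before all of its loads are done.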

Passes GLIBC regress, OK for commit?

Cheers,
Wilco

---

diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index 65528405bb12373731e895c7030ccef23b88c17f..99abca0a891aa3e79494c4f4a268992472b91c75 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -25,20 +25,15 @@
  *
  */
 
-#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
-#define CACHE_LINE_SIZE	256
-#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
-#define dest		x0
-#define src		x1
-#define n		x2	// size
-#define tmp1		x3
-#define tmp2		x4
-#define tmp3		x5
-#define rest		x6
-#define dest_ptr	x7
-#define src_ptr		x8
-#define vector_length	x9
-#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+#define dstin	x0
+#define src	x1
+#define n	x2
+#define dst	x3
+#define dstend	x4
+#define srcend	x5
+#define tmp	x6
+#define vlen	x7
+#define vlen8	x8
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -47,45 +42,37 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro ld1b_unroll8
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll4a
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
-	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	st1b	z2.b, p0, [dst, 2, mul vl]
+	st1b	z3.b, p0, [dst, 3, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
 	.endm
 
 	.macro stld1b_unroll4b
-	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
-	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	st1b	z4.b, p0, [dst, 4, mul vl]
+	st1b	z5.b, p0, [dst, 5, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	st1b	z6.b, p0, [dst, 6, mul vl]
+	st1b	z7.b, p0, [dst, 7, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll8
@@ -94,87 +81,18 @@
 	.endm
 
 	.macro st1b_unroll8
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dst, 2, mul vl]
+	st1b	z3.b, p0, [dst, 3, mul vl]
+	st1b	z4.b, p0, [dst, 4, mul vl]
+	st1b	z5.b, p0, [dst, 5, mul vl]
+	st1b	z6.b, p0, [dst, 6, mul vl]
+	st1b	z7.b, p0, [dst, 7, mul vl]
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
-	whilelo	p0.b, xzr, n
-	whilelo	p1.b, vector_length, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	ret
-1:	// if rest > vector_length * 8
-	cmp	n, vector_length, lsl 3	// vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, n
-	incb	tmp1
-	whilelo	p3.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, n
-	incb	tmp1
-	whilelo	p5.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, n
-	incb	tmp1
-	whilelo	p7.b, tmp1, n
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	ld1b	z6.b, p6/z, [src, #6, mul vl]
-	ld1b	z7.b, p7/z, [src, #7, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	st1b	z6.b, p6, [dest, #6, mul vl]
-	st1b	z7.b, p7, [dest, #7, mul vl]
-	ret
-	.endm
+#undef BTI_C
+#define BTI_C
 
 ENTRY (MEMCPY)
 
@@ -182,223 +100,209 @@ ENTRY (MEMCPY)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-L(memcpy):
-	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
-
-L(vl_agnostic): // VL Agnostic
-	mov	rest, n
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
-	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
-
-L(unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(last)
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	L(copy_small)
+	whilelo	p1.b, vlen, n
+	whilelo	p0.b, xzr, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	.p2align 4
+
+L(copy_small):
+	cmp	n, vlen, lsl 3
+	b.hi	L(copy_large)
+	add	dstend, dstin, n
+	add	srcend, src, n
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	/* Copy 2-4 vectors.  */
+	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* Copy 4-8 vectors.  */
+1:	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstin, 2, mul vl]
+	st1b	z3.b, p0, [dstin, 3, mul vl]
+	st1b	z4.b, p0, [dstend, -4, mul vl]
+	st1b	z5.b, p0, [dstend, -3, mul vl]
+	st1b	z6.b, p0, [dstend, -2, mul vl]
+	st1b	z7.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* At least 8 vectors - always align to vector length for
+	   higher and consistent write performance.  */
+L(copy_large):
+	sub	tmp, vlen, 1
+	and	tmp, dstin, tmp
+	sub	tmp, vlen, tmp
+	whilelo	p1.b, xzr, tmp
+	ld1b	z1.b, p1/z, [src]
+	st1b	z1.b, p1, [dstin]
+	add	dst, dstin, tmp
+	add	src, src, tmp
+	sub	n, n, tmp
+	ptrue	p0.b
+
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
 	ld1b_unroll8
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined loop.  */
 1:	stld1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
+	add	dst, dst, vlen8
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.hi	1b
 2:	st1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-
-	.p2align 3
-L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	add	dst, dst, vlen8
+3:	add	n, n, vlen8
+
+	/* Move last 0-8 vectors.  */
+L(last_bytes):
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p1, [dst, 1, mul vl]
 	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+
+	.p2align 4
+
+1:	add	srcend, src, n
+	add	dstend, dst, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
 	ret
 
-L(L2):
-	// align dest address at CACHE_LINE_SIZE byte boundary
-	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
-	whilelo	p2.b, vector_length, cl_remainder
-	b.last	1f
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	b	2f
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p3.b, tmp1, cl_remainder
-	incb	tmp1
-	whilelo	p4.b, tmp1, cl_remainder
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
-2:	add	dest_ptr, dest_ptr, cl_remainder
-	add	src_ptr, src_ptr, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	and	tmp1, dest, 0xffffffffffffff
-	and	tmp2, src, 0xffffffffffffff
-	subs	tmp1, tmp1, tmp2	// diff
-	b.ge	1f
-	neg	tmp1, tmp1
-1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
-	cmp	tmp1, tmp3
-	b.lo	L(unroll8)
-	mov	tmp1, dest_ptr
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	// unroll
-	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
-	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	mov	tmp1, ZF_DIST
-	.p2align 3
-1:	stld1b_unroll4a
-	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
-	dc	zva, tmp2
-	stld1b_unroll4b
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	st1b_unroll8
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	b	L(unroll8)
+1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
+	ld1b	z5.b, p0/z, [src, 3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z4.b, p0, [dst, 2, mul vl]
+	st1b	z5.b, p0, [dst, 3, mul vl]
+	st1b	z6.b, p0, [dstend, -4, mul vl]
+	st1b	z7.b, p0, [dstend, -3, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
 
 END (MEMCPY)
 libc_hidden_builtin_def (MEMCPY)
 
 
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)
 
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-	// remove tag address
-	// dest has to be immutable because it is the return value
-	// src has to be immutable because it is used in L(bwd_last)
-	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
-	and	tmp3, src, 0xffffffffffffff	// save src_notag intp tmp3
-	cmp	n, 0
-	ccmp	tmp2, tmp3, 4, ne
-	b.ne	1f
+	/* Fast case for up to 2 vectors.  */
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+L(full_overlap):
 	ret
-1:	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	// tmp2 and tmp3 should not be used in this macro to keep
-	// notag addresses
-	shortcut_for_small_size L(dispatch)
-	// end of shortcut
-
-L(dispatch):
-	// tmp2 = dest_notag, tmp3 = src_notag
-	// diff = dest_notag - src_notag
-	sub	tmp1, tmp2, tmp3
-	// if diff <= 0 || diff >= n then memcpy
-	cmp	tmp1, 0
-	ccmp	tmp1, n, 2, gt
-	b.cs	L(vl_agnostic)
-
-L(bwd_start):
-	mov	rest, n
-	add	dest_ptr, dest, n	// dest_end
-	add	src_ptr, src, n		// src_end
-
-L(bwd_unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(bwd_last)
-	sub	src_ptr, src_ptr, tmp1
+
+	.p2align 4
+	/* Check for overlapping moves. Return if there is a full overlap.
+	   Small moves up to 8 vectors use the overlap-safe copy_small code.
+	   Non-overlapping or overlapping moves with dst < src use memcpy.
+	   Overlapping moves with dst > src use a backward copy loop.  */
+1:	sub	tmp, dstin, src
+	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
+	b.eq	L(full_overlap)
+	cmp	n, vlen, lsl 3
+	b.ls	L(copy_small)
+	cmp	tmp, n
+	b.hs	L(copy_large)
+
+	/* Align to vector length.  */
+	add	dst, dstin, n
+	sub	tmp, vlen, 1
+	ands	tmp, dst, tmp
+	csel	tmp, tmp, vlen, ne
+	whilelo	p1.b, xzr, tmp
+	sub	n, n, tmp
+	ld1b	z1.b, p1/z, [src, n]
+	st1b	z1.b, p1, [dstin, n]
+	add	src, src, n
+	add	dst, dstin, n
+
+	ptrue	p0.b
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
+	sub	src, src, vlen8
 	ld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
-1:	sub	src_ptr, src_ptr, tmp1
-	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined backward copy loop.  */
+1:	sub	src, src, vlen8
+	sub	dst, dst, vlen8
 	stld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
-2:	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.hi	1b
+2:	sub	dst, dst, vlen8
 	st1b_unroll8
+3:	add	n, n, vlen8
 
-L(bwd_last):
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	b	L(last)
+	/* Adjust src/dst for last 0-8 vectors.  */
+	sub	src, src, n
+	mov	dst, dstin
+	b	L(last_bytes)
 
 END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)