From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7814) id 321C639518BA; Mon, 16 Aug 2021 17:14:19 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 321C639518BA Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Fangrui Song To: glibc-cvs@sourceware.org Subject: [glibc/maskray/lld] [3/5] AArch64: Improve A64FX memset for remaining bytes X-Act-Checkin: glibc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/maskray/lld X-Git-Oldrev: 9bc2ed8f46d80859a5596789cc9e8cc2de84b0e7 X-Git-Newrev: 186092c6ba8825598ffdbf15dbf0823c771f560d Message-Id: <20210816171419.321C639518BA@sourceware.org> Date: Mon, 16 Aug 2021 17:14:19 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 16 Aug 2021 17:14:19 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=186092c6ba8825598ffdbf15dbf0823c771f560d commit 186092c6ba8825598ffdbf15dbf0823c771f560d Author: Wilco Dijkstra Date: Tue Aug 10 13:42:07 2021 +0100 [3/5] AArch64: Improve A64FX memset for remaining bytes Simplify handling of remaining bytes. Avoid lots of taken branches and complex whilelo computations, instead unconditionally write vectors from the end. Reviewed-by: Naohiro Tamura Diff: --- sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++++----------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index 75cf43ae79..337c86be6f 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -130,38 +130,19 @@ L(unroll8): b 1b L(last): - whilelo p0.b, xzr, rest - whilelo p1.b, vector_length, rest - b.last 1f - st1b z0.b, p0, [dst, #0, mul vl] - st1b z0.b, p1, [dst, #1, mul vl] - ret -1: lsl tmp1, vector_length, 1 // vector_length * 2 - whilelo p2.b, tmp1, rest - incb tmp1 - whilelo p3.b, tmp1, rest - b.last 1f - st1b z0.b, p0, [dst, #0, mul vl] - st1b z0.b, p1, [dst, #1, mul vl] - st1b z0.b, p2, [dst, #2, mul vl] - st1b z0.b, p3, [dst, #3, mul vl] - ret -1: lsl tmp1, vector_length, 2 // vector_length * 4 - whilelo p4.b, tmp1, rest - incb tmp1 - whilelo p5.b, tmp1, rest - incb tmp1 - whilelo p6.b, tmp1, rest - incb tmp1 - whilelo p7.b, tmp1, rest - st1b z0.b, p0, [dst, #0, mul vl] - st1b z0.b, p1, [dst, #1, mul vl] - st1b z0.b, p2, [dst, #2, mul vl] - st1b z0.b, p3, [dst, #3, mul vl] - st1b z0.b, p4, [dst, #4, mul vl] - st1b z0.b, p5, [dst, #5, mul vl] - st1b z0.b, p6, [dst, #6, mul vl] - st1b z0.b, p7, [dst, #7, mul vl] + cmp count, vector_length, lsl 1 + b.ls 2f + add tmp2, vector_length, vector_length, lsl 2 + cmp count, tmp2 + b.ls 5f + st1b z0.b, p0, [dstend, -8, mul vl] + st1b z0.b, p0, [dstend, -7, mul vl] + st1b z0.b, p0, [dstend, -6, mul vl] +5: st1b z0.b, p0, [dstend, -5, mul vl] + st1b z0.b, p0, [dstend, -4, mul vl] + st1b z0.b, p0, [dstend, -3, mul vl] +2: st1b z0.b, p0, [dstend, -2, mul vl] + st1b z0.b, p0, [dstend, -1, mul vl] ret L(L1_prefetch): // if rest >= L1_SIZE @@ -199,7 +180,6 @@ L(L2): subs count, count, CACHE_LINE_SIZE b.hi 1b add count, count, CACHE_LINE_SIZE - add dst, dst, CACHE_LINE_SIZE b L(last) END (MEMSET)