From: Ian Lance Taylor <iant@golang.org>
To: Andreas Schwab <schwab@suse.de>
Cc: gcc-patches <gcc-patches@gcc.gnu.org>, gofrontend-dev@googlegroups.com
Subject: Re: libgo patch committed: Update to 1.11 release
Date: Mon, 01 Oct 2018 20:18:00 -0000 [thread overview]
Message-ID: <CAOyqgcUoxyNenCK5qrz09yf1N12nBP0-9ZJ8T9XLyFpph0C3Gg@mail.gmail.com> (raw)
In-Reply-To: <mvmsh1w8r5u.fsf@suse.de>
[-- Attachment #1: Type: text/plain, Size: 432 bytes --]
On Wed, Sep 26, 2018 at 3:54 AM, Andreas Schwab <schwab@suse.de> wrote:
> All execution tests are now failing with "fatal error: impossible call
> to aeshashbody".
Thanks. Fixed by this patch, which adds AES hash code for arm64 using
intrinsics. Bootstrapped and tested on x86_64-pc-linux-gnu and
aarch64-unknown-linux-gnu. Some other aarch64 tests failed; I'm not
sure if they failed before or not. Committed to mainline.
Ian
[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 12937 bytes --]
Index: gcc/go/gofrontend/MERGE
===================================================================
--- gcc/go/gofrontend/MERGE (revision 264690)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-f4a224ec481957ca4f14d0e8cc4fe59cc95b3a49
+013a9e68c9a31f888733d46182d19f9e5d956f27
The first line of this file holds the git revision number of the last
merge done from the gofrontend repository.
Index: libgo/runtime/aeshash.c
===================================================================
--- libgo/runtime/aeshash.c (revision 264648)
+++ libgo/runtime/aeshash.c (working copy)
@@ -573,13 +573,412 @@ uintptr aeshashbody(void* p, uintptr see
#endif // !defined(__x86_64__)
-#else // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)
+#elif defined(__aarch64__)
+
+// Undefine some identifiers that we pick up from the Go runtime package that
+// are used in arm_neon.h.
+
+#undef t1
+#undef tx
+#undef t2
+#undef t3
+#undef t4
+#undef t5
+
+#include <arm_neon.h>
+
+// Force appropriate CPU level. We won't call here unless the CPU
+// supports it.
+
+#pragma GCC target("+crypto")
+
+// The arm64 version of aeshashbody: hashes the size bytes at p, mixed with seed, size and the 16-byte blocks of aeskeysched, and returns 64-bit lane 0 of the final vector.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+ uint8x16_t *pseed; // cursor over the 16-byte blocks of aeskeysched
+ uint32x4_t vinit32;
+ uint8x16_t vinit; // seed and size packed into one vector
+ uint8x16_t vseed, vseed2, vseed3, vseed4; // per-lane seeds derived from aeskeysched and vinit
+ uint8x16_t vseed5, vseed6, vseed7, vseed8;
+ uint8x16_t vval, vval2, vval3, vval4; // hash state, up to eight 16-byte lanes
+ uint8x16_t vval5, vval6, vval7, vval8;
+ uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4; // round inputs for the >128-byte path
+ uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
+ uint8x16x2_t avval2; // scratch for two-vector loads
+ uint8x16x3_t avseed3; // scratch for three-vector loads
+
+ pseed = (uint8x16_t*)(aeskeysched.__values);
+
+ // Combined hash seed and length: 32-bit lane 0 = seed, lane 1 = size, lanes 2 and 3 = 0.
+ vinit32 = vdupq_n_u32(0);
+ vinit32[0] = (uint32)seed; // NOTE(review): the cast keeps only the low 32 bits of seed -- confirm intended
+ vinit32[1] = (uint32)size; // NOTE(review): the cast keeps only the low 32 bits of size -- confirm intended
+ vinit = vreinterpretq_u8_u32(vinit32);
+
+ // Mix in per-process seed: one AESE round over the first aeskeysched block and vinit.
+ vseed = vaeseq_u8(*pseed, vinit);
+ ++pseed; // the remaining seeds are read starting at the second block
+ // Scramble seed (AESMC, the AES MixColumns step).
+ vseed = vaesmcq_u8(vseed);
+
+ if (size <= 16) { // 0..16 bytes: a single lane
+ if (size == 0) {
+ // Return 64 bits of scrambled input seed.
+ return vreinterpretq_u64_u8(vseed)[0];
+ } else if (size < 16) { // 1..15 bytes: build a zero-padded block piece by piece
+ vval = vreinterpretq_u8_u32(vdupq_n_u32(0));
+ if ((size & 8) != 0) { // 8 bytes go into 64-bit lane 0
+ vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
+ p = (void*)((uint64_t*)(p) + 1);
+ }
+ if ((size & 4) != 0) { // 4 bytes go into 32-bit lane 2
+ vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
+ p = (void*)((uint32_t*)(p) + 1);
+ }
+ if ((size & 2) != 0) { // 2 bytes go into 16-bit lane 6
+ vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
+ p = (void*)((uint16_t*)(p) + 1);
+ }
+ if ((size & 1) != 0) { // 1 byte goes into 8-bit lane 14
+ vval = vld1q_lane_u8((uint8*)(p), vval, 14);
+ }
+ } else { // exactly 16 bytes: load the whole block
+ vval = *(uint8x16_t*)(p);
+ }
+ vval = vaeseq_u8(vval, vseed); // three AESE rounds with vseed; no AESMC after the last one
+ vval = vaesmcq_u8(vval);
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval = vaeseq_u8(vval, vseed);
+ return vreinterpretq_u64_u8(vval)[0];
+ } else if (size <= 32) { // 17..32 bytes: two lanes
+ // Make a second seed.
+ vseed2 = vaeseq_u8(*pseed, vinit);
+ vseed2 = vaesmcq_u8(vseed2);
+ vval = *(uint8x16_t*)(p); // first 16 bytes
+ vval2 = *(uint8x16_t*)((char*)(p) + (size - 16)); // last 16 bytes; overlaps the first block when size < 32
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval2 = vaeseq_u8(vval2, vseed2);
+
+ vval ^= vval2; // fold the two lanes into one
+
+ return vreinterpretq_u64_u8(vval)[0];
+ } else if (size <= 64) { // 33..64 bytes: four lanes
+ avseed3 = vld1q_u8_x3((uint8*)(pseed)); // three more 16-byte blocks of aeskeysched
+ vseed2 = avseed3.val[0];
+ vseed3 = avseed3.val[1];
+ vseed4 = avseed3.val[2];
+
+ vseed2 = vaeseq_u8(vseed2, vinit);
+ vseed2 = vaesmcq_u8(vseed2);
+ vseed3 = vaeseq_u8(vseed3, vinit);
+ vseed3 = vaesmcq_u8(vseed3);
+ vseed4 = vaeseq_u8(vseed4, vinit);
+ vseed4 = vaesmcq_u8(vseed4);
+
+ avval2 = vld1q_u8_x2((uint8*)(p)); // first 32 bytes
+ vval = avval2.val[0];
+ vval2 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32)); // last 32 bytes; overlaps the first 32 when size < 64
+ vval3 = avval2.val[0];
+ vval4 = avval2.val[1];
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+ vval4 = vaesmcq_u8(vval4);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+ vval4 = vaesmcq_u8(vval4);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+
+ vval ^= vval3; // fold the four lanes into one
+ vval2 ^= vval4;
+ vval ^= vval2;
+
+ return vreinterpretq_u64_u8(vval)[0];
+ } else if (size <= 128) { // 65..128 bytes: eight lanes
+ // For some reason vld1q_u8_x4 is missing.
+ avseed3 = vld1q_u8_x3((uint8*)(pseed)); // seven more blocks of aeskeysched: 3 + 3 + 1
+ vseed2 = avseed3.val[0];
+ vseed3 = avseed3.val[1];
+ vseed4 = avseed3.val[2];
+ avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+ vseed5 = avseed3.val[0];
+ vseed6 = avseed3.val[1];
+ vseed7 = avseed3.val[2];
+ vseed8 = *(pseed + 6);
+
+ vseed2 = vaeseq_u8(vseed2, vinit);
+ vseed2 = vaesmcq_u8(vseed2);
+ vseed3 = vaeseq_u8(vseed3, vinit);
+ vseed3 = vaesmcq_u8(vseed3);
+ vseed4 = vaeseq_u8(vseed4, vinit);
+ vseed4 = vaesmcq_u8(vseed4);
+ vseed5 = vaeseq_u8(vseed5, vinit);
+ vseed5 = vaesmcq_u8(vseed5);
+ vseed6 = vaeseq_u8(vseed6, vinit);
+ vseed6 = vaesmcq_u8(vseed6);
+ vseed7 = vaeseq_u8(vseed7, vinit);
+ vseed7 = vaesmcq_u8(vseed7);
+ vseed8 = vaeseq_u8(vseed8, vinit);
+ vseed8 = vaesmcq_u8(vseed8);
+
+ avval2 = vld1q_u8_x2((uint8*)(p)); // first 64 bytes, in two loads
+ vval = avval2.val[0];
+ vval2 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+ vval3 = avval2.val[0];
+ vval4 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64)); // last 64 bytes, in two loads; overlaps the first 64 when size < 128
+ vval5 = avval2.val[0];
+ vval6 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+ vval7 = avval2.val[0];
+ vval8 = avval2.val[1];
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vseed5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vseed6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vseed7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vseed8);
+ vval8 = vaesmcq_u8(vval8);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vseed5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vseed6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vseed7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vseed8);
+ vval8 = vaesmcq_u8(vval8);
+
+ vval = vaeseq_u8(vval, vseed);
+ vval2 = vaeseq_u8(vval2, vseed2);
+ vval3 = vaeseq_u8(vval3, vseed3);
+ vval4 = vaeseq_u8(vval4, vseed4);
+ vval5 = vaeseq_u8(vval5, vseed5);
+ vval6 = vaeseq_u8(vval6, vseed6);
+ vval7 = vaeseq_u8(vval7, vseed7);
+ vval8 = vaeseq_u8(vval8, vseed8);
+
+ vval ^= vval5; // fold the eight lanes into one
+ vval2 ^= vval6;
+ vval3 ^= vval7;
+ vval4 ^= vval8;
+ vval ^= vval3;
+ vval2 ^= vval4;
+ vval ^= vval2;
+
+ return vreinterpretq_u64_u8(vval)[0];
+ } else { // more than 128 bytes: eight lanes fed in 128-byte steps
+ // For some reason vld1q_u8_x4 is missing.
+ avseed3 = vld1q_u8_x3((uint8*)(pseed)); // seven more blocks of aeskeysched: 3 + 3 + 1
+ vseed2 = avseed3.val[0];
+ vseed3 = avseed3.val[1];
+ vseed4 = avseed3.val[2];
+ avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+ vseed5 = avseed3.val[0];
+ vseed6 = avseed3.val[1];
+ vseed7 = avseed3.val[2];
+ vseed8 = *(pseed + 6);
+
+ vseed2 = vaeseq_u8(vseed2, vinit);
+ vseed2 = vaesmcq_u8(vseed2);
+ vseed3 = vaeseq_u8(vseed3, vinit);
+ vseed3 = vaesmcq_u8(vseed3);
+ vseed4 = vaeseq_u8(vseed4, vinit);
+ vseed4 = vaesmcq_u8(vseed4);
+ vseed5 = vaeseq_u8(vseed5, vinit);
+ vseed5 = vaesmcq_u8(vseed5);
+ vseed6 = vaeseq_u8(vseed6, vinit);
+ vseed6 = vaesmcq_u8(vseed6);
+ vseed7 = vaeseq_u8(vseed7, vinit);
+ vseed7 = vaesmcq_u8(vseed7);
+ vseed8 = vaeseq_u8(vseed8, vinit);
+ vseed8 = vaesmcq_u8(vseed8);
+
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128)); // initial state: the last 128 bytes of the input
+ vval = avval2.val[0];
+ vval2 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
+ vval3 = avval2.val[0];
+ vval4 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
+ vval5 = avval2.val[0];
+ vval6 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+ vval7 = avval2.val[0];
+ vval8 = avval2.val[1];
+
+ vvalLoop = vseed; // the first round inside the loop uses the seeds
+ vvalLoop2 = vseed2;
+ vvalLoop3 = vseed3;
+ vvalLoop4 = vseed4;
+ vvalLoop5 = vseed5;
+ vvalLoop6 = vseed6;
+ vvalLoop7 = vseed7;
+ vvalLoop8 = vseed8;
+
+ size--;
+ size >>= 7; // size is now the iteration count: (original size - 1) / 128, at least 1 here
+ do {
+ vval = vaeseq_u8(vval, vvalLoop); // round with the previous block (the seeds on the first pass)
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vvalLoop2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vvalLoop3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vvalLoop4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vvalLoop5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vvalLoop6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vvalLoop7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vvalLoop8);
+ vval8 = vaesmcq_u8(vval8);
+
+ avval2 = vld1q_u8_x2((uint8*)(p)); // load the next 128 bytes from the front of the input
+ vvalLoop = avval2.val[0];
+ vvalLoop2 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+ vvalLoop3 = avval2.val[0];
+ vvalLoop4 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + 64);
+ vvalLoop5 = avval2.val[0];
+ vvalLoop6 = avval2.val[1];
+ avval2 = vld1q_u8_x2((uint8*)(p) + 96);
+ vvalLoop7 = avval2.val[0];
+ vvalLoop8 = avval2.val[1];
+
+ p = (void *)((uint8*)(p) + 128); // advance past the block just loaded
+
+ vval = vaeseq_u8(vval, vvalLoop); // second round, with the block just loaded
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vvalLoop2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vvalLoop3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vvalLoop4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vvalLoop5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vvalLoop6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vvalLoop7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vvalLoop8);
+ vval8 = vaesmcq_u8(vval8);
+ } while (--size > 0);
+
+ vval = vaeseq_u8(vval, vvalLoop); // three final rounds with the last block loaded; no AESMC after the last one
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vvalLoop2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vvalLoop3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vvalLoop4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vvalLoop5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vvalLoop6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vvalLoop7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vvalLoop8);
+ vval8 = vaesmcq_u8(vval8);
+
+
+ vval = vaeseq_u8(vval, vvalLoop);
+ vval = vaesmcq_u8(vval);
+ vval2 = vaeseq_u8(vval2, vvalLoop2);
+ vval2 = vaesmcq_u8(vval2);
+ vval3 = vaeseq_u8(vval3, vvalLoop3);
+ vval3 = vaesmcq_u8(vval3);
+ vval4 = vaeseq_u8(vval4, vvalLoop4);
+ vval4 = vaesmcq_u8(vval4);
+ vval5 = vaeseq_u8(vval5, vvalLoop5);
+ vval5 = vaesmcq_u8(vval5);
+ vval6 = vaeseq_u8(vval6, vvalLoop6);
+ vval6 = vaesmcq_u8(vval6);
+ vval7 = vaeseq_u8(vval7, vvalLoop7);
+ vval7 = vaesmcq_u8(vval7);
+ vval8 = vaeseq_u8(vval8, vvalLoop8);
+ vval8 = vaesmcq_u8(vval8);
+
+ vval = vaeseq_u8(vval, vvalLoop);
+ vval2 = vaeseq_u8(vval2, vvalLoop2);
+ vval3 = vaeseq_u8(vval3, vvalLoop3);
+ vval4 = vaeseq_u8(vval4, vvalLoop4);
+ vval5 = vaeseq_u8(vval5, vvalLoop5);
+ vval6 = vaeseq_u8(vval6, vvalLoop6);
+ vval7 = vaeseq_u8(vval7, vvalLoop7);
+ vval8 = vaeseq_u8(vval8, vvalLoop8);
+
+ vval ^= vval5; // fold the eight lanes into one
+ vval2 ^= vval6;
+ vval3 ^= vval7;
+ vval4 ^= vval8;
+ vval ^= vval3;
+ vval2 ^= vval4;
+ vval ^= vval2;
+
+ return vreinterpretq_u64_u8(vval)[0];
+ }
+}
+
+#else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)
uintptr aeshashbody(void* p __attribute__((unused)),
uintptr seed __attribute__((unused)),
uintptr size __attribute__((unused)),
Slice aeskeysched __attribute__((unused))) {
- // We should never get here on a non-x86 system.
+ // We should never get here on a non-x86, non-arm64 system.
runtime_throw("impossible call to aeshashbody");
}
next prev parent reply other threads:[~2018-10-01 20:16 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-09-24 21:49 Ian Lance Taylor
2018-09-25 7:06 ` Andreas Schwab
2018-09-25 7:57 ` Andreas Schwab
2018-09-25 9:05 ` Andreas Schwab
2018-09-25 14:39 ` Ian Lance Taylor
2018-09-25 13:10 ` Rainer Orth
2018-09-25 13:34 ` Rainer Orth
2018-09-26 5:04 ` Ian Lance Taylor
2018-09-26 8:55 ` Rainer Orth
2018-09-26 7:57 ` Andreas Schwab
2018-09-26 8:59 ` Rainer Orth
2018-09-26 12:54 ` Ian Lance Taylor
2018-09-26 11:12 ` Andreas Schwab
2018-09-26 12:49 ` Ian Lance Taylor
2018-10-01 20:18 ` Ian Lance Taylor [this message]
2018-09-26 14:52 ` H.J. Lu
2018-09-26 15:16 ` H.J. Lu
2018-10-01 20:28 ` Ian Lance Taylor
2018-10-01 20:59 ` H.J. Lu
2018-10-02 0:06 ` H.J. Lu
2018-10-02 0:59 ` Ian Lance Taylor
2018-10-02 2:37 ` H.J. Lu
2018-10-02 4:09 ` Ian Lance Taylor
2018-10-03 14:09 ` H.J. Lu
2018-10-03 22:04 ` Ian Lance Taylor
2018-09-28 14:28 ` Rainer Orth
2018-10-02 17:13 ` Ian Lance Taylor
2018-10-04 12:05 ` Rainer Orth
2018-09-26 7:58 Uros Bizjak
2018-09-26 15:22 ` Ian Lance Taylor
2018-09-27 20:26 ` Uros Bizjak
2018-09-28 7:13 ` Uros Bizjak
2018-10-02 15:13 ` Ian Lance Taylor
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAOyqgcUoxyNenCK5qrz09yf1N12nBP0-9ZJ8T9XLyFpph0C3Gg@mail.gmail.com \
--to=iant@golang.org \
--cc=gcc-patches@gcc.gnu.org \
--cc=gofrontend-dev@googlegroups.com \
--cc=schwab@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).