From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2153) id 8E54E3858C50; Tue, 28 Nov 2023 12:30:24 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 8E54E3858C50 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable From: Jakub Jelinek To: bfd-cvs@sourceware.org, gdb-cvs@sourceware.org Subject: [binutils-gdb] libiberty, ld: Use x86 HW optimized sha1 X-Act-Checkin: binutils-gdb X-Git-Author: Jakub Jelinek X-Git-Refname: refs/heads/master X-Git-Oldrev: e5f1ee1832ff9e970833fa5773f46c3e0b93bc04 X-Git-Newrev: 4a50820ee8f153265ec8ffd068618607d4be3a26 Message-Id: <20231128123024.8E54E3858C50@sourceware.org> Date: Tue, 28 Nov 2023 12:30:24 +0000 (GMT) X-BeenThere: binutils-cvs@sourceware.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Binutils-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 28 Nov 2023 12:30:24 -0000 https://sourceware.org/git/gitweb.cgi?p=3Dbinutils-gdb.git;h=3D4a50820ee8f1= 53265ec8ffd068618607d4be3a26 commit 4a50820ee8f153265ec8ffd068618607d4be3a26 Author: Jakub Jelinek Date: Tue Nov 28 13:29:58 2023 +0100 libiberty, ld: Use x86 HW optimized sha1 =20 The following patch attempts to use x86 SHA ISA if available to speed up in my testing about 2.5x sha1 build-id processing (in my case on AMD Ryzen 5 3600) while producing the same result. I believe AArch64 has similar HW acceleration for SHA1, perhaps it could be added similarly. =20 Note, seems lld uses BLAKE3 rather than md5/sha1. I think it would be a bad idea to lie to users, if they choose --buildid=3Dsha1, we should be using SHA1, not some other checksum, but perhaps we could add some o= ther --buildid=3D styles and perhaps make one of the new the default. 
=20 Tested on x86_64-linux, both on Intel i9-7960X (which doesn't have sha_ni ISA support) without/with the patch and on AMD Ryzen 5 3600 (which does have it) without/with the patch. =20 2023-11-28 Jakub Jelinek =20 include/ * sha1.h (sha1_process_bytes_fn): New typedef. (sha1_choose_process_bytes): Declare. libiberty/ * configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check. * sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86in= trin.h and cpuid.h. (sha1_hw_process_bytes, sha1_hw_process_block, sha1_choose_process_bytes): New functions. * config.in: Regenerated. * configure: Regenerated. ld/ * ldbuildid.c (generate_build_id): Use sha1_choose_process_byte= s () instead of &sha1_process_bytes. Diff: --- include/ChangeLog | 5 + include/sha1.h | 7 ++ ld/ChangeLog | 5 + ld/ldbuildid.c | 3 +- libiberty/ChangeLog | 10 ++ libiberty/config.in | 3 + libiberty/configure | 58 ++++++++++ libiberty/configure.ac | 40 +++++++ libiberty/sha1.c | 305 +++++++++++++++++++++++++++++++++++++++++++++= ++++ 9 files changed, 435 insertions(+), 1 deletion(-) diff --git a/include/ChangeLog b/include/ChangeLog index dbef999e8fb..d9e4dc448c2 100644 --- a/include/ChangeLog +++ b/include/ChangeLog @@ -1,3 +1,8 @@ +2023-11-28 Jakub Jelinek + + * sha1.h (sha1_process_bytes_fn): New typedef. + (sha1_choose_process_bytes): Declare. + 2023-11-10 Simon Marchi =20 * elf/amdgpu.h (EF_AMDGPU_MACH_AMDGCN_GFX1100, diff --git a/include/sha1.h b/include/sha1.h index eb86fbb250a..6c94a4740a5 100644 --- a/include/sha1.h +++ b/include/sha1.h @@ -108,6 +108,13 @@ extern void sha1_process_block (const void *buffer, si= ze_t len, extern void sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx); =20 +typedef void (*sha1_process_bytes_fn) (const void *, size_t, + struct sha1_ctx *); + +/* Return sha1_process_bytes or some hardware optimized version thereof + depending on current CPU. 
*/ +extern sha1_process_bytes_fn sha1_choose_process_bytes (void); + /* Process the remaining bytes in the buffer and put result from CTX in first 20 bytes following RESBUF. The result is always in little endian byte order, so that a byte-wise output yields to the wanted diff --git a/ld/ChangeLog b/ld/ChangeLog index 0c78ab7fee4..6834cf6d4b0 100644 --- a/ld/ChangeLog +++ b/ld/ChangeLog @@ -1,3 +1,8 @@ +2023-11-28 Jakub Jelinek + + * ldbuildid.c (generate_build_id): Use sha1_choose_process_bytes () + instead of &sha1_process_bytes. + 2023-11-28 Nick Clifton =20 * po/ro.po: New Romanian translation. diff --git a/ld/ldbuildid.c b/ld/ldbuildid.c index 92edf74cc2c..3009cb78ff4 100644 --- a/ld/ldbuildid.c +++ b/ld/ldbuildid.c @@ -114,7 +114,8 @@ generate_build_id (bfd *abfd, struct sha1_ctx ctx; =20 sha1_init_ctx (&ctx); - if (!(*checksum_contents) (abfd, (sum_fn) &sha1_process_bytes, &ctx)) + if (!(*checksum_contents) (abfd, (sum_fn) sha1_choose_process_bytes = (), + &ctx)) return false; sha1_finish_ctx (&ctx, id_bits); } diff --git a/libiberty/ChangeLog b/libiberty/ChangeLog index 3424fef99ba..a8fdc1b8f0e 100644 --- a/libiberty/ChangeLog +++ b/libiberty/ChangeLog @@ -1,3 +1,13 @@ +2023-11-28 Jakub Jelinek + + * configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check. + * sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h + and cpuid.h. + (sha1_hw_process_bytes, sha1_hw_process_block, + sha1_choose_process_bytes): New functions. + * config.in: Regenerated. + * configure: Regenerated. + 2023-06-15 Marek Polacek =20 * configure.ac: Also set shared when enable_host_pie. diff --git a/libiberty/config.in b/libiberty/config.in index f7052b5d958..6c4a2597e9e 100644 --- a/libiberty/config.in +++ b/libiberty/config.in @@ -432,6 +432,9 @@ /* Define to 1 if `vfork' works. */ #undef HAVE_WORKING_VFORK =20 +/* Define if you have x86 SHA1 HW acceleration support. */ +#undef HAVE_X86_SHA1_HW_SUPPORT + /* Define to 1 if you have the `_doprnt' function. 
*/ #undef HAVE__DOPRNT =20 diff --git a/libiberty/configure b/libiberty/configure index dd896270dc6..9cdf802539e 100755 --- a/libiberty/configure +++ b/libiberty/configure @@ -7544,6 +7544,64 @@ case "${host}" in esac =20 =20 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SHA1 HW acceleration= support" >&5 +$as_echo_n "checking for SHA1 HW acceleration support... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include +#include + +__attribute__((__target__ ("sse4.1,sha"))) +void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1) +{ + __m128i abcd =3D _mm_loadu_si128 ((const __m128i *) buf); + __m128i e0 =3D _mm_set_epi32 (e, 0, 0, 0); + abcd =3D _mm_shuffle_epi32 (abcd, 0x1b); + const __m128i shuf_mask =3D _mm_set_epi64x (0x0001020304050607ULL, 0x080= 90a0b0c0d0e0fULL); + abcd =3D _mm_shuffle_epi8 (abcd, shuf_mask); + e0 =3D _mm_sha1nexte_epu32 (e0, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 0); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg1); + msg0 =3D _mm_xor_si128 (msg0, msg1); + e0 =3D _mm_add_epi32 (e0, msg0); + e0 =3D abcd; + _mm_storeu_si128 (buf, abcd); + e =3D _mm_extract_epi32 (e0, 3); +} + +int bar (void) +{ + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx) + && (ebx & bit_SHA) !=3D 0 + && __get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_SSE4_1) !=3D 0) + return 1; + return 0; +} + +int +main () +{ +bar (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: x86 SHA1" >&5 +$as_echo "x86 SHA1" >&6; } + +$as_echo "#define HAVE_X86_SHA1_HW_SUPPORT 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + =20 =20 =20 diff --git a/libiberty/configure.ac b/libiberty/configure.ac index 0748c592704..e07cbb8aabf 100644 --- 
a/libiberty/configure.ac +++ b/libiberty/configure.ac @@ -740,6 +740,46 @@ case "${host}" in esac AC_SUBST(pexecute) =20 +AC_MSG_CHECKING([for SHA1 HW acceleration support]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#include +#include + +__attribute__((__target__ ("sse4.1,sha"))) +void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1) +{ + __m128i abcd =3D _mm_loadu_si128 ((const __m128i *) buf); + __m128i e0 =3D _mm_set_epi32 (e, 0, 0, 0); + abcd =3D _mm_shuffle_epi32 (abcd, 0x1b); + const __m128i shuf_mask =3D _mm_set_epi64x (0x0001020304050607ULL, 0x080= 90a0b0c0d0e0fULL); + abcd =3D _mm_shuffle_epi8 (abcd, shuf_mask); + e0 =3D _mm_sha1nexte_epu32 (e0, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 0); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg1); + msg0 =3D _mm_xor_si128 (msg0, msg1); + e0 =3D _mm_add_epi32 (e0, msg0); + e0 =3D abcd; + _mm_storeu_si128 (buf, abcd); + e =3D _mm_extract_epi32 (e0, 3); +} + +int bar (void) +{ + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx) + && (ebx & bit_SHA) !=3D 0 + && __get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_SSE4_1) !=3D 0) + return 1; + return 0; +} +]], [[bar ();]])], + [AC_MSG_RESULT([x86 SHA1]) + AC_DEFINE(HAVE_X86_SHA1_HW_SUPPORT, 1, + [Define if you have x86 SHA1 HW acceleration support.])], + [AC_MSG_RESULT([no])]) + libiberty_AC_FUNC_STRNCMP =20 # Install a library built with a cross compiler in $(tooldir) rather diff --git a/libiberty/sha1.c b/libiberty/sha1.c index 6c71e3ebb41..bb47268d21a 100644 --- a/libiberty/sha1.c +++ b/libiberty/sha1.c @@ -29,6 +29,11 @@ #include #include =20 +#ifdef HAVE_X86_SHA1_HW_SUPPORT +# include +# include +#endif + #if USE_UNLOCKED_IO # include "unlocked-io.h" #endif @@ -412,3 +417,303 @@ sha1_process_block (const void *buffer, size_t len, s= truct sha1_ctx *ctx) e =3D ctx->E +=3D e; } } + +#if defined(HAVE_X86_SHA1_HW_SUPPORT) +/* HW specific version of sha1_process_bytes. 
*/ + +static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *= ); + +static void +sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ct= x) +{ + /* When we already have some bits in our internal buffer concatenate + both inputs first. */ + if (ctx->buflen !=3D 0) + { + size_t left_over =3D ctx->buflen; + size_t add =3D 128 - left_over > len ? len : 128 - left_over; + + memcpy (&((char *) ctx->buffer)[left_over], buffer, add); + ctx->buflen +=3D add; + + if (ctx->buflen > 64) + { + sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx); + + ctx->buflen &=3D 63; + /* The regions in the following copy operation cannot overlap. */ + memcpy (ctx->buffer, + &((char *) ctx->buffer)[(left_over + add) & ~63], + ctx->buflen); + } + + buffer =3D (const char *) buffer + add; + len -=3D add; + } + + /* Process available complete blocks. */ + if (len >=3D 64) + { +#if !_STRING_ARCH_unaligned +# define alignof(type) offsetof (struct { char c; type x; }, x) +# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) !=3D 0) + if (UNALIGNED_P (buffer)) + while (len > 64) + { + sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); + buffer =3D (const char *) buffer + 64; + len -=3D 64; + } + else +#endif + { + sha1_hw_process_block (buffer, len & ~63, ctx); + buffer =3D (const char *) buffer + (len & ~63); + len &=3D 63; + } + } + + /* Move remaining bytes in internal buffer. */ + if (len > 0) + { + size_t left_over =3D ctx->buflen; + + memcpy (&((char *) ctx->buffer)[left_over], buffer, len); + left_over +=3D len; + if (left_over >=3D 64) + { + sha1_hw_process_block (ctx->buffer, 64, ctx); + left_over -=3D 64; + memmove (ctx->buffer, &ctx->buffer[16], left_over); + } + ctx->buflen =3D left_over; + } +} + +/* Process LEN bytes of BUFFER, accumulating context into CTX. + Using CPU specific intrinsics. 
*/ + +#ifdef HAVE_X86_SHA1_HW_SUPPORT +__attribute__((__target__ ("sse4.1,sha"))) +#endif +static void +sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ct= x) +{ +#ifdef HAVE_X86_SHA1_HW_SUPPORT + /* Implemented from + https://www.intel.com/content/www/us/en/developer/articles/technical/= intel-sha-extensions.html */ + const __m128i *words =3D (const __m128i *) buffer; + const __m128i *endp =3D (const __m128i *) ((const char *) buffer + len); + __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3; + const __m128i shuf_mask + =3D _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + char check[((offsetof (struct sha1_ctx, B) + =3D=3D offsetof (struct sha1_ctx, A) + sizeof (ctx->A)) + && (offsetof (struct sha1_ctx, C) + =3D=3D offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A)) + && (offsetof (struct sha1_ctx, D) + =3D=3D offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A))) + ? 1 : -1]; + + /* First increment the byte count. RFC 3174 specifies the possible + length of the file up to 2^64 bits. Here we only compute the + number of bytes. Do a double word increment. 
*/ + ctx->total[0] +=3D len; + ctx->total[1] +=3D ((len >> 31) >> 1) + (ctx->total[0] < len); + + (void) &check[0]; + abcd =3D _mm_loadu_si128 ((const __m128i *) &ctx->A); + e0 =3D _mm_set_epi32 (ctx->E, 0, 0, 0); + abcd =3D _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */ + + while (words < endp) + { + abcd_save =3D abcd; + e0_save =3D e0; + + /* 0..3 */ + msg0 =3D _mm_loadu_si128 (words); + msg0 =3D _mm_shuffle_epi8 (msg0, shuf_mask); + e0 =3D _mm_add_epi32 (e0, msg0); + e1 =3D abcd; + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 0); + + /* 4..7 */ + msg1 =3D _mm_loadu_si128 (words + 1); + msg1 =3D _mm_shuffle_epi8 (msg1, shuf_mask); + e1 =3D _mm_sha1nexte_epu32 (e1, msg1); + e0 =3D abcd; + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 0); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + + /* 8..11 */ + msg2 =3D _mm_loadu_si128 (words + 2); + msg2 =3D _mm_shuffle_epi8 (msg2, shuf_mask); + e0 =3D _mm_sha1nexte_epu32 (e0, msg2); + e1 =3D abcd; + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 0); + msg1 =3D _mm_sha1msg1_epu32 (msg1, msg2); + msg0 =3D _mm_xor_si128 (msg0, msg2); + + /* 12..15 */ + msg3 =3D _mm_loadu_si128 (words + 3); + msg3 =3D _mm_shuffle_epi8 (msg3, shuf_mask); + e1 =3D _mm_sha1nexte_epu32 (e1, msg3); + e0 =3D abcd; + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg3); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 0); + msg2 =3D _mm_sha1msg1_epu32 (msg2, msg3); + msg1 =3D _mm_xor_si128 (msg1, msg3); + + /* 16..19 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg0); + e1 =3D abcd; + msg1 =3D _mm_sha1msg2_epu32 (msg1, msg0); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 0); + msg3 =3D _mm_sha1msg1_epu32 (msg3, msg0); + msg2 =3D _mm_xor_si128 (msg2, msg0); + + /* 20..23 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg1); + e0 =3D abcd; + msg2 =3D _mm_sha1msg2_epu32 (msg2, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 1); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + msg3 =3D _mm_xor_si128 (msg3, msg1); + + /* 24..27 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg2); + e1 =3D abcd; + msg3 =3D 
_mm_sha1msg2_epu32 (msg3, msg2); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 1); + msg1 =3D _mm_sha1msg1_epu32 (msg1, msg2); + msg0 =3D _mm_xor_si128 (msg0, msg2); + + /* 28..31 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg3); + e0 =3D abcd; + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg3); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 1); + msg2 =3D _mm_sha1msg1_epu32 (msg2, msg3); + msg1 =3D _mm_xor_si128 (msg1, msg3); + + /* 32..35 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg0); + e1 =3D abcd; + msg1 =3D _mm_sha1msg2_epu32 (msg1, msg0); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 1); + msg3 =3D _mm_sha1msg1_epu32 (msg3, msg0); + msg2 =3D _mm_xor_si128 (msg2, msg0); + + /* 36..39 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg1); + e0 =3D abcd; + msg2 =3D _mm_sha1msg2_epu32 (msg2, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 1); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + msg3 =3D _mm_xor_si128 (msg3, msg1); + + /* 40..43 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg2); + e1 =3D abcd; + msg3 =3D _mm_sha1msg2_epu32 (msg3, msg2); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 2); + msg1 =3D _mm_sha1msg1_epu32 (msg1, msg2); + msg0 =3D _mm_xor_si128 (msg0, msg2); + + /* 44..47 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg3); + e0 =3D abcd; + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg3); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 2); + msg2 =3D _mm_sha1msg1_epu32 (msg2, msg3); + msg1 =3D _mm_xor_si128 (msg1, msg3); + + /* 48..51 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg0); + e1 =3D abcd; + msg1 =3D _mm_sha1msg2_epu32 (msg1, msg0); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 2); + msg3 =3D _mm_sha1msg1_epu32 (msg3, msg0); + msg2 =3D _mm_xor_si128 (msg2, msg0); + + /* 52..55 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg1); + e0 =3D abcd; + msg2 =3D _mm_sha1msg2_epu32 (msg2, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 2); + msg0 =3D _mm_sha1msg1_epu32 (msg0, msg1); + msg3 =3D _mm_xor_si128 (msg3, msg1); + + /* 56..59 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg2); + e1 =3D abcd; + msg3 =3D _mm_sha1msg2_epu32 (msg3, 
msg2); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 2); + msg1 =3D _mm_sha1msg1_epu32 (msg1, msg2); + msg0 =3D _mm_xor_si128 (msg0, msg2); + + /* 60..63 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg3); + e0 =3D abcd; + msg0 =3D _mm_sha1msg2_epu32 (msg0, msg3); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 3); + msg2 =3D _mm_sha1msg1_epu32 (msg2, msg3); + msg1 =3D _mm_xor_si128 (msg1, msg3); + + /* 64..67 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg0); + e1 =3D abcd; + msg1 =3D _mm_sha1msg2_epu32 (msg1, msg0); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 3); + msg3 =3D _mm_sha1msg1_epu32 (msg3, msg0); + msg2 =3D _mm_xor_si128 (msg2, msg0); + + /* 68..71 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg1); + e0 =3D abcd; + msg2 =3D _mm_sha1msg2_epu32 (msg2, msg1); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 3); + msg3 =3D _mm_xor_si128 (msg3, msg1); + + /* 72..75 */ + e0 =3D _mm_sha1nexte_epu32 (e0, msg2); + e1 =3D abcd; + msg3 =3D _mm_sha1msg2_epu32 (msg3, msg2); + abcd =3D _mm_sha1rnds4_epu32 (abcd, e0, 3); + + /* 76..79 */ + e1 =3D _mm_sha1nexte_epu32 (e1, msg3); + e0 =3D abcd; + abcd =3D _mm_sha1rnds4_epu32 (abcd, e1, 3); + + /* Finalize. */ + e0 =3D _mm_sha1nexte_epu32 (e0, e0_save); + abcd =3D _mm_add_epi32 (abcd, abcd_save); + + words =3D words + 4; + } + + abcd =3D _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */ + _mm_storeu_si128 ((__m128i *) &ctx->A, abcd); + ctx->E =3D _mm_extract_epi32 (e0, 3); +#endif +} +#endif + +/* Return sha1_process_bytes or some hardware optimized version thereof + depending on current CPU. */ + +sha1_process_bytes_fn +sha1_choose_process_bytes (void) +{ +#ifdef HAVE_X86_SHA1_HW_SUPPORT + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx) + && (ebx & bit_SHA) !=3D 0 + && __get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_SSE4_1) !=3D 0) + return sha1_hw_process_bytes; +#endif + return sha1_process_bytes; +}