From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 8220 invoked by alias); 7 May 2013 22:34:21 -0000 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org Received: (qmail 8147 invoked by uid 48); 7 May 2013 22:34:16 -0000 From: "jakub at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug rtl-optimization/55278] [4.8/4.9 Regression] Botan performance regressions apparently due to LRA Date: Tue, 07 May 2013 22:34:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: rtl-optimization X-Bugzilla-Keywords: missed-optimization, ra X-Bugzilla-Severity: normal X-Bugzilla-Who: jakub at gcc dot gnu.org X-Bugzilla-Status: NEW X-Bugzilla-Priority: P2 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: 4.8.1 X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated Content-Type: text/plain; charset="UTF-8" MIME-Version: 1.0 X-SW-Source: 2013-05/txt/msg00459.txt.bz2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55278 --- Comment #6 from Jakub Jelinek 2013-05-07 22:34:15 UTC --- To: unsigned char KASUMI_SBOX_S7[128]; unsigned short KASUMI_SBOX_S9[512]; static inline unsigned short rotate_left (unsigned short input, unsigned long rot) { return ((input << rot) | (input >> (8 * sizeof (unsigned short) - rot))); } static inline unsigned short reverse_bytes (unsigned short val) { return rotate_left (val, 8); } static inline unsigned short load_be (const unsigned char in[], unsigned long off) { return reverse_bytes (*((const unsigned short *)in + off)); } static inline void store_be (unsigned short in, unsigned char out[2]) { *(unsigned short *)out = reverse_bytes (in); } static inline void store_be4 (unsigned char out[], unsigned short x0, unsigned short x1, unsigned short 
x2, unsigned short x3)
{
  /* Write the four values back to back: each store_be call targets the next
     sizeof (unsigned short)-byte slot of out, in the order x0, x1, x2, x3.  */
  store_be (x0, out + (0 * sizeof (unsigned short)));
  store_be (x1, out + (1 * sizeof (unsigned short)));
  store_be (x2, out + (2 * sizeof (unsigned short)));
  store_be (x3, out + (3 * sizeof (unsigned short)));
}

/* Table-driven mixing function built on KASUMI_SBOX_S9 and KASUMI_SBOX_S7.
   Judging by the table names this is presumably the FI function of the
   KASUMI cipher -- confirm against the cipher specification.

   I is the value to transform, K the key material mixed in half way.
   Returns (D7 << 9) | D9 converted to unsigned short.  */
unsigned short
FI (unsigned short I, unsigned short K)
{
  /* Split the input: D9 takes the bits above bit 6, D7 the low seven bits.  */
  unsigned short D9 = (I >> 7);
  unsigned char D7 = (I & 0x7F);

  /* First substitution pass.  D7 is an unsigned char, so every value
     assigned to it is reduced to that type.  */
  D9 = KASUMI_SBOX_S9[D9] ^ D7;
  D7 = KASUMI_SBOX_S7[D7] ^ (D9 & 0x7F);

  /* Key mixing: K >> 9 goes into D7, the low nine bits of K (K & 0x1FF)
     are XORed into the index of the second S9 lookup.  */
  D7 ^= (K >> 9);
  D9 = KASUMI_SBOX_S9[D9 ^ (K & 0x1FF)] ^ D7;
  D7 = KASUMI_SBOX_S7[D7] ^ (D9 & 0x7F);

  return (D7 << 9) | D9;
}

/* Process BLOCKS consecutive 8-byte blocks from IN into OUT using the key
   schedule that *EK points to.

   EK      pointer to a pointer to the key schedule; *EK is re-read through
           the double indirection on every inner iteration.
   in      input bytes; advanced by 8 after each block.
   out     output bytes; advanced by 8 after each block.
   blocks  number of blocks to process.

   The noinline and noclone attributes keep this function as a separate,
   unspecialised body; presumably so the generated code of the loop can be
   inspected and timed on its own for the bug report.  */
__attribute__((noinline, noclone)) void
encrypt_n (unsigned short **EK, const unsigned char in[],
	   unsigned char out[], unsigned long blocks)
{
  unsigned long i, j;

  for (i = 0; i != blocks; ++i)
    {
      /* Load the first four unsigned short values of the current block.  */
      unsigned short B0 = load_be (in, 0);
      unsigned short B1 = load_be (in, 1);
      unsigned short B2 = load_be (in, 2);
      unsigned short B3 = load_be (in, 3);

      /* j takes the values 0, 2, 4 and 6, so K starts at element 0, 16, 32
	 and 48 of the key schedule; each iteration reads K[0] .. K[15].  */
      for (j = 0; j != 8; j += 2)
	{
	  const unsigned short *K = &(*EK)[8 * j];

	  /* First half of the iteration: uses K[0] .. K[7].  */
	  unsigned short R = B1 ^ (rotate_left (B0, 1) & K[0]);
	  unsigned short L = B0 ^ (rotate_left (R, 1) | K[1]);

	  L = FI (L ^ K[2], K[3]) ^ R;
	  R = FI (R ^ K[4], K[5]) ^ L;
	  L = FI (L ^ K[6], K[7]) ^ R;

	  /* Fold the result into B2/B3 and carry the updated values on as the
	     new R/L (compound assignment used as a value).  */
	  R = B2 ^= R;
	  L = B3 ^= L;

	  /* Second half of the iteration: uses K[8] .. K[15].  */
	  R = FI (R ^ K[10], K[11]) ^ L;
	  L = FI (L ^ K[12], K[13]) ^ R;
	  R = FI (R ^ K[14], K[15]) ^ L;

	  R ^= (rotate_left (L, 1) & K[8]);
	  L ^= (rotate_left (R, 1) | K[9]);

	  B0 ^= L;
	  B1 ^= R;
	}

      store_be4 (out, B0, B1, B2, B3);

      in += 8;
      out += 8;
    }
}

/* Input and output buffers for the driver below; as file-scope objects they
   start out zero-initialised.  */
unsigned char in[4096], out[4096];

/* Driver: calls encrypt_n 100000 times over the whole 4096-byte buffer
   (4096 / 8 blocks per call) with an all-zero 64-entry key schedule.  */
int
main ()
{
  unsigned short EKb[64], *EK = EKb;
  __builtin_memset (EKb, 0, sizeof EKb);
  /* Empty asm with a "memory" clobber: a compiler-level barrier.  Presumably
     here so the compiler cannot carry knowledge of the zeroed key schedule
     past this point -- confirm intent with the report author.  */
  asm volatile ("" : : : "memory");
  int i;
  for (i = 0; i < 100000; i++)
    encrypt_n (&EK, in, out, 4096 / 8);
  return 0;
}

actually (note different code in store_be and load_be). I'm surprised that the 16-bit rotations aren't detected/folded into rotations (or rotate_left (u16, 8) into a bswap16).