From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 6356 invoked by alias); 9 Jul 2014 12:26:54 -0000 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org Received: (qmail 6279 invoked by uid 48); 9 Jul 2014 12:26:46 -0000 From: "m.zakirov at samsung dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/43725] Poor instructions selection, scheduling and registers allocation for ARM NEON intrinsics Date: Wed, 09 Jul 2014 12:26:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 4.5.0 X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: enhancement X-Bugzilla-Who: m.zakirov at samsung dot com X-Bugzilla-Status: NEW X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: cc Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 7bit X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-SW-Source: 2014-07/txt/msg00544.txt.bz2 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43725 Marat Zakirov changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |joseph at codesourcery dot com, | |m.zakirov at samsung dot com --- Comment #7 from Marat Zakirov --- Another neon alloc issue. 
Code: #include <arm_neon.h> #include <stdint.h> extern uint16x8x4_t m0; extern uint16x8x4_t m1; void foo(uint16_t * in_ptr) { uint16x8x4_t t0, t1; t0 = vld4q_u16((uint16_t *)&in_ptr[0 ]); t1 = vld4q_u16((uint16_t *)&in_ptr[64]); t0.val[0] *= 333; t0.val[1] *= 333; t0.val[2] *= 333; t0.val[3] *= 333; t1.val[0] *= 333; t1.val[1] *= 333; t1.val[2] *= 333; t1.val[3] *= 333; m0 = t0; m1 = t1; } Asm file: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} add r1, r0, #160 vld4.16 {d8, d10, d12, d14}, [r0] add r0, r0, #32 .pad #64 sub sp, sp, #64 vld4.16 {d16, d18, d20, d22}, [r2] movw r3, #:lower16:m1 movw r2, #:lower16:m0 vldr d6, .L3 vldr d7, .L3+8 movt r3, #:upper16:m1 movt r2, #:upper16:m0 vld4.16 {d9, d11, d13, d15}, [r0] vld4.16 {d17, d19, d21, d23}, [r1] vmul.i16 q12, q3, q4 vstmia sp, {d16-d23} <<< * vld1.64 {d4-d5}, [sp:64] <<< * vmul.i16 q13, q3, q5 <<< ** vmul.i16 q9, q3, q9 vmul.i16 q14, q3, q6 <<< ** vmul.i16 q10, q3, q10 vmul.i16 q8, q3, q2 <<< **, *** vmul.i16 q15, q3, q7 <<< ** vmul.i16 q11, q3, q11 vstmia r2, {d24-d31} vstmia r3, {d16-d23} add sp, sp, #64 @ sp needed fldmfdd sp!, {d8-d15} bx lr So my questions are: 1) Why do we need * and why did the compiler use q2 in ***? 2) Why didn't the compiler reuse registers q5, q6, q2, q7 in **? Command line: cc1 -quiet -v t.c -quiet -dumpbase t.c -mfpu=neon -mcpu=cortex-a15 -mfloat-abi=softfp -marm -mtls-dialect=gnu -auxbase-strip t.s -O3 -Wno-error=unused-local-typedefs -version -fdump-tree-all -fdump-rtl-all -funwind-tables -o t.s gcc version = 4.10.0 --build=x86_64-pc-linux-gnu --host=x86_64-pc-linux-gnu --target=arm-v7a15v5r2-linux-gnueabi --Marat