public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/43725]  New: Poor instructions selection, scheduling and registers allocation for ARM NEON intrinsics
@ 2010-04-12  7:27 siarhei dot siamashka at gmail dot com
  2010-05-11  7:35 ` [Bug target/43725] " ramana at gcc dot gnu dot org
  0 siblings, 1 reply; 13+ messages in thread
From: siarhei dot siamashka at gmail dot com @ 2010-04-12  7:27 UTC (permalink / raw)
  To: gcc-bugs

gcc version 4.5.0-rc20100406

/**************/
#include <arm_neon.h>

/* Reproducer: applies the same add-then-OR update, in place, to each of the
   13 vectors p[0] .. p[12]. */
void x(int32x4_t a, int32x4_t b, int32x4_t *p)
{
        /* X(n): p[n] = vaddq_s32(p[n], a), then p[n] = vorrq_s32(p[n], b),
           written as two separate assignments to p[n]. */
        #define X(n) p[n] = vaddq_s32(p[n], a); p[n] = vorrq_s32(p[n], b);
        X(0); X(1); X(2); X(3); X(4); X(5); X(6); X(7);
        X(8); X(9); X(10); X(11); X(12);
}
/**************/

# gcc -O2 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=hard -c test.c
# objdump -d test.o

00000000 <x>:
   0:   edd0eb2c        vldr    d30, [r0, #176] ; 0xb0
   4:   edd0fb2e        vldr    d31, [r0, #184] ; 0xb8
   8:   ecd02b04        vldmia  r0, {d18-d19}         
   c:   ed2d8b10        vpush   {d8-d15}              
  10:   ed904b30        vldr    d4, [r0, #192]  ; 0xc0
  14:   ed905b32        vldr    d5, [r0, #200]  ; 0xc8
  18:   e24dd020        sub     sp, sp, #32           
  1c:   f22ec8c0        vadd.i32        q6, q15, q0   
  20:   f26228c0        vadd.i32        q9, q9, q0    
  24:   edd04b08        vldr    d20, [r0, #32]        
  28:   edd05b0a        vldr    d21, [r0, #40]  ; 0x28
  2c:   edd0cb18        vldr    d28, [r0, #96]  ; 0x60
  30:   edd0db1a        vldr    d29, [r0, #104] ; 0x68
  34:   f264e840        vadd.i32        q15, q2, q0   
  38:   f26448c0        vadd.i32        q10, q10, q0  
  3c:   f26cc8c0        vadd.i32        q14, q14, q0  
  40:   edd00b04        vldr    d16, [r0, #16]        
  44:   edd01b06        vldr    d17, [r0, #24]        
  48:   edd0ab14        vldr    d26, [r0, #80]  ; 0x50
  4c:   edd0bb16        vldr    d27, [r0, #88]  ; 0x58
  50:   ec8dcb04        vstmia  sp, {d12-d13}         
  54:   f222c1d2        vorr    q6, q9, q1            
  58:   f26008c0        vadd.i32        q8, q8, q0    
  5c:   f26aa8c0        vadd.i32        q13, q13, q0  
  60:   edd06b0c        vldr    d22, [r0, #48]  ; 0x30
  64:   edd07b0e        vldr    d23, [r0, #56]  ; 0x38
  68:   edd08b10        vldr    d24, [r0, #64]  ; 0x40
  6c:   edd09b12        vldr    d25, [r0, #72]  ; 0x48
  70:   ed906b1c        vldr    d6, [r0, #112]  ; 0x70
  74:   ed907b1e        vldr    d7, [r0, #120]  ; 0x78
  78:   ed908b20        vldr    d8, [r0, #128]  ; 0x80
  7c:   ed909b22        vldr    d9, [r0, #136]  ; 0x88
  80:   ed90ab24        vldr    d10, [r0, #144] ; 0x90
  84:   ed90bb26        vldr    d11, [r0, #152] ; 0x98
  88:   ed90eb28        vldr    d14, [r0, #160] ; 0xa0
  8c:   ed90fb2a        vldr    d15, [r0, #168] ; 0xa8
  90:   edcdeb04        vstr    d30, [sp, #16]        
  94:   edcdfb06        vstr    d31, [sp, #24]        
  98:   ec80cb04        vstmia  r0, {d12-d13}         
  9c:   f224c1d2        vorr    q6, q10, q1           
  a0:   f26c41d2        vorr    q10, q14, q1          
  a4:   ecddcb04        vldmia  sp, {d28-d29}         
  a8:   f26021d2        vorr    q9, q8, q1            
  ac:   f26888c0        vadd.i32        q12, q12, q0  
  b0:   f26ae1d2        vorr    q15, q13, q1          
  b4:   f26668c0        vadd.i32        q11, q11, q0  
  b8:   f26ca1d2        vorr    q13, q14, q1          
  bc:   f2266840        vadd.i32        q3, q3, q0    
  c0:   f2288840        vadd.i32        q4, q4, q0    
  c4:   f22aa840        vadd.i32        q5, q5, q0    
  c8:   f22ee840        vadd.i32        q7, q7, q0    
  cc:   edddcb04        vldr    d28, [sp, #16]        
  d0:   eddddb06        vldr    d29, [sp, #24]        
  d4:   f22601d2        vorr    q0, q11, q1           
  d8:   f22841d2        vorr    q2, q12, q1           
  dc:   f2680152        vorr    q8, q4, q1            
  e0:   f26a6152        vorr    q11, q5, q1           
  e4:   f26e8152        vorr    q12, q7, q1           
  e8:   edc02b04        vstr    d18, [r0, #16]        
  ec:   edc03b06        vstr    d19, [r0, #24]        
  f0:   f2662152        vorr    q9, q3, q1            
  f4:   f22c21d2        vorr    q1, q14, q1           
  f8:   ed80cb08        vstr    d12, [r0, #32]        
  fc:   ed80db0a        vstr    d13, [r0, #40]  ; 0x28
 100:   ed800b0c        vstr    d0, [r0, #48]   ; 0x30
 104:   ed801b0e        vstr    d1, [r0, #56]   ; 0x38
 108:   ed804b10        vstr    d4, [r0, #64]   ; 0x40
 10c:   ed805b12        vstr    d5, [r0, #72]   ; 0x48
 110:   edc0eb14        vstr    d30, [r0, #80]  ; 0x50
 114:   edc0fb16        vstr    d31, [r0, #88]  ; 0x58
 118:   edc04b18        vstr    d20, [r0, #96]  ; 0x60
 11c:   edc05b1a        vstr    d21, [r0, #104] ; 0x68
 120:   edc02b1c        vstr    d18, [r0, #112] ; 0x70
 124:   edc03b1e        vstr    d19, [r0, #120] ; 0x78
 128:   edc00b20        vstr    d16, [r0, #128] ; 0x80
 12c:   edc01b22        vstr    d17, [r0, #136] ; 0x88
 130:   edc06b24        vstr    d22, [r0, #144] ; 0x90
 134:   edc07b26        vstr    d23, [r0, #152] ; 0x98
 138:   edc08b28        vstr    d24, [r0, #160] ; 0xa0
 13c:   edc09b2a        vstr    d25, [r0, #168] ; 0xa8
 140:   edc0ab2c        vstr    d26, [r0, #176] ; 0xb0
 144:   edc0bb2e        vstr    d27, [r0, #184] ; 0xb8
 148:   ed802b30        vstr    d2, [r0, #192]  ; 0xc0
 14c:   ed803b32        vstr    d3, [r0, #200]  ; 0xc8
 150:   e28dd020        add     sp, sp, #32
 154:   ecbd8b10        vpop    {d8-d15}
 158:   e12fff1e        bx      lr

This shows multiple performance problems:
1. The use of inherently slower VLDR/VSTR instructions instead of VLD1/VST1
2. Failure to make proper use of ARM Cortex-A8 NEON LS/ALU dual issue
3. Unnecessary spills to stack

This is a general issue with NEON intrinsics, and it causes serious performance
problems for practically any nontrivial code. This report could itself serve as
a meta-bug, with each individual performance issue tracked separately.


-- 
           Summary: Poor instructions selection, scheduling and registers
                    allocation for ARM NEON intrinsics
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: siarhei dot siamashka at gmail dot com
GCC target triplet: armv7l-unknown-linux-gnueabi


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43725


^ permalink raw reply	[flat|nested] 13+ messages in thread
[parent not found: <bug-43725-4@http.gcc.gnu.org/bugzilla/>]

end of thread, other threads:[~2021-09-27  7:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-12  7:27 [Bug target/43725] New: Poor instructions selection, scheduling and registers allocation for ARM NEON intrinsics siarhei dot siamashka at gmail dot com
2010-05-11  7:35 ` [Bug target/43725] " ramana at gcc dot gnu dot org
     [not found] <bug-43725-4@http.gcc.gnu.org/bugzilla/>
2010-09-29 20:50 ` rearnsha at gcc dot gnu.org
2010-10-04 23:00 ` siarhei.siamashka at gmail dot com
2010-10-04 23:46 ` joseph at codesourcery dot com
2010-10-05  7:16 ` ramana at gcc dot gnu.org
2010-10-08 14:13 ` siarhei.siamashka at gmail dot com
2011-06-29 13:35 ` siarhei.siamashka at gmail dot com
2014-07-09 12:26 ` m.zakirov at samsung dot com
2014-07-29 11:35 ` m.zakirov at samsung dot com
2014-07-29 11:46 ` m.zakirov at samsung dot com
2014-08-20 16:44 ` mkuvyrkov at gcc dot gnu.org
2021-09-27  7:21 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).