[Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
       [not found] <bug-43364-4@http.gcc.gnu.org/bugzilla/>
@ 2012-12-10  2:12 ` siarhei.siamashka at gmail dot com
  0 siblings, 0 replies; 5+ messages in thread
From: siarhei.siamashka at gmail dot com @ 2012-12-10  2:12 UTC (permalink / raw)
  To: gcc-bugs


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364

Siarhei Siamashka <siarhei.siamashka at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
         Resolution|                            |FIXED
      Known to fail|                            |

--- Comment #5 from Siarhei Siamashka <siarhei.siamashka at gmail dot com> 2012-12-10 02:12:05 UTC ---
This seems to have improved a lot. Thanks for your hard work.

    .cpu cortex-a8
    .eabi_attribute 27, 3
    .eabi_attribute 28, 1
    .fpu neon
    .eabi_attribute 20, 1
    .eabi_attribute 21, 1
    .eabi_attribute 23, 3
    .eabi_attribute 24, 1
    .eabi_attribute 25, 1
    .eabi_attribute 26, 1
    .eabi_attribute 30, 2
    .eabi_attribute 34, 1
    .eabi_attribute 18, 4
    .file    "test.c"
    .text
    .align    2
    .global    neon_add
    .type    neon_add, %function
neon_add:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    vmov.f32    d16, #0.0  @ v2sf
    vmov    d17, d16  @ v2sf
    vld1.32    {d16[0]}, [r1]
    vld1.32    {d17[0]}, [r2]
    vadd.f32    d16, d16, d17
    vst1.32    {d16[0]}, [r0]
    bx    lr
    .size    neon_add, .-neon_add
    .ident    "GCC: (GNU) 4.8.0 20121209 (experimental)"


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
  2010-03-14 13:02 [Bug c/43364] New: " siarhei dot siamashka at gmail dot com
                   ` (2 preceding siblings ...)
  2010-06-15 20:15 ` siarhei dot siamashka at gmail dot com
@ 2010-06-15 20:35 ` siarhei dot siamashka at gmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: siarhei dot siamashka at gmail dot com @ 2010-06-15 20:35 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from siarhei dot siamashka at gmail dot com  2010-06-15 20:34 -------
(In reply to comment #3)
> Or use multiple alternatives feature for inline assembly constraints to emit either VMOV or VLD1?

Well, this kind of works :) But is very ugly and fragile:

/***************************************/
#include <arm_neon.h>

/* Override a slow 'vdup_n_f32' intrinsic with something better */

static inline float32x2_t vdup_n_f32_fast(float x)
{
    float32x2_t result;
    asm (
    ".set vdup_n_f32_fast_CODE_EMITTED,0\n"
    ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14\n"
    ".ifeqs \"\\regname\", \"%1\"\n"
    "    vdup.32 %P0, %1\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    ".ifeqs \"[\\regname, #0]\", \"%1\"\n"
    "    vld1.f32 {%P0[]}, [\\regname, :32]\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    ".endr\n"
    ".if vdup_n_f32_fast_CODE_EMITTED == 0\n"
    ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n"
    ".endif\n"
    : "=w,w" (result) : "r,Q" (x) : "memory");
    return result;
}

#define vdup_n_f32(x) vdup_n_f32_fast(x)

/* Now let's test it for accessing data in registers */

float neon_add_regs(float a, float b)
{
    float32x2_t tmp1, tmp2;
    tmp1 = vdup_n_f32(a);
    tmp2 = vdup_n_f32(b);
    tmp1 = vadd_f32(tmp1, tmp2);
    return vget_lane_f32(tmp1, 0);
}

/* ... and in memory */

void neon_add_mem(float * __restrict out,
                  float * __restrict a,
                  float * __restrict b)
{
    float32x2_t tmp1, tmp2;
    tmp1 = vdup_n_f32(*a);
    tmp2 = vdup_n_f32(*b);
    tmp1 = vadd_f32(tmp1, tmp2);
    *out = vget_lane_f32(tmp1, 0);
}
/***************************************/

$ objdump -d test.o

00000000 <neon_add_mem>:
   0:   f4e10c9f        vld1.32 {d16[]}, [r1, :32]
   4:   f4e21c9f        vld1.32 {d17[]}, [r2, :32]
   8:   f2400da1        vadd.f32        d16, d16, d17
   c:   f4c0080f        vst1.32 {d16[0]}, [r0]
  10:   e12fff1e        bx      lr

00000014 <neon_add_regs>:
  14:   ee800b90        vdup.32 d16, r0
  18:   ee811b90        vdup.32 d17, r1
  1c:   f2400da1        vadd.f32        d16, d16, d17
  20:   ee100b90        vmov.32 r0, d16[0]
  24:   e12fff1e        bx      lr


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
  2010-03-14 13:02 [Bug c/43364] New: " siarhei dot siamashka at gmail dot com
  2010-03-15 11:58 ` [Bug target/43364] " ramana at gcc dot gnu dot org
  2010-04-12  5:26 ` siarhei dot siamashka at gmail dot com
@ 2010-06-15 20:15 ` siarhei dot siamashka at gmail dot com
  2010-06-15 20:35 ` siarhei dot siamashka at gmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: siarhei dot siamashka at gmail dot com @ 2010-06-15 20:15 UTC (permalink / raw)
  To: gcc-bugs

------- Comment #3 from siarhei dot siamashka at gmail dot com  2010-06-15 20:14 -------
The whole point of submitting this PR was to find an efficient way to use NEON
instructions to operate on any arbitrary scalar floating point values in order
to overcome Cortex-A8 VFP Lite inherent slowness (maybe make it transparent via
wrapping it into a C++ class and use operator overloading).

Using 'vdup_n_f32' to load a single floating point value seems to be better
than 'vset_lane_f32' here because we don't have to deal with uninitialized part
of the register. But 'vdup_n_f32' suffers from the similar performance issues
(VLD1 instruction is not used directly) and results in redundant instructions
emitted when the value is loaded from memory. Optimistically, something like
this should have been used instead of 'vdup_n_f32' in this case:

static inline float32x2_t vdup_n_f32_mem(float *p)
{
    float32x2_t result;
    asm ("vld1.f32 {%P0[]}, [%1, :32]" : "=w" (result) : "r" (p) : "memory");
    return result;
}

If wonder if it is possible to check at compile time whether the operand comes
from memory or from a register? Something similar to '__builtin_constant_p'
builtin-function? Or use multiple alternatives feature for inline assembly
constraints to emit either VMOV or VLD1? Anything else?

-- 

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
  2010-03-14 13:02 [Bug c/43364] New: " siarhei dot siamashka at gmail dot com
  2010-03-15 11:58 ` [Bug target/43364] " ramana at gcc dot gnu dot org
@ 2010-04-12  5:26 ` siarhei dot siamashka at gmail dot com
  2010-06-15 20:15 ` siarhei dot siamashka at gmail dot com
  2010-06-15 20:35 ` siarhei dot siamashka at gmail dot com
  3 siblings, 0 replies; 5+ messages in thread
From: siarhei dot siamashka at gmail dot com @ 2010-04-12  5:26 UTC (permalink / raw)
  To: gcc-bugs

------- Comment #2 from siarhei dot siamashka at gmail dot com  2010-04-12 05:26 -------
(In reply to comment #1)
>         mov     r3, #0
>         vdup.32 d16, r3

Also maybe "veor.32 d16, d16, d16" here?

Or drop this NEON register initialization completely because it is a redundant
operation and was not explicitly requested in the original C code?

After all, from IHI0042D_aapcs.pdf:
"The FPSCR is the only status register that may be accessed by conforming code.
It is a global register with the following properties:
* The condition code bits (28-31), the cumulative saturation (QC) bit (27) and
the cumulative exception-status bits (0-4) are not preserved across a public
interface."

and from ARM ARM:
"Advanced SIMD arithmetic always uses untrapped exception handling"

Tracking the cumulative exception-status bits may be tricky in general (using
ununitialized value for NEON arithmetics can set them arbitrarily), but as long
as they are not used in any way in the function itself, they are irrelevant.

-- 

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
  2010-03-14 13:02 [Bug c/43364] New: " siarhei dot siamashka at gmail dot com
@ 2010-03-15 11:58 ` ramana at gcc dot gnu dot org
  2010-04-12  5:26 ` siarhei dot siamashka at gmail dot com
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: ramana at gcc dot gnu dot org @ 2010-03-15 11:58 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #1 from ramana at gcc dot gnu dot org  2010-03-15 11:58 -------

 though trunk of a recent vintage generates the following bit of code which is
slightly better. There's no reason why we can't expand in a better manner and
generate the vld1.f32. Marking as an enhancement.


        mov     r3, #0
        ldr     r2, [r2, #0]    @ float
        vdup.32 d16, r3
        ldr     r3, [r1, #0]    @ float
        vmov    d17, d16  @ v2sf
        vmov.32 d17[0], r2
        vmov.32 d16[0], r3
        vadd.f32        d16, d16, d17
        vst1.32 {d16[0]}, [r0]
        bx      lr


-- 

ramana at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Severity|normal                      |enhancement
             Status|UNCONFIRMED                 |NEW
          Component|c                           |target
     Ever Confirmed|0                           |1
           Keywords|                            |missed-optimization
      Known to fail|                            |4.4.3 4.5.0
   Last reconfirmed|0000-00-00 00:00:00         |2010-03-15 11:58:16
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2012-12-10  2:12 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <bug-43364-4@http.gcc.gnu.org/bugzilla/>
2012-12-10  2:12 ` [Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32" siarhei.siamashka at gmail dot com
2010-03-14 13:02 [Bug c/43364] New: " siarhei dot siamashka at gmail dot com
2010-03-15 11:58 ` [Bug target/43364] " ramana at gcc dot gnu dot org
2010-04-12  5:26 ` siarhei dot siamashka at gmail dot com
2010-06-15 20:15 ` siarhei dot siamashka at gmail dot com
2010-06-15 20:35 ` siarhei dot siamashka at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).