public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, ARM] Support NEON's VABD with combine pass
@ 2011-07-29 11:58 Dmitry Melnik
  2011-08-05  3:50 ` Ramana Radhakrishnan
  0 siblings, 1 reply; 6+ messages in thread
From: Dmitry Melnik @ 2011-07-29 11:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw, Andrey Belevantsev

[-- Attachment #1: Type: text/plain, Size: 591 bytes --]

This patch adds two define_insn patterns for NEON vabd instruction to 
make combine pass recognize expressions matching (vabs (vsub ...)) 
patterns as vabd.
This patch reduces code size of x264 binary from 649143 to 648343 (800 
bytes, or 0.12%) and increases its performance on average by 2.5% on 
plain C version of x264 with -O2 -ftree-vectorize.
On SPEC2K it didn't make any difference -- all vabs instructions found 
in SPEC2K binaries are either using .f64 mode or scalar .f32 which are 
not supported by NEON's vabd.
Regtested with QEMU.

Ok for trunk?


-- 
Best regards,
    Dmitry

[-- Attachment #2: neon-vabd.diff --]
[-- Type: text/x-diff, Size: 3632 bytes --]

    2011-07-21  Sevak Sargsyan <sevak.sargsyan@ispras.ru>
    
        * config/arm/neon.md (neon_vabd<mode>_2, neon_vabd<mode>_3): New define_insn patterns for combine.
    
    gcc/testsuite:
    
        * gcc.target/arm/neon-combine-sub-abs-into-vabd.c: New test.

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index a8c1b87..f457365 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5607,3 +5607,32 @@
   emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
   DONE;
 })
+
+(define_insn "neon_vabd<mode>_2"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+                           (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON"
+ "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabd<mode>_3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+                             (match_operand:VDQ 2 "s_register_operand" "w")]
+                 UNSPEC_VSUB)))]
+ "TARGET_NEON"
+ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
diff --git a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
new file mode 100644
index 0000000..aae4117
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+float32x2_t f_sub_abs_to_vabd_32()
+{
+
+       float32x2_t val1 = vdup_n_f32 (10); 
+       float32x2_t val2 = vdup_n_f32 (30);
+       float32x2_t sres = vsub_f32(val1, val2);
+       float32x2_t res = vabs_f32 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.f32" } }*/
+
+#include <arm_neon.h>
+int8x8_t sub_abs_to_vabd_8()
+{
+       
+       int8x8_t val1 = vdup_n_s8 (10); 
+        int8x8_t val2 = vdup_n_s8 (30);
+        int8x8_t sres = vsub_s8(val1, val2);
+        int8x8_t res = vabs_s8 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s8" } }*/
+
+int16x4_t sub_abs_to_vabd_16()
+{
+       
+       int16x4_t val1 = vdup_n_s16 (10); 
+        int16x4_t val2 = vdup_n_s16 (30);
+        int16x4_t sres = vsub_s16(val1, val2);
+        int16x4_t res = vabs_s16 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s16" } }*/
+
+int32x2_t sub_abs_to_vabd_32()
+{
+
+        int32x2_t val1 = vdup_n_s32 (10);
+        int32x2_t val2 = vdup_n_s32 (30);
+        int32x2_t sres = vsub_s32(val1, val2);
+        int32x2_t res = vabs_s32 (sres);
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s32" } }*/

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH, ARM] Support NEON's VABD with combine pass
  2011-07-29 11:58 [PATCH, ARM] Support NEON's VABD with combine pass Dmitry Melnik
@ 2011-08-05  3:50 ` Ramana Radhakrishnan
  2011-08-05 15:08   ` Joseph S. Myers
  2011-09-12 17:50   ` Dmitry Melnik
  0 siblings, 2 replies; 6+ messages in thread
From: Ramana Radhakrishnan @ 2011-08-05  3:50 UTC (permalink / raw)
  To: Dmitry Melnik; +Cc: gcc-patches, Richard Earnshaw, Andrey Belevantsev

On 29 July 2011 10:58, Dmitry Melnik <dm@ispras.ru> wrote:
> This patch adds two define_insn patterns for NEON vabd instruction to make
> combine pass recognize expressions matching (vabs (vsub ...)) patterns as
> vabd.

Interesting but I would be a bit defensive and make sure that this
matches only if -ffast-math in the FP case. You are sort of relying on
the fact that vsub wouldn't be generated without ffast-math but I'd
rather be defensive about it . (This is in case it's not clear in the
non-intrinsics case).

  I've had a couple of conversations about what the intrinsics
behaviour should in such cases with folks. Should we try to match vabs
(vsub)  even for intrinsics and generate a vabd or desist from doing
this and generate only what was asked for. My personal preference is
the former but it would be interesting to see what others think .

BTW was SPEC2k built with -Ofast ? Maybe then you'll see a bit of vectorization.

cheers
Ramana

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH, ARM] Support NEON's VABD with combine pass
  2011-08-05  3:50 ` Ramana Radhakrishnan
@ 2011-08-05 15:08   ` Joseph S. Myers
  2011-09-12 17:50   ` Dmitry Melnik
  1 sibling, 0 replies; 6+ messages in thread
From: Joseph S. Myers @ 2011-08-05 15:08 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: Dmitry Melnik, gcc-patches, Richard Earnshaw, Andrey Belevantsev

On Fri, 5 Aug 2011, Ramana Radhakrishnan wrote:

>   I've had a couple of conversations about what the intrinsics
> behaviour should in such cases with folks. Should we try to match vabs
> (vsub)  even for intrinsics and generate a vabd or desist from doing
> this and generate only what was asked for. My personal preference is
> the former but it would be interesting to see what others think .

Intrinsics should generally be considered to have C-level semantics (that 
may happen to be the same as those of a particular machine instruction), 
rather than the semantics that they must generate a particular 
instruction.  I.e., the former.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH, ARM] Support NEON's VABD with combine pass
  2011-08-05  3:50 ` Ramana Radhakrishnan
  2011-08-05 15:08   ` Joseph S. Myers
@ 2011-09-12 17:50   ` Dmitry Melnik
  2011-09-12 18:02     ` Ramana Radhakrishnan
  1 sibling, 1 reply; 6+ messages in thread
From: Dmitry Melnik @ 2011-09-12 17:50 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches, Richard Earnshaw, Andrey Belevantsev

[-- Attachment #1: Type: text/plain, Size: 546 bytes --]


> Interesting but I would be a bit defensive and make sure that this
> matches only if -ffast-math in the FP case. You are sort of relying on
> the fact that vsub wouldn't be generated without ffast-math but I'd
> rather be defensive about it . (This is in case it's not clear in the
> non-intrinsics case).
Fixed.
> BTW was SPEC2k built with -Ofast ? Maybe then you'll see a bit of vectorization.
Yes, I built it with -Ofast. I think it's because SPEC2K tests mostly 
use doubles, which are not supported by vabd.

-- 
Best regards,
    Dmitry

[-- Attachment #2: neon-vabd.diff --]
[-- Type: text/x-diff, Size: 3354 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index a8c1b87..aceb564 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5607,3 +5607,32 @@
   emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
   DONE;
 })
+
+(define_insn "neon_vabd<mode>_2"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+                           (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+ "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabd<mode>_3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+                             (match_operand:VDQ 2 "s_register_operand" "w")]
+                 UNSPEC_VSUB)))]
+ "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
diff --git a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
new file mode 100644
index 0000000..ad6ba75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+float32x2_t f_sub_abs_to_vabd_32()
+{
+  float32x2_t val1 = vdup_n_f32 (10);
+  float32x2_t val2 = vdup_n_f32 (30);
+  float32x2_t sres = vsub_f32(val1, val2);
+  float32x2_t res = vabs_f32 (sres);
+
+  return res;
+}
+/* { dg-final { scan-assembler "vabd\.f32" } }*/
+
+#include <arm_neon.h>
+int8x8_t sub_abs_to_vabd_8()
+{
+  int8x8_t val1 = vdup_n_s8 (10);
+  int8x8_t val2 = vdup_n_s8 (30);
+  int8x8_t sres = vsub_s8(val1, val2);
+  int8x8_t res = vabs_s8 (sres);
+
+  return res;
+}
+/* { dg-final { scan-assembler "vabd\.s8" } }*/
+
+int16x4_t sub_abs_to_vabd_16()
+{
+  int16x4_t val1 = vdup_n_s16 (10);
+  int16x4_t val2 = vdup_n_s16 (30);
+  int16x4_t sres = vsub_s16(val1, val2);
+  int16x4_t res = vabs_s16 (sres);
+
+  return res;
+}
+/* { dg-final { scan-assembler "vabd\.s16" } }*/
+
+int32x2_t sub_abs_to_vabd_32()
+{
+  int32x2_t val1 = vdup_n_s32 (10);
+  int32x2_t val2 = vdup_n_s32 (30);
+  int32x2_t sres = vsub_s32(val1, val2);
+  int32x2_t res = vabs_s32 (sres);
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s32" } }*/

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH, ARM] Support NEON's VABD with combine pass
  2011-09-12 17:50   ` Dmitry Melnik
@ 2011-09-12 18:02     ` Ramana Radhakrishnan
  0 siblings, 0 replies; 6+ messages in thread
From: Ramana Radhakrishnan @ 2011-09-12 18:02 UTC (permalink / raw)
  To: Dmitry Melnik; +Cc: gcc-patches, Richard Earnshaw, Andrey Belevantsev

On 12 September 2011 17:11, Dmitry Melnik <dm@ispras.ru> wrote:
>
>> Interesting but I would be a bit defensive and make sure that this
>> matches only if -ffast-math in the FP case. You are sort of relying on
>> the fact that vsub wouldn't be generated without ffast-math but I'd
>> rather be defensive about it . (This is in case it's not clear in the
>> non-intrinsics case).
>
> Fixed.
>>
>> BTW was SPEC2k built with -Ofast ? Maybe then you'll see a bit of
>> vectorization.
>
> Yes, I built it with -Ofast. I think it's because SPEC2K tests mostly use
> doubles, which are not supported by vabd.

OK.

cheers
Ramana

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH, ARM] Support NEON's VABD with combine pass
@ 2011-07-29 12:09 Dmitry Melnik
  0 siblings, 0 replies; 6+ messages in thread
From: Dmitry Melnik @ 2011-07-29 12:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 591 bytes --]

This patch adds two define_insn patterns for NEON vabd instruction to 
make combine pass recognize expressions matching (vabs (vsub ...)) 
patterns as vabd.
This patch reduces code size of x264 binary from 649143 to 648343 (800 
bytes, or 0.12%) and increases its performance on average by 2.5% on 
plain C version of x264 with -O2 -ftree-vectorize.
On SPEC2K it didn't make any difference -- all vabs instructions found 
in SPEC2K binaries are either using .f64 mode or scalar .f32 which are 
not supported by NEON's vabd.
Regtested with QEMU.

Ok for trunk?


--
Best regards,
    Dmitry


[-- Attachment #2: neon-vabd.diff --]
[-- Type: text/x-diff, Size: 3632 bytes --]

    2011-07-21  Sevak Sargsyan <sevak.sargsyan@ispras.ru>
    
        * config/arm/neon.md (neon_vabd<mode>_2, neon_vabd<mode>_3): New define_insn patterns for combine.
    
    gcc/testsuite:
    
        * gcc.target/arm/neon-combine-sub-abs-into-vabd.c: New test.

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index a8c1b87..f457365 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5607,3 +5607,32 @@
   emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
   DONE;
 })
+
+(define_insn "neon_vabd<mode>_2"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+                           (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON"
+ "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabd<mode>_3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+       (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+                             (match_operand:VDQ 2 "s_register_operand" "w")]
+                 UNSPEC_VSUB)))]
+ "TARGET_NEON"
+ "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+       (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+                     (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+                                   (const_string "neon_fp_vadd_ddd_vabs_dd")
+                                   (const_string "neon_fp_vadd_qqq_vabs_qq"))
+                     (const_string "neon_int_5")))]
+)
diff --git a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
new file mode 100644
index 0000000..aae4117
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+float32x2_t f_sub_abs_to_vabd_32()
+{
+
+       float32x2_t val1 = vdup_n_f32 (10); 
+       float32x2_t val2 = vdup_n_f32 (30);
+       float32x2_t sres = vsub_f32(val1, val2);
+       float32x2_t res = vabs_f32 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.f32" } }*/
+
+#include <arm_neon.h>
+int8x8_t sub_abs_to_vabd_8()
+{
+       
+       int8x8_t val1 = vdup_n_s8 (10); 
+        int8x8_t val2 = vdup_n_s8 (30);
+        int8x8_t sres = vsub_s8(val1, val2);
+        int8x8_t res = vabs_s8 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s8" } }*/
+
+int16x4_t sub_abs_to_vabd_16()
+{
+       
+       int16x4_t val1 = vdup_n_s16 (10); 
+        int16x4_t val2 = vdup_n_s16 (30);
+        int16x4_t sres = vsub_s16(val1, val2);
+        int16x4_t res = vabs_s16 (sres); 
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s16" } }*/
+
+int32x2_t sub_abs_to_vabd_32()
+{
+
+        int32x2_t val1 = vdup_n_s32 (10);
+        int32x2_t val2 = vdup_n_s32 (30);
+        int32x2_t sres = vsub_s32(val1, val2);
+        int32x2_t res = vabs_s32 (sres);
+
+   return res;
+}
+/* { dg-final { scan-assembler "vabd\.s32" } }*/

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2011-09-12 17:01 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-29 11:58 [PATCH, ARM] Support NEON's VABD with combine pass Dmitry Melnik
2011-08-05  3:50 ` Ramana Radhakrishnan
2011-08-05 15:08   ` Joseph S. Myers
2011-09-12 17:50   ` Dmitry Melnik
2011-09-12 18:02     ` Ramana Radhakrishnan
2011-07-29 12:09 Dmitry Melnik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).