Re: [ARM] Use vector wide add for mixed-mode adds

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Michael Collison <michael.collison@linaro.org>
To: Ramana Radhakrishnan <ramana.radhakrishnan@foss.arm.com>,
	 gcc-patches@gcc.gnu.org
Subject: Re: [ARM] Use vector wide add for mixed-mode adds
Date: Sun, 23 Aug 2015 04:16:00 -0000	[thread overview]
Message-ID: <55D8EBD9.20408@linaro.org> (raw)
In-Reply-To: <55D3339C.1080807@foss.arm.com>

This is a modified version of the previous patch that addresses the issue 
raised by Ramana. The patch now uses vec_select instead of an unspec.

I had to fix an unrelated issue in the read_name function in read-md.c. 
The fix corrects broken support for mode iterators inside '<>'. Without 
this fix, support for RTL expressions such as 'plus:<VW:V_widen>' was broken.

A second issue, also unrelated to the main purpose of this patch, is 
corrected as well: the documentation of the standard names for wide add 
support.

This patch is designed to address code that was not being vectorized due 
to missing widening patterns in the ARM backend. Code such as:

int t6(int len, void * dummy, short * __restrict x)
{
   len = len & ~31;
   int result = 0;
   __asm volatile ("");
   for (int i = 0; i < len; i++)
     result += x[i];
   return result;
}

Validated on arm-none-eabi, arm-none-linux-gnueabi, 
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.


--------------------------------------------------------------------------------------------------------------------------------------------------
2015-08-21  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
     where mode is VQI to improve mixed mode vectorization.
     * read-md.c (read_name): Allow mode iterators inside '<>' in rtl 
expressions.
     * doc/md.texi: Rename [su]sum_widen to widen_[su]sum to reflect 
the correct standard name.
     * gcc.target/arm/neon-vaddws16.c: New test.
     * gcc.target/arm/neon-vaddws32.c: New test.
     * gcc.target/arm/neon-vaddwu16.c: New test.
     * gcc.target/arm/neon-vaddwu32.c: New test.
     * gcc.target/arm/neon-vaddwu8.c: New test.

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..54623fe 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,57 @@

  ;; Widening operations

+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+    (plus:<V_double_width> (sign_extend:<V_double_width> 
(match_operand:VQI 1 "s_register_operand" ""))
+                   (match_operand:<V_double_width> 2 
"s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], 
operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], 
operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 "vect_par_constant_low" 
"")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 
"vect_par_constant_high" "")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
  (define_insn "widen_ssum<mode>3"
    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
      (plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1235,57 @@
    [(set_attr "type" "neon_add_widen")]
  )

+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+    (plus:<V_double_width> (zero_extend:<V_double_width> 
(match_operand:VQI 1 "s_register_operand" ""))
+                   (match_operand:<V_double_width> 2 
"s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], 
operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], 
operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 "vect_par_constant_low" 
"")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 
"vect_par_constant_high" "")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 0ec229f..7af4183 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4939,10 +4939,10 @@ is of a wider mode, is computed and added to 
operand 3. Operand 3 is of a mode
  equal or wider than the mode of the absolute difference. The result is 
placed
  in operand 0, which is of the same mode as operand 3.

-@cindex @code{ssum_widen@var{m3}} instruction pattern
-@item @samp{ssum_widen@var{m3}}
-@cindex @code{usum_widen@var{m3}} instruction pattern
-@itemx @samp{usum_widen@var{m3}}
+@cindex @code{widen_ssum@var{m3}} instruction pattern
+@item @samp{widen_ssum@var{m3}}
+@cindex @code{widen_usum@var{m3}} instruction pattern
+@itemx @samp{widen_usum@var{m3}}
  Operands 0 and 2 are of the same mode, which is wider than the mode of
  operand 1. Add operand 1 to operand 2 and place the widened result in
  operand 0. (This is used express accumulation of elements into an 
accumulator
diff --git a/gcc/read-md.c b/gcc/read-md.c
index 9f158ec..df5748f 100644
--- a/gcc/read-md.c
+++ b/gcc/read-md.c
@@ -399,16 +399,24 @@ read_name (struct md_name *name)
  {
    int c;
    size_t i;
+  int in_angle_bracket;

    c = read_skip_spaces ();

    i = 0;
+  in_angle_bracket = 0;
    while (1)
      {
+      if (c == '<')
+    in_angle_bracket = 1;
+
+      if (c == '>')
+    in_angle_bracket = 0;
+
        if (c == ' ' || c == '\n' || c == '\t' || c == '\f' || c == '\r'
        || c == EOF)
      break;
-      if (c == ':' || c == ')' || c == ']' || c == '"' || c == '/'
+      if (((c == ':') and (in_angle_bracket == 0)) || c == ')' || c == 
']' || c == '"' || c == '/'
        || c == '(' || c == '[')
      {
        unread_char (c);
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..94bf0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..2e9af56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..de2ad8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
+
+
+


On 08/18/2015 06:31 AM, Ramana Radhakrishnan wrote:
>
> On 18/08/15 08:53, Michael Collison wrote:
>> This patch is designed to address code that was not being vectorized due to missing widening patterns in the ARM backend. Code such as:
>>
>> int t6(int len, void * dummy, short * __restrict x)
>> {
>>    len = len & ~31;
>>    int result = 0;
>>    __asm volatile ("");
>>    for (int i = 0; i < len; i++)
>>      result += x[i];
>>    return result;
>> }
>>
>> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>
>> There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs when -flto is enabled:
>>
>> gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>> gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>>
> Interesting, though not sure why that happens without some digging further.
>
>> I could use some feedback on whether this is a regression or issue with the test case.
>> -------------------------------------------------------------------------------------------------------------
>> 2015-08-18  Michael Collison  <michael.collison@linaro.org>
>>
>>      * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>      where mode is VQI to improve mixed mode vectorization.
>>      * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and
>>      UNSPEC_VSIGN_EXTEND.
>>      * gcc.target/arm/neon-vaddws16.c: New test.
>>      * gcc.target/arm/neon-vaddws32.c: New test.
>>      * gcc.target/arm/neon-vaddwu16.c: New test.
>>      * gcc.target/arm/neon-vaddwu32.c: New test.
>>      * gcc.target/arm/neon-vaddwu8.c: New test.
>>
>>
>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>> index 654d9d5..50cb409 100644
>> --- a/gcc/config/arm/neon.md
>> +++ b/gcc/config/arm/neon.md
>> @@ -1174,6 +1174,27 @@
>>
>>   ;; Widening operations
>>
>> +(define_insn_and_split "widen_ssum<mode>3"
>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>> +    (plus:<V_double_width> (unspec:<V_double_width>
>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>> +                UNSPEC_VSIGN_EXTEND)
>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>> +  "TARGET_NEON"
>> +  "#"
>> +  "&& reload_completed"
>
> I notice widen_ssum and widen_usum do not have any documentation with it - can you look to provide some kind of followup documentation for these patterns in md.texi while you are here ?
>
>
>> +  [(const_int 0)]
>> +{
>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>> +
>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2]));
>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2]));
>> +    DONE;
>> +  }
>> +  [(set_attr "type" "neon_add_widen")
>> +   (set_attr "length" "8")])
> Isn't it better to expand this into
>
> (set (reg:V4SI reg) (plus:V4SI (sign_extend:V4SI (vec_select:V4HI (reg:V8HI ...)
> 						                  (parallel:V8HI (const_vector { 4, 5, 6, 7})))
> 			 	(reg:V4SI reg)))
>
> (set (reg:V4SI reg) (plus:V4SI (sign_extend: V4SI (vec_select:V4HI (reg:V8HI)
> 								   (parallel: V8HI (const_vector { 0, 1, 2, 3}))))
>
>
>
> That way we can "combine" cases where we have this kind of expressions from the intrinsics - I'm wondering about combinations from vmovl / vadd / vget_low ?
>
> I'd like us to avoid unspecs where we can...
>
>
> regards
> Ramana
>
>> +
>>   (define_insn "widen_ssum<mode>3"
>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>       (plus:<V_widen> (sign_extend:<V_widen>
>> @@ -1184,6 +1205,27 @@
>>     [(set_attr "type" "neon_add_widen")]
>>   )
>>
>> +(define_insn_and_split "widen_usum<mode>3"
>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>> +    (plus:<V_double_width> (unspec:<V_double_width>
>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>> +                UNSPEC_VZERO_EXTEND)
>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>> +  "TARGET_NEON"
>> +  "#"
>> +  "&& reload_completed"
>> +  [(const_int 0)]
>> +{
>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>> +
>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2]));
>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2]));
>> +    DONE;
>> +  }
>> +  [(set_attr "type" "neon_add_widen")
>> +   (set_attr "length" "8")])
>> +
>>   (define_insn "widen_usum<mode>3"
>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>       (plus:<V_widen> (zero_extend:<V_widen>
>> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
>> index 0ec2c48..e9cf836 100644
>> --- a/gcc/config/arm/unspecs.md
>> +++ b/gcc/config/arm/unspecs.md
>> @@ -358,5 +358,7 @@
>>     UNSPEC_NVRINTX
>>     UNSPEC_NVRINTA
>>     UNSPEC_NVRINTN
>> +  UNSPEC_VZERO_EXTEND
>> +  UNSPEC_VSIGN_EXTEND
>>   ])
>>
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> new file mode 100644
>> index 0000000..ed10669
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, short * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  int result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
>> +
>> +
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>> new file mode 100644
>> index 0000000..94bf0c9
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>> @@ -0,0 +1,19 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +int
>> +t6(int len, void * dummy, int * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  long long result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.s32" } } */
>> +
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>> new file mode 100644
>> index 0000000..98f8768
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, unsigned short * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned int result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw.u16" } } */
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>> new file mode 100644
>> index 0000000..2e9af56
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +int
>> +t6(int len, void * dummy, unsigned int * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned long long result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.u32" } } */
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>> new file mode 100644
>> index 0000000..de2ad8a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, char * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned short result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.u8" } } */
>> +
>> +
>> +

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org

next prev parent reply	other threads:[~2015-08-22 21:38 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-18  8:02 Michael Collison
2015-08-18 13:46 ` Ramana Radhakrishnan
2015-08-23  4:16   ` Michael Collison [this message]
2015-08-24  8:37     ` Ramana Radhakrishnan
2015-09-23  2:40 Michael Collison
2015-09-23  8:59 ` Kyrill Tkachov
2015-10-01 10:05   ` Michael Collison
2015-10-08 11:02     ` Kyrill Tkachov
2015-10-20  8:11       ` Michael Collison
2015-10-21 15:14         ` Charles Baylis
2015-11-30  6:59 Michael Collison
2015-12-10 15:09 ` Kyrill Tkachov
2015-12-17  0:02   ` Michael Collison
2016-02-09 16:27     ` Kyrill Tkachov
2016-02-15  6:32       ` Michael Collison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=55D8EBD9.20408@linaro.org \
    --to=michael.collison@linaro.org \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=ramana.radhakrishnan@foss.arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).