[ARM] Use vector wide add for mixed-mode adds

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [ARM] Use vector wide add for mixed-mode adds
@ 2015-08-18  8:02 Michael Collison
  2015-08-18 13:46 ` Ramana Radhakrishnan
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-08-18  8:02 UTC (permalink / raw)
  To: gcc-patches


This patch is designed to address code that was not being vectorized due 
to missing widening patterns in the ARM backend. Code such as:

int t6(int len, void * dummy, short * __restrict x)
{
   len = len & ~31;
   int result = 0;
   __asm volatile ("");
   for (int i = 0; i < len; i++)
     result += x[i];
   return result;
}

Validated on arm-none-eabi, arm-none-linux-gnueabi, 
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.

There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs 
when -flto is enabled:

gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1


I could use some feedback on whether this is a regression or issue with 
the test case.
-------------------------------------------------------------------------------------------------------------
2015-08-18  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
     where mode is VQI to improve mixed mode vectorization.
     * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and
     UNSPEC_VSIGN_EXTEND.
     * gcc.target/arm/neon-vaddws16.c: New test.
     * gcc.target/arm/neon-vaddws32.c: New test.
     * gcc.target/arm/neon-vaddwu16.c: New test.
     * gcc.target/arm/neon-vaddwu32.c: New test.
     * gcc.target/arm/neon-vaddwu8.c: New test.


diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..50cb409 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,27 @@

  ;; Widening operations

+(define_insn_and_split "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
+    (plus:<V_double_width> (unspec:<V_double_width>
+                   [(match_operand:VQI 1 "s_register_operand" "w")]
+                UNSPEC_VSIGN_EXTEND)
+                (match_operand:<V_double_width> 2 "s_register_operand" 
"0")))]
+  "TARGET_NEON"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], 
<MODE>mode, 0);
+    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], 
<MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
+
+    emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2]));
+    emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2]));
+    DONE;
+  }
+  [(set_attr "type" "neon_add_widen")
+   (set_attr "length" "8")])
+
  (define_insn "widen_ssum<mode>3"
    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
      (plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1205,27 @@
    [(set_attr "type" "neon_add_widen")]
  )

+(define_insn_and_split "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
+    (plus:<V_double_width> (unspec:<V_double_width>
+                   [(match_operand:VQI 1 "s_register_operand" "w")]
+                UNSPEC_VZERO_EXTEND)
+                (match_operand:<V_double_width> 2 "s_register_operand" 
"0")))]
+  "TARGET_NEON"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], 
<MODE>mode, 0);
+    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], 
<MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
+
+    emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2]));
+    emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2]));
+    DONE;
+  }
+  [(set_attr "type" "neon_add_widen")
+   (set_attr "length" "8")])
+
  (define_insn "widen_usum<mode>3"
    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
      (plus:<V_widen> (zero_extend:<V_widen>
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 0ec2c48..e9cf836 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -358,5 +358,7 @@
    UNSPEC_NVRINTX
    UNSPEC_NVRINTA
    UNSPEC_NVRINTN
+  UNSPEC_VZERO_EXTEND
+  UNSPEC_VSIGN_EXTEND
  ])

diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..94bf0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..2e9af56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..de2ad8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
+
+
+
-- 
1.9.1


-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-08-18  8:02 [ARM] Use vector wide add for mixed-mode adds Michael Collison
@ 2015-08-18 13:46 ` Ramana Radhakrishnan
  2015-08-23  4:16   ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Ramana Radhakrishnan @ 2015-08-18 13:46 UTC (permalink / raw)
  To: Michael Collison, gcc-patches



On 18/08/15 08:53, Michael Collison wrote:
> 
> This patch is designed to address code that was not being vectorized due to missing widening patterns in the ARM backend. Code such as:
> 
> int t6(int len, void * dummy, short * __restrict x)
> {
>   len = len & ~31;
>   int result = 0;
>   __asm volatile ("");
>   for (int i = 0; i < len; i++)
>     result += x[i];
>   return result;
> }
> 
> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
> 
> There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs when -flto is enabled:
> 
> gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
> gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
> 

Interesting, though not sure why that happens without some digging further.

> 
> I could use some feedback on whether this is a regression or issue with the test case.
> -------------------------------------------------------------------------------------------------------------
> 2015-08-18  Michael Collison  <michael.collison@linaro.org>
> 
>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>     where mode is VQI to improve mixed mode vectorization.
>     * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and
>     UNSPEC_VSIGN_EXTEND.
>     * gcc.target/arm/neon-vaddws16.c: New test.
>     * gcc.target/arm/neon-vaddws32.c: New test.
>     * gcc.target/arm/neon-vaddwu16.c: New test.
>     * gcc.target/arm/neon-vaddwu32.c: New test.
>     * gcc.target/arm/neon-vaddwu8.c: New test.
> 
> 
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 654d9d5..50cb409 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -1174,6 +1174,27 @@
> 
>  ;; Widening operations
> 
> +(define_insn_and_split "widen_ssum<mode>3"
> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
> +    (plus:<V_double_width> (unspec:<V_double_width>
> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
> +                UNSPEC_VSIGN_EXTEND)
> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "#"
> +  "&& reload_completed"


I notice widen_ssum and widen_usum do not have any documentation with it - can you look to provide some kind of followup documentation for these patterns in md.texi while you are here ?


> +  [(const_int 0)]
> +{
> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
> +
> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2]));
> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2]));
> +    DONE;
> +  }
> +  [(set_attr "type" "neon_add_widen")
> +   (set_attr "length" "8")])

Isn't it better to expand this into 

(set (reg:V4SI reg) (plus:V4SI (sign_extend:V4SI (vec_select:V4HI (reg:V8HI ...) 
						                  (parallel:V8HI (const_vector { 4, 5, 6, 7})))
			 	(reg:V4SI reg)))

(set (reg:V4SI reg) (plus:V4SI (sign_extend: V4SI (vec_select:V4HI (reg:V8HI)
								   (parallel: V8HI (const_vector { 0, 1, 2, 3}))))



That way we can "combine" cases where we have this kind of expressions from the intrinsics - I'm wondering about combinations from vmovl / vadd / vget_low ? 

I'd like us to avoid unspecs where we can...


regards
Ramana

> +
>  (define_insn "widen_ssum<mode>3"
>    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>      (plus:<V_widen> (sign_extend:<V_widen>
> @@ -1184,6 +1205,27 @@
>    [(set_attr "type" "neon_add_widen")]
>  )
> 
> +(define_insn_and_split "widen_usum<mode>3"
> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
> +    (plus:<V_double_width> (unspec:<V_double_width>
> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
> +                UNSPEC_VZERO_EXTEND)
> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "#"
> +  "&& reload_completed"
> +  [(const_int 0)]
> +{
> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
> +
> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2]));
> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2]));
> +    DONE;
> +  }
> +  [(set_attr "type" "neon_add_widen")
> +   (set_attr "length" "8")])
> +
>  (define_insn "widen_usum<mode>3"
>    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>      (plus:<V_widen> (zero_extend:<V_widen>
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index 0ec2c48..e9cf836 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -358,5 +358,7 @@
>    UNSPEC_NVRINTX
>    UNSPEC_NVRINTA
>    UNSPEC_NVRINTN
> +  UNSPEC_VZERO_EXTEND
> +  UNSPEC_VSIGN_EXTEND
>  ])
> 
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> new file mode 100644
> index 0000000..ed10669
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, short * __restrict x)
> +{
> +  len = len & ~31;
> +  int result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
> +
> +
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
> new file mode 100644
> index 0000000..94bf0c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +int
> +t6(int len, void * dummy, int * __restrict x)
> +{
> +  len = len & ~31;
> +  long long result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.s32" } } */
> +
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
> new file mode 100644
> index 0000000..98f8768
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, unsigned short * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned int result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw.u16" } } */
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
> new file mode 100644
> index 0000000..2e9af56
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +int
> +t6(int len, void * dummy, unsigned int * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned long long result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.u32" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
> new file mode 100644
> index 0000000..de2ad8a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, char * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned short result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.u8" } } */
> +
> +
> +

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-08-18 13:46 ` Ramana Radhakrishnan
@ 2015-08-23  4:16   ` Michael Collison
  2015-08-24  8:37     ` Ramana Radhakrishnan
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-08-23  4:16 UTC (permalink / raw)
  To: Ramana Radhakrishnan, gcc-patches

This is a modified version of the previous patch that addresses issue 
raised by Ramana. The patch now uses vect_select instead of unspec.

I had to fix an unrelated issue to the read_name function in read-md.c. 
The fix corrects broken support for mode iterators inside '<>'. Without 
this fix support for rtl expression such 'plus:<VW:V_widen>' were broken.

A second unrelated issue to this patch is correcting the documentation 
for the standard names for wide add support

This patch is designed to address code that was not being vectorized due 
to missing widening patterns in the ARM backend. Code such as:

int t6(int len, void * dummy, short * __restrict x)
{
   len = len & ~31;
   int result = 0;
   __asm volatile ("");
   for (int i = 0; i < len; i++)
     result += x[i];
   return result;
}

Validated on arm-none-eabi, arm-none-linux-gnueabi, 
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.


--------------------------------------------------------------------------------------------------------------------------------------------------
2015-08-21  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
     where mode is VQI to improve mixed mode vectorization.
     * read-md.c (read_name): Allow mode iterators inside '<>' in rtl 
expressions.
     * doc/md.texi: Rename [su]sum_widen to widen_[su]sum to reflect 
correct standard name
     * gcc.target/arm/neon-vaddws16.c: New test.
     * gcc.target/arm/neon-vaddws32.c: New test.
     * gcc.target/arm/neon-vaddwu16.c: New test.
     * gcc.target/arm/neon-vaddwu32.c: New test.
     * gcc.target/arm/neon-vaddwu8.c: New test.

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..54623fe 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,57 @@

  ;; Widening operations

+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+    (plus:<V_double_width> (sign_extend:<V_double_width> 
(match_operand:VQI 1 "s_register_operand" ""))
+                   (match_operand:<V_double_width> 2 
"s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], 
operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], 
operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 "vect_par_constant_low" 
"")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 
"vect_par_constant_high" "")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
  (define_insn "widen_ssum<mode>3"
    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
      (plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1235,57 @@
    [(set_attr "type" "neon_add_widen")]
  )

+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+    (plus:<V_double_width> (zero_extend:<V_double_width> 
(match_operand:VQI 1 "s_register_operand" ""))
+                   (match_operand:<V_double_width> 2 
"s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], 
operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], 
operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 "vect_par_constant_low" 
"")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW 
(match_operand:VQI 1 "s_register_operand" "%w")
+                           (match_operand:VQI 2 
"vect_par_constant_high" "")))
+                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 0ec229f..7af4183 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4939,10 +4939,10 @@ is of a wider mode, is computed and added to 
operand 3. Operand 3 is of a mode
  equal or wider than the mode of the absolute difference. The result is 
placed
  in operand 0, which is of the same mode as operand 3.

-@cindex @code{ssum_widen@var{m3}} instruction pattern
-@item @samp{ssum_widen@var{m3}}
-@cindex @code{usum_widen@var{m3}} instruction pattern
-@itemx @samp{usum_widen@var{m3}}
+@cindex @code{widen_ssum@var{m3}} instruction pattern
+@item @samp{widen_ssum@var{m3}}
+@cindex @code{widen_usum@var{m3}} instruction pattern
+@itemx @samp{widen_usum@var{m3}}
  Operands 0 and 2 are of the same mode, which is wider than the mode of
  operand 1. Add operand 1 to operand 2 and place the widened result in
  operand 0. (This is used express accumulation of elements into an 
accumulator
diff --git a/gcc/read-md.c b/gcc/read-md.c
index 9f158ec..df5748f 100644
--- a/gcc/read-md.c
+++ b/gcc/read-md.c
@@ -399,16 +399,24 @@ read_name (struct md_name *name)
  {
    int c;
    size_t i;
+  int in_angle_bracket;

    c = read_skip_spaces ();

    i = 0;
+  in_angle_bracket = 0;
    while (1)
      {
+      if (c == '<')
+    in_angle_bracket = 1;
+
+      if (c == '>')
+    in_angle_bracket = 0;
+
        if (c == ' ' || c == '\n' || c == '\t' || c == '\f' || c == '\r'
        || c == EOF)
      break;
-      if (c == ':' || c == ')' || c == ']' || c == '"' || c == '/'
+      if (((c == ':') and (in_angle_bracket == 0)) || c == ')' || c == 
']' || c == '"' || c == '/'
        || c == '(' || c == '[')
      {
        unread_char (c);
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..94bf0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..2e9af56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c 
b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..de2ad8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
+
+
+


On 08/18/2015 06:31 AM, Ramana Radhakrishnan wrote:
>
> On 18/08/15 08:53, Michael Collison wrote:
>> This patch is designed to address code that was not being vectorized due to missing widening patterns in the ARM backend. Code such as:
>>
>> int t6(int len, void * dummy, short * __restrict x)
>> {
>>    len = len & ~31;
>>    int result = 0;
>>    __asm volatile ("");
>>    for (int i = 0; i < len; i++)
>>      result += x[i];
>>    return result;
>> }
>>
>> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>
>> There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs when -flto is enabled:
>>
>> gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>> gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>>
> Interesting, though not sure why that happens without some digging further.
>
>> I could use some feedback on whether this is a regression or issue with the test case.
>> -------------------------------------------------------------------------------------------------------------
>> 2015-08-18  Michael Collison  <michael.collison@linaro.org>
>>
>>      * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>      where mode is VQI to improve mixed mode vectorization.
>>      * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and
>>      UNSPEC_VSIGN_EXTEND.
>>      * gcc.target/arm/neon-vaddws16.c: New test.
>>      * gcc.target/arm/neon-vaddws32.c: New test.
>>      * gcc.target/arm/neon-vaddwu16.c: New test.
>>      * gcc.target/arm/neon-vaddwu32.c: New test.
>>      * gcc.target/arm/neon-vaddwu8.c: New test.
>>
>>
>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>> index 654d9d5..50cb409 100644
>> --- a/gcc/config/arm/neon.md
>> +++ b/gcc/config/arm/neon.md
>> @@ -1174,6 +1174,27 @@
>>
>>   ;; Widening operations
>>
>> +(define_insn_and_split "widen_ssum<mode>3"
>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>> +    (plus:<V_double_width> (unspec:<V_double_width>
>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>> +                UNSPEC_VSIGN_EXTEND)
>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>> +  "TARGET_NEON"
>> +  "#"
>> +  "&& reload_completed"
>
> I notice widen_ssum and widen_usum do not have any documentation with it - can you look to provide some kind of followup documentation for these patterns in md.texi while you are here ?
>
>
>> +  [(const_int 0)]
>> +{
>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>> +
>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2]));
>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2]));
>> +    DONE;
>> +  }
>> +  [(set_attr "type" "neon_add_widen")
>> +   (set_attr "length" "8")])
> Isn't it better to expand this into
>
> (set (reg:V4SI reg) (plus:V4SI (sign_extend:V4SI (vec_select:V4HI (reg:V8HI ...)
> 						                  (parallel:V8HI (const_vector { 4, 5, 6, 7})))
> 			 	(reg:V4SI reg)))
>
> (set (reg:V4SI reg) (plus:V4SI (sign_extend: V4SI (vec_select:V4HI (reg:V8HI)
> 								   (parallel: V8HI (const_vector { 0, 1, 2, 3}))))
>
>
>
> That way we can "combine" cases where we have this kind of expressions from the intrinsics - I'm wondering about combinations from vmovl / vadd / vget_low ?
>
> I'd like us to avoid unspecs where we can...
>
>
> regards
> Ramana
>
>> +
>>   (define_insn "widen_ssum<mode>3"
>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>       (plus:<V_widen> (sign_extend:<V_widen>
>> @@ -1184,6 +1205,27 @@
>>     [(set_attr "type" "neon_add_widen")]
>>   )
>>
>> +(define_insn_and_split "widen_usum<mode>3"
>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>> +    (plus:<V_double_width> (unspec:<V_double_width>
>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>> +                UNSPEC_VZERO_EXTEND)
>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>> +  "TARGET_NEON"
>> +  "#"
>> +  "&& reload_completed"
>> +  [(const_int 0)]
>> +{
>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>> +
>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2]));
>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2]));
>> +    DONE;
>> +  }
>> +  [(set_attr "type" "neon_add_widen")
>> +   (set_attr "length" "8")])
>> +
>>   (define_insn "widen_usum<mode>3"
>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>       (plus:<V_widen> (zero_extend:<V_widen>
>> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
>> index 0ec2c48..e9cf836 100644
>> --- a/gcc/config/arm/unspecs.md
>> +++ b/gcc/config/arm/unspecs.md
>> @@ -358,5 +358,7 @@
>>     UNSPEC_NVRINTX
>>     UNSPEC_NVRINTA
>>     UNSPEC_NVRINTN
>> +  UNSPEC_VZERO_EXTEND
>> +  UNSPEC_VSIGN_EXTEND
>>   ])
>>
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> new file mode 100644
>> index 0000000..ed10669
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, short * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  int result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
>> +
>> +
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>> new file mode 100644
>> index 0000000..94bf0c9
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>> @@ -0,0 +1,19 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +int
>> +t6(int len, void * dummy, int * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  long long result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.s32" } } */
>> +
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>> new file mode 100644
>> index 0000000..98f8768
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, unsigned short * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned int result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw.u16" } } */
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>> new file mode 100644
>> index 0000000..2e9af56
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +int
>> +t6(int len, void * dummy, unsigned int * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned long long result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.u32" } } */
>> +
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>> new file mode 100644
>> index 0000000..de2ad8a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>> +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, char * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  unsigned short result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.u8" } } */
>> +
>> +
>> +

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-08-23  4:16   ` Michael Collison
@ 2015-08-24  8:37     ` Ramana Radhakrishnan
  0 siblings, 0 replies; 15+ messages in thread
From: Ramana Radhakrishnan @ 2015-08-24  8:37 UTC (permalink / raw)
  To: Michael Collison, gcc-patches



On 22/08/15 22:38, Michael Collison wrote:
> This is a modified version of the previous patch that addresses issue raised by Ramana. The patch now uses vect_select instead of unspec.
> 
> I had to fix an unrelated issue to the read_name function in read-md.c. The fix corrects broken support for mode iterators inside '<>'. Without this fix support for rtl expression such 'plus:<VW:V_widen>' were broken.

Please pull out unrelated patches out as separate patch submissions. I note that this fixes PR 57915 and therefore a fix for that must go as a separate patch. Please indicate that this PR is fixed by saying so in the Changelog. Furthermore, this isn't an area that I'm able to review or approve and therefore should be kept separate. 

> 
> A second unrelated issue to this patch is correcting the documentation for the standard names for wide add support

Again unrelated issue - separate patch please.

> 
> This patch is designed to address code that was not being vectorized due to missing widening patterns in the ARM backend. Code such as:
> 
> int t6(int len, void * dummy, short * __restrict x)
> {
>   len = len & ~31;
>   int result = 0;
>   __asm volatile ("");
>   for (int i = 0; i < len; i++)
>     result += x[i];
>   return result;
> }
> 
> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.

I have this on my stack of patches to look at and will get back once I've had a chance to go through it in some detail. A skim read suggests this is the direction I wanted to go with this patch, but I'd like to read the logic a bit carefully and also look at whether this makes some other patterns in the backend now redundant.

Thanks,
Ramana

> 
> 
> --------------------------------------------------------------------------------------------------------------------------------------------------
> 2015-08-21  Michael Collison  <michael.collison@linaro.org>
> 
>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>     where mode is VQI to improve mixed mode vectorization.
>     * read-md.c (read_name): Allow mode iterators inside '<>' in rtl expressions.
>     * doc/md.texi: Rename [su]sum_widen to widen_[su]sum to reflect correct standard name
>     * gcc.target/arm/neon-vaddws16.c: New test.
>     * gcc.target/arm/neon-vaddws32.c: New test.
>     * gcc.target/arm/neon-vaddwu16.c: New test.
>     * gcc.target/arm/neon-vaddwu32.c: New test.
>     * gcc.target/arm/neon-vaddwu8.c: New test.
> 
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 654d9d5..54623fe 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -1174,6 +1174,57 @@
> 
>  ;; Widening operations
> 
> +(define_expand "widen_ssum<mode>3"
> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
> +    (plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
> +                   (match_operand:<V_double_width> 2 "s_register_operand" "")))]
> +  "TARGET_NEON"
> +  {
> +    int i;
> +    int half_elem = <V_mode_nunits>/2;
> +    rtvec v1 = rtvec_alloc (half_elem);
> +    rtvec v2 = rtvec_alloc (half_elem);
> +    rtx p1, p2;
> +
> +    for (i = 0; i < half_elem; i++)
> +      RTVEC_ELT (v1, i) = GEN_INT (i);
> +    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
> +
> +    for (i = half_elem; i < <V_mode_nunits>; i++)
> +      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
> +    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
> +
> +    if (operands[0] != operands[2])
> +      emit_move_insn (operands[0], operands[2]);
> +
> +    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
> +    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
> +    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
> +                           (match_operand:VQI 2 "vect_par_constant_low" "")))
> +                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
> +  [(set_attr "type" "neon_add_widen")
> +  (set_attr "length" "8")]
> +)
> +
> +(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
> +    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
> +                           (match_operand:VQI 2 "vect_par_constant_high" "")))
> +                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
> +  [(set_attr "type" "neon_add_widen")
> +  (set_attr "length" "8")]
> +)
> +
>  (define_insn "widen_ssum<mode>3"
>    [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>      (plus:<V_widen> (sign_extend:<V_widen>
> @@ -1184,6 +1235,57 @@
>    [(set_attr "type" "neon_add_widen")]
>  )
> 
> +(define_expand "widen_usum<mode>3"
> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
> +    (plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
> +                   (match_operand:<V_double_width> 2 "s_register_operand" "")))]
> +  "TARGET_NEON"
> +  {
> +    int i;
> +    int half_elem = <V_mode_nunits>/2;
> +    rtvec v1 = rtvec_alloc (half_elem);
> +    rtvec v2 = rtvec_alloc (half_elem);
> +    rtx p1, p2;
> +
> +    for (i = 0; i < half_elem; i++)
> +      RTVEC_ELT (v1, i) = GEN_INT (i);
> +    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
> +
> +    for (i = half_elem; i < <V_mode_nunits>; i++)
> +      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
> +    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
> +
> +    if (operands[0] != operands[2])
> +      emit_move_insn (operands[0], operands[2]);
> +
> +    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
> +    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
> +    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
> +                           (match_operand:VQI 2 "vect_par_constant_low" "")))
> +                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
> +  [(set_attr "type" "neon_add_widen")
> +  (set_attr "length" "8")]
> +)
> +
> +(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
> +    (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
> +                           (match_operand:VQI 2 "vect_par_constant_high" "")))
> +                (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
> +  "TARGET_NEON"
> +  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
> +  [(set_attr "type" "neon_add_widen")
> +  (set_attr "length" "8")]
> +)
> +
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 0ec229f..7af4183 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4939,10 +4939,10 @@ is of a wider mode, is computed and added to operand 3. Operand 3 is of a mode
>  equal or wider than the mode of the absolute difference. The result is placed
>  in operand 0, which is of the same mode as operand 3.
> 
> -@cindex @code{ssum_widen@var{m3}} instruction pattern
> -@item @samp{ssum_widen@var{m3}}
> -@cindex @code{usum_widen@var{m3}} instruction pattern
> -@itemx @samp{usum_widen@var{m3}}
> +@cindex @code{widen_ssum@var{m3}} instruction pattern
> +@item @samp{widen_ssum@var{m3}}
> +@cindex @code{widen_usum@var{m3}} instruction pattern
> +@itemx @samp{widen_usum@var{m3}}
>  Operands 0 and 2 are of the same mode, which is wider than the mode of
>  operand 1. Add operand 1 to operand 2 and place the widened result in
>  operand 0. (This is used express accumulation of elements into an accumulator
> diff --git a/gcc/read-md.c b/gcc/read-md.c
> index 9f158ec..df5748f 100644
> --- a/gcc/read-md.c
> +++ b/gcc/read-md.c
> @@ -399,16 +399,24 @@ read_name (struct md_name *name)
>  {
>    int c;
>    size_t i;
> +  int in_angle_bracket;
> 
>    c = read_skip_spaces ();
> 
>    i = 0;
> +  in_angle_bracket = 0;
>    while (1)
>      {
> +      if (c == '<')
> +    in_angle_bracket = 1;
> +
> +      if (c == '>')
> +    in_angle_bracket = 0;
> +
>        if (c == ' ' || c == '\n' || c == '\t' || c == '\f' || c == '\r'
>        || c == EOF)
>      break;
> -      if (c == ':' || c == ')' || c == ']' || c == '"' || c == '/'
> +      if (((c == ':') and (in_angle_bracket == 0)) || c == ')' || c == ']' || c == '"' || c == '/'
>        || c == '(' || c == '[')
>      {
>        unread_char (c);
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> new file mode 100644
> index 0000000..ed10669
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, short * __restrict x)
> +{
> +  len = len & ~31;
> +  int result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
> +
> +
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
> new file mode 100644
> index 0000000..94bf0c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +int
> +t6(int len, void * dummy, int * __restrict x)
> +{
> +  len = len & ~31;
> +  long long result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.s32" } } */
> +
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
> new file mode 100644
> index 0000000..98f8768
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, unsigned short * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned int result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw.u16" } } */
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
> new file mode 100644
> index 0000000..2e9af56
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +int
> +t6(int len, void * dummy, unsigned int * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned long long result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.u32" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
> new file mode 100644
> index 0000000..de2ad8a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
> +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, char * __restrict x)
> +{
> +  len = len & ~31;
> +  unsigned short result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.u8" } } */
> +
> +
> +
> 
> 
> On 08/18/2015 06:31 AM, Ramana Radhakrishnan wrote:
>>
>> On 18/08/15 08:53, Michael Collison wrote:
>>> This patch is designed to address code that was not being vectorized due to missing widening patterns in the ARM backend. Code such as:
>>>
>>> int t6(int len, void * dummy, short * __restrict x)
>>> {
>>>    len = len & ~31;
>>>    int result = 0;
>>>    __asm volatile ("");
>>>    for (int i = 0; i < len; i++)
>>>      result += x[i];
>>>    return result;
>>> }
>>>
>>> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>>
>>> There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs when -flto is enabled:
>>>
>>> gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>>> gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
>>>
>> Interesting, though not sure why that happens without some digging further.
>>
>>> I could use some feedback on whether this is a regression or issue with the test case.
>>> -------------------------------------------------------------------------------------------------------------
>>> 2015-08-18  Michael Collison  <michael.collison@linaro.org>
>>>
>>>      * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>>      where mode is VQI to improve mixed mode vectorization.
>>>      * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and
>>>      UNSPEC_VSIGN_EXTEND.
>>>      * gcc.target/arm/neon-vaddws16.c: New test.
>>>      * gcc.target/arm/neon-vaddws32.c: New test.
>>>      * gcc.target/arm/neon-vaddwu16.c: New test.
>>>      * gcc.target/arm/neon-vaddwu32.c: New test.
>>>      * gcc.target/arm/neon-vaddwu8.c: New test.
>>>
>>>
>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>> index 654d9d5..50cb409 100644
>>> --- a/gcc/config/arm/neon.md
>>> +++ b/gcc/config/arm/neon.md
>>> @@ -1174,6 +1174,27 @@
>>>
>>>   ;; Widening operations
>>>
>>> +(define_insn_and_split "widen_ssum<mode>3"
>>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>>> +    (plus:<V_double_width> (unspec:<V_double_width>
>>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>>> +                UNSPEC_VSIGN_EXTEND)
>>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>>> +  "TARGET_NEON"
>>> +  "#"
>>> +  "&& reload_completed"
>>
>> I notice widen_ssum and widen_usum do not have any documentation with it - can you look to provide some kind of followup documentation for these patterns in md.texi while you are here ?
>>
>>
>>> +  [(const_int 0)]
>>> +{
>>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>>> +
>>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2]));
>>> +    emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2]));
>>> +    DONE;
>>> +  }
>>> +  [(set_attr "type" "neon_add_widen")
>>> +   (set_attr "length" "8")])
>> Isn't it better to expand this into
>>
>> (set (reg:V4SI reg) (plus:V4SI (sign_extend:V4SI (vec_select:V4HI (reg:V8HI ...)
>>                                           (parallel:V8HI (const_vector { 4, 5, 6, 7})))
>>                  (reg:V4SI reg)))
>>
>> (set (reg:V4SI reg) (plus:V4SI (sign_extend: V4SI (vec_select:V4HI (reg:V8HI)
>>                                    (parallel: V8HI (const_vector { 0, 1, 2, 3}))))
>>
>>
>>
>> That way we can "combine" cases where we have this kind of expressions from the intrinsics - I'm wondering about combinations from vmovl / vadd / vget_low ?
>>
>> I'd like us to avoid unspecs where we can...
>>
>>
>> regards
>> Ramana
>>
>>> +
>>>   (define_insn "widen_ssum<mode>3"
>>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>>       (plus:<V_widen> (sign_extend:<V_widen>
>>> @@ -1184,6 +1205,27 @@
>>>     [(set_attr "type" "neon_add_widen")]
>>>   )
>>>
>>> +(define_insn_and_split "widen_usum<mode>3"
>>> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w")
>>> +    (plus:<V_double_width> (unspec:<V_double_width>
>>> +                   [(match_operand:VQI 1 "s_register_operand" "w")]
>>> +                UNSPEC_VZERO_EXTEND)
>>> +                (match_operand:<V_double_width> 2 "s_register_operand" "0")))]
>>> +  "TARGET_NEON"
>>> +  "#"
>>> +  "&& reload_completed"
>>> +  [(const_int 0)]
>>> +{
>>> +    rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
>>> +    rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, GET_MODE_SIZE (<V_HALF>mode));
>>> +
>>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2]));
>>> +    emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2]));
>>> +    DONE;
>>> +  }
>>> +  [(set_attr "type" "neon_add_widen")
>>> +   (set_attr "length" "8")])
>>> +
>>>   (define_insn "widen_usum<mode>3"
>>>     [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
>>>       (plus:<V_widen> (zero_extend:<V_widen>
>>> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
>>> index 0ec2c48..e9cf836 100644
>>> --- a/gcc/config/arm/unspecs.md
>>> +++ b/gcc/config/arm/unspecs.md
>>> @@ -358,5 +358,7 @@
>>>     UNSPEC_NVRINTX
>>>     UNSPEC_NVRINTA
>>>     UNSPEC_NVRINTN
>>> +  UNSPEC_VZERO_EXTEND
>>> +  UNSPEC_VSIGN_EXTEND
>>>   ])
>>>
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>>> new file mode 100644
>>> index 0000000..ed10669
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>>> @@ -0,0 +1,21 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>> +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +
>>> +int
>>> +t6(int len, void * dummy, short * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  int result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
>>> +
>>> +
>>> +
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>>> new file mode 100644
>>> index 0000000..94bf0c9
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
>>> @@ -0,0 +1,19 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>> +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +int
>>> +t6(int len, void * dummy, int * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  long long result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw\.s32" } } */
>>> +
>>> +
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>>> new file mode 100644
>>> index 0000000..98f8768
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
>>> @@ -0,0 +1,18 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>> +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +
>>> +int
>>> +t6(int len, void * dummy, unsigned short * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  unsigned int result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw.u16" } } */
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>>> new file mode 100644
>>> index 0000000..2e9af56
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
>>> @@ -0,0 +1,18 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>> +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +int
>>> +t6(int len, void * dummy, unsigned int * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  unsigned long long result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw\.u32" } } */
>>> +
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>>> new file mode 100644
>>> index 0000000..de2ad8a
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
>>> @@ -0,0 +1,21 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>> +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +
>>> +int
>>> +t6(int len, void * dummy, char * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  unsigned short result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw\.u8" } } */
>>> +
>>> +
>>> +
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2016-02-09 16:27     ` Kyrill Tkachov
@ 2016-02-15  6:32       ` Michael Collison
  0 siblings, 0 replies; 15+ messages in thread
From: Michael Collison @ 2016-02-15  6:32 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc Patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 6465 bytes --]

Hi Kyrill,

I made the following changes based on your comments:

1. I rebased the patch so that it applies cleanly on trunk
2. Fixed the dg-add-options as requested to my new test cases
3. Fixed the GNU style issues identified by ./contrib/check_GNU_style.sh

The failure you are seeing on slp-reduc-3.c is a known failure. The test 
case has a xfail with 'xfail { vect_widen_sum_hi_to_si_pattern' which I 
added in my patch. Richard Biener resolved some of these issues with PR 
68333, but 'slp-reduc-3.c' still fails. I will create a new PR.

I retested on the Linaro testing infrastructure with the latest trunk 
and the only failure is 'slp-reduc-3.c'. Okay for GCC 7?

2016-02-12 Michael Collison <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
     mode is VQI to improve mixed mode vectorization.
     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of unsigned vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of unsigned vaddw.
     * config/arm/arm.c (arm_simd_vect_par_cnst_half): New function.
     (arm_simd_check_vect_par_cnst_half_p): Likewise.
     * config/arm/arm-protos.h (arm_simd_vect_par_cnst_half): Prototype
     for new function.
     (arm_simd_check_vect_par_cnst_half_p): Likewise.
     * config/arm/predicates.md (vect_par_constant_high): Support
     big endian and simplify by calling
     arm_simd_check_vect_par_cnst_half
     (vect_par_constant_low): Likewise.
     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
     * testsuite/lib/target-supports.exp
     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
     that arm neon support vector widen sum of HImode TO SImode.

On 02/09/2016 09:27 AM, Kyrill Tkachov wrote:
> Hi Michael,
>
> On 17/12/15 00:02, Michael Collison wrote:
>> Kyrill,
>>
>> I have attached a patch that address your comments. The only change I 
>> would ask you to re-consider renaming is the function 'bool 
>> aarch32_simd_check_vect_par_cnst_half'. This function was copied from 
>> the aarch64 port and I thought it as important to match the naming 
>> for maintenance purposes. I did rename the function to 'bool 
>> arm_simd_check_vect_par_cnst_half_p'. I changed 'aarch32' to 'arm' 
>> and added '_p' per you suggestions. Is this okay?
>>
>
> Ok, that's fine with me.
>
>> I implemented all your other change suggestions.
>>
>
> Thanks, sorry it took a long time to get back to this, I was busy with 
> regression-fixing patches as we're
> in bug-fixing mode...
>
>> 2015-12-16 Michael Collison <michael.collison@linaro.org>
>>
>>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
>>     mode is VQI to improve mixed mode vectorization.
>>     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match low half of signed vaddw.
>>     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match high half of signed vaddw.
>>     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match low half of unsigned vaddw.
>>     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match high half of unsigned vaddw.
>>     * config/arm/arm.c (arm_simd_vect_par_cnst_half): New function.
>>     (arm_simd_check_vect_par_cnst_half_p): Likewise.
>>     * config/arm/arm-protos.h (arm_simd_vect_par_cnst_half): Prototype
>>     for new function.
>>     (arm_simd_check_vect_par_cnst_half_p): Likewise.
>>     * config/arm/predicates.md (vect_par_constant_high): Support
>>     big endian and simplify by calling
>>     arm_simd_check_vect_par_cnst_half
>>     (vect_par_constant_low): Likewise.
>>     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>>     * testsuite/lib/target-supports.exp
>>     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>>     that arm neon support vector widen sum of HImode TO SImode.
>>
>
> I've tried this out and I have a few comments.
> The arm.c hunk doesn't apply to current trunk anymore due to context.
> Can you please rebase the patch?
> I've fixed it up manually in my tree so I can build it.
> With this patch I'm seeing two PASS->FAIL on arm-none-eabi:
> FAIL: gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects 
> scan-tree-dump-times vect "vectorizing stmts using SLP" 1
> FAIL: gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing 
> stmts using SLP" 1
> My compiler is configured with --with-float=hard --with-cpu=cortex-a9 
> --with-fpu=neon --with-mode=thumb
> Can you please look into these? Maybe it's just the tests that need 
> adjustment?
>
> Also, I'm seeing the new tests give an error:
> ERROR: gcc.target/arm/neon-vaddws16.c: Unrecognized option type: 
> arm_neon_ok for " dg-add-options 3 arm_neon_ok "
> UNRESOLVED: gcc.target/arm/neon-vaddws16.c: Unrecognized option type: 
> arm_neon_ok for " dg-add-options 3 arm_neon_ok "
>
> That've because the dg-add-options argument should be arm_neon rather 
> than arm_neon_ok.
> Also, since the new tests are compile-only the effective target check 
> should be arm_neon_ok rather than arm_neon_hw.
>
> I also see ./contrib/check_GNU_style.sh complaining about some minor 
> style issues like trailing whitespace and
> blocks of whitespace that should be replaced with tabs.
>
> In any case, this patch is GCC 7 material at this point, so I think 
> with the above issues resolved
> (and the FAILs investigated) this should be in good shape.
>
> Thanks,
> Kyrill

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org


[-- Attachment #2: tcwg_833_upstream_feb_12.patch --]
[-- Type: text/x-patch, Size: 14776 bytes --]

From f3d167389cce45ecbd62bb4b1da754ba629ce32f Mon Sep 17 00:00:00 2001
From: Michael Collison <michael.collison@linaro.org>
Date: Wed, 10 Feb 2016 22:13:26 -0700
Subject: [PATCH] patches for tcwg 833

Fix GNU style issues

GNU formatting changes pt. 2

GNU formatting changes pt. 3

GNU formatting changes pt. 4

Fix inadverdent change

Fix trailing whitespace

Fix another inadverdant change

Fix incorrect application of patch

Fix > 80 character line length issue

Fix trailing whitespace

Fix order of dg-options
---
 gcc/config/arm/arm-protos.h                  |   4 +-
 gcc/config/arm/arm.c                         |  76 ++++++++++++++++
 gcc/config/arm/neon.md                       | 125 ++++++++++++++++++++++++++-
 gcc/config/arm/predicates.md                 |  50 +----------
 gcc/testsuite/gcc.target/arm/neon-vaddws16.c |  19 ++++
 gcc/testsuite/gcc.target/arm/neon-vaddws32.c |  18 ++++
 gcc/testsuite/gcc.target/arm/neon-vaddwu16.c |  18 ++++
 gcc/testsuite/gcc.target/arm/neon-vaddwu32.c |  18 ++++
 gcc/testsuite/gcc.target/arm/neon-vaddwu8.c  |  19 ++++
 gcc/testsuite/lib/target-supports.exp        |   2 +
 10 files changed, 296 insertions(+), 53 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/neon-vaddws16.c
 create mode 100644 gcc/testsuite/gcc.target/arm/neon-vaddws32.c
 create mode 100644 gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
 create mode 100644 gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
 create mode 100644 gcc/testsuite/gcc.target/arm/neon-vaddwu8.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 0083673..d8179c4 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
 			      ATTRIBUTE_UNUSED);
 extern void arm_init_builtins (void);
 extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
-
+extern rtx arm_simd_vect_par_cnst_half (machine_mode mode, bool high);
+extern bool arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
+						 bool high);
 #ifdef RTX_CODE
 extern bool arm_vector_mode_supported_p (machine_mode);
 extern bool arm_small_register_classes_for_mode_p (machine_mode);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 82becef..7ac34bb 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30239,4 +30239,80 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
   return;
 }
 
+
+/* Construct and return a PARALLEL RTX vector with elements numbering the
+   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
+   the vector - from the perspective of the architecture.  This does not
+   line up with GCC's perspective on lane numbers, so we end up with
+   different masks depending on our target endian-ness.  The diagram
+   below may help.  We must draw the distinction when building masks
+   which select one half of the vector.  An instruction selecting
+   architectural low-lanes for a big-endian target, must be described using
+   a mask selecting GCC high-lanes.
+
+                 Big-Endian             Little-Endian
+
+GCC             0   1   2   3           3   2   1   0
+              | x | x | x | x |       | x | x | x | x |
+Architecture    3   2   1   0           3   2   1   0
+
+Low Mask:         { 2, 3 }                { 0, 1 }
+High Mask:        { 0, 1 }                { 2, 3 }
+*/
+
+rtx
+arm_simd_vect_par_cnst_half (machine_mode mode, bool high)
+{
+  int nunits = GET_MODE_NUNITS (mode);
+  rtvec v = rtvec_alloc (nunits / 2);
+  int high_base = nunits / 2;
+  int low_base = 0;
+  int base;
+  rtx t1;
+  int i;
+
+  if (BYTES_BIG_ENDIAN)
+    base = high ? low_base : high_base;
+  else
+    base = high ? high_base : low_base;
+
+  for (i = 0; i < nunits / 2; i++)
+    RTVEC_ELT (v, i) = GEN_INT (base + i);
+
+  t1 = gen_rtx_PARALLEL (mode, v);
+  return t1;
+}
+
+/* Check OP for validity as a PARALLEL RTX vector with elements
+   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
+   from the perspective of the architecture.  See the diagram above
+   arm_simd_vect_par_cnst_half_p for more details.  */
+
+bool
+arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
+				       bool high)
+{
+  rtx ideal = arm_simd_vect_par_cnst_half (mode, high);
+  HOST_WIDE_INT count_op = XVECLEN (op, 0);
+  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
+  int i = 0;
+
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  if (count_op != count_ideal)
+    return false;
+
+  for (i = 0; i < count_ideal; i++)
+    {
+      rtx elt_op = XVECEXP (op, 0, i);
+      rtx elt_ideal = XVECEXP (ideal, 0, i);
+
+      if (!CONST_INT_P (elt_op)
+	  || INTVAL (elt_ideal) != INTVAL (elt_op))
+	return false;
+    }
+  return true;
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index f495d40..754d394 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1204,16 +1204,133 @@
 
 ;; Widening operations
 
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width>
+	 (sign_extend:<V_double_width>
+	  (match_operand:VQI 1 "s_register_operand" ""))
+	 (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+
+    p1  = arm_simd_vect_par_cnst_half (mode, false);
+    p2  = arm_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0],
+							 operands[1],
+							 p1,
+							 operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0],
+							 operands[1],
+							 p2,
+							 operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen>
+	 (sign_extend:<VW:V_widen>
+	  (vec_select:VW
+	   (match_operand:VQI 1 "s_register_operand" "%w")
+	   (match_operand:VQI 2 "vect_par_constant_low" "")))
+	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen>
+	 (sign_extend:<VW:V_widen>
+	  (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+			 (match_operand:VQI 2 "vect_par_constant_high" "")))
+	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_ssum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
-	(plus:<V_widen> (sign_extend:<V_widen>
-			  (match_operand:VW 1 "s_register_operand" "%w"))
-		        (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+	(plus:<V_widen>
+	 (sign_extend:<V_widen>
+	  (match_operand:VW 1 "s_register_operand" "%w"))
+	 (match_operand:<V_widen> 2 "s_register_operand" "w")))]
   "TARGET_NEON"
   "vaddw.<V_s_elem>\t%q0, %q2, %P1"
   [(set_attr "type" "neon_add_widen")]
 )
 
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width>
+	 (zero_extend:<V_double_width>
+	  (match_operand:VQI 1 "s_register_operand" ""))
+	 (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+
+    p1  = arm_simd_vect_par_cnst_half (mode, false);
+    p2  = arm_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0],
+							 operands[1],
+							 p1,
+							 operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0],
+							 operands[1],
+							 p2,
+							 operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen>
+	 (zero_extend:<VW:V_widen>
+	  (vec_select:VW
+	   (match_operand:VQI 1 "s_register_operand" "%w")
+	   (match_operand:VQI 2 "vect_par_constant_low" "")))
+	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen>
+	 (zero_extend:<VW:V_widen>
+	  (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+			 (match_operand:VQI 2 "vect_par_constant_high" "")))
+	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_usum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (zero_extend:<V_widen>
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 41a6ea4..a21f675 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -605,59 +605,13 @@
 (define_special_predicate "vect_par_constant_high" 
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != (base/2) + i)
-       return false;
-   }
-  return true; 
+  return arm_simd_check_vect_par_cnst_half_p (op, mode, true);
 })
 
 (define_special_predicate "vect_par_constant_low"
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != i)
-       return false;
-   } 
-  return true; 
+  return arm_simd_check_vect_par_cnst_half_p (op, mode, false);
 })
 
 (define_predicate "const_double_vcvt_power_of_two_reciprocal"
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..8281134
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+
+
+int
+t6 (int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..8c18691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+
+int
+t6 (int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..580bb06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+
+int
+t6 (int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..21b0633
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+
+int
+t6 (int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..d350ed5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+
+
+int
+t6 (int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 645981a..01d72a5 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4313,6 +4313,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
         set et_vect_widen_sum_hi_to_si_pattern_saved 0
         if { [istarget powerpc*-*-*]
              || [istarget aarch64*-*-*]
+	     || ([istarget arm*-*-*] &&
+		 [check_effective_target_arm_neon_ok])
              || [istarget ia64-*-*] } {
             set et_vect_widen_sum_hi_to_si_pattern_saved 1
         }
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-12-17  0:02   ` Michael Collison
@ 2016-02-09 16:27     ` Kyrill Tkachov
  2016-02-15  6:32       ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Kyrill Tkachov @ 2016-02-09 16:27 UTC (permalink / raw)
  To: Michael Collison, gcc Patches, Ramana Radhakrishnan

Hi Michael,

On 17/12/15 00:02, Michael Collison wrote:
> Kyrill,
>
> I have attached a patch that address your comments. The only change I would ask you to re-consider renaming is the function 'bool aarch32_simd_check_vect_par_cnst_half'. This function was copied from the aarch64 port and I thought it as 
> important to match the naming for maintenance purposes. I did rename the function to 'bool arm_simd_check_vect_par_cnst_half_p'. I changed 'aarch32' to 'arm' and added '_p' per you suggestions. Is this okay?
>

Ok, that's fine with me.

> I implemented all your other change suggestions.
>

Thanks, sorry it took a long time to get back to this, I was busy with regression-fixing patches as we're
in bug-fixing mode...

> 2015-12-16 Michael Collison  <michael.collison@linaro.org>
>
>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
>     mode is VQI to improve mixed mode vectorization.
>     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
>     define_insn to match low half of signed vaddw.
>     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
>     define_insn to match high half of signed vaddw.
>     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
>     define_insn to match low half of unsigned vaddw.
>     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
>     define_insn to match high half of unsigned vaddw.
>     * config/arm/arm.c (arm_simd_vect_par_cnst_half): New function.
>     (arm_simd_check_vect_par_cnst_half_p): Likewise.
>     * config/arm/arm-protos.h (arm_simd_vect_par_cnst_half): Prototype
>     for new function.
>     (arm_simd_check_vect_par_cnst_half_p): Likewise.
>     * config/arm/predicates.md (vect_par_constant_high): Support
>     big endian and simplify by calling
>     arm_simd_check_vect_par_cnst_half
>     (vect_par_constant_low): Likewise.
>     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>     * testsuite/lib/target-supports.exp
>     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>     that arm neon support vector widen sum of HImode TO SImode.
>

I've tried this out and I have a few comments.
The arm.c hunk doesn't apply to current trunk anymore due to context.
Can you please rebase the patch?
I've fixed it up manually in my tree so I can build it.
With this patch I'm seeing two PASS->FAIL on arm-none-eabi:
FAIL: gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects scan-tree-dump-times vect "vectorizing stmts using SLP" 1
FAIL: gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
My compiler is configured with --with-float=hard --with-cpu=cortex-a9 --with-fpu=neon --with-mode=thumb
Can you please look into these? Maybe it's just the tests that need adjustment?

Also, I'm seeing the new tests give an error:
ERROR: gcc.target/arm/neon-vaddws16.c: Unrecognized option type: arm_neon_ok for " dg-add-options 3 arm_neon_ok "
UNRESOLVED: gcc.target/arm/neon-vaddws16.c: Unrecognized option type: arm_neon_ok for " dg-add-options 3 arm_neon_ok "

That've because the dg-add-options argument should be arm_neon rather than arm_neon_ok.
Also, since the new tests are compile-only the effective target check should be arm_neon_ok rather than arm_neon_hw.

I also see ./contrib/check_GNU_style.sh complaining about some minor style issues like trailing whitespace and
blocks of whitespace that should be replaced with tabs.

In any case, this patch is GCC 7 material at this point, so I think with the above issues resolved
(and the FAILs investigated) this should be in good shape.

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-12-10 15:09 ` Kyrill Tkachov
@ 2015-12-17  0:02   ` Michael Collison
  2016-02-09 16:27     ` Kyrill Tkachov
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-12-17  0:02 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc Patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 7237 bytes --]

Kyrill,

I have attached a patch that address your comments. The only change I 
would ask you to re-consider renaming is the function 'bool 
aarch32_simd_check_vect_par_cnst_half'. This function was copied from 
the aarch64 port and I thought it as important to match the naming for 
maintenance purposes. I did rename the function to 'bool 
arm_simd_check_vect_par_cnst_half_p'. I changed 'aarch32' to 'arm' and 
added '_p' per you suggestions. Is this okay?

I implemented all your other change suggestions.

2015-12-16  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
     mode is VQI to improve mixed mode vectorization.
     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of unsigned vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of unsigned vaddw.
     * config/arm/arm.c (arm_simd_vect_par_cnst_half): New function.
     (arm_simd_check_vect_par_cnst_half_p): Likewise.
     * config/arm/arm-protos.h (arm_simd_vect_par_cnst_half): Prototype
     for new function.
     (arm_simd_check_vect_par_cnst_half_p): Likewise.
     * config/arm/predicates.md (vect_par_constant_high): Support
     big endian and simplify by calling
     arm_simd_check_vect_par_cnst_half
     (vect_par_constant_low): Likewise.
     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
     * testsuite/lib/target-supports.exp
     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
     that arm neon support vector widen sum of HImode TO SImode.

On 12/10/2015 08:09 AM, Kyrill Tkachov wrote:
> Hi Michael,
>
> A few comments while I look deeper into this patch...
>
> On 30/11/15 01:18, Michael Collison wrote:
>>
>> This is a modified version of my previous patch that supports vector 
>> wide add. I added support for vaddw on big endian when generating the 
>> parallel operand for the vector select.
>>
>> There are four failing test cases on arm big endian with similar 
>> code. They are:
>>
>> gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test
>>
>>
>> The failures occur without my patch and are related to a bug with 
>> vector loads using VUZP operations.
>>
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68532
>>
>> Validated on arm-none-eabi, arm-none-linux-gnueabi, 
>> arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>
>> 2015-11-29  Michael Collison <michael.collison@linaro.org>
>>
>>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
>>     mode is VQI to improve mixed mode vectorization.
>>     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match low half of signed vaddw.
>>     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match high half of signed vaddw.
>>     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match low half of unsigned vaddw.
>>     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): 
>> New
>>     define_insn to match high half of unsigned vaddw.
>>     * config/arm/arm.c (aarch32_simd_vect_par_cnst_half): New function.
>>     (aarch32_simd_check_vect_par_cnst_half): Likewise.
>>     * config/arm/arm-protos.h (aarch32_simd_vect_par_cnst_half): 
>> Prototype
>>     for new function.
>>     (aarch32_simd_check_vect_par_cnst_half): Likewise.
>>     * config/arm/predicates.md (vect_par_constant_high): Support
>>     big endian and simplify by calling
>>     aarch32_simd_check_vect_par_cnst_half
>>     (vect_par_constant_low): Likewise.
>>     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>>     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>>     * testsuite/lib/target-supports.exp
>>     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>>     that arm neon support vector widen sum of HImode TO SImode.
>>
>> Okay for trunk?
>>
>
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool 
> initialize_p
>                    ATTRIBUTE_UNUSED);
>  extern void arm_init_builtins (void);
>  extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, 
> tree *update);
> -
> +extern rtx aarch32_simd_vect_par_cnst_half (machine_mode mode, bool 
> high);
> +extern bool aarch32_simd_check_vect_par_cnst_half (rtx op, 
> machine_mode mode,
> +                           bool high);
>
>
> Please use arm instead of aarch32 in the name to be consistent with 
> the rest of the
> backend. Also, for functions that return a bool without side-effects 
> it's preferable
> to finish their name with '_p'. So for the second one I'd drop the 
> 'check' and call
> it something like "arm_vector_of_lane_nums_p ", is that a more 
> descriptive name?
>
> +/* Check OP for validity as a PARALLEL RTX vector with elements
> +   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
> +   from the perspective of the architecture.  See the diagram above
> +   aarch64_simd_vect_par_cnst_half for more details.  */
> +
>
> aarch64?
>
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -1174,6 +1174,51 @@
>
>  ;; Widening operations
>
> +(define_expand "widen_ssum<mode>3"
> +  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
> +    (plus:<V_double_width> (sign_extend:<V_double_width> 
> (match_operand:VQI 1 "s_register_operand" ""))
> +                   (match_operand:<V_double_width> 2 
> "s_register_operand" "")))]
> +  "TARGET_NEON"
> +  {
> +    machine_mode mode = GET_MODE (operands[1]);
> +    rtx p1, p2;
> +
> +    p1  = aarch32_simd_vect_par_cnst_half (mode, false);
> +    p2  = aarch32_simd_vect_par_cnst_half (mode, true);
> +
> +    if (operands[0] != operands[2])
> +      emit_move_insn (operands[0], operands[2]);
> +
> +    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], 
> operands[1], p1, operands[0]));
> +    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], 
> operands[1], p2, operands[0]));
> +    DONE;
> +  }
>
> Please format these properly to avoid long lines.
> Thanks,
> Kyrill
>
>

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org


[-- Attachment #2: tcwg-833-upstream-dec162015.patch --]
[-- Type: text/x-patch, Size: 13474 bytes --]

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index f9b1276..8629e24 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
 			      ATTRIBUTE_UNUSED);
 extern void arm_init_builtins (void);
 extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
-
+extern rtx arm_simd_vect_par_cnst_half (machine_mode mode, bool high);
+extern bool arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
+						 bool high);
 #ifdef RTX_CODE
 extern bool arm_vector_mode_supported_p (machine_mode);
 extern bool arm_small_register_classes_for_mode_p (machine_mode);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 61e2aa2..4b85342 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30111,4 +30111,80 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
   *pri = tmp;
   return;
 }
+
+/* Construct and return a PARALLEL RTX vector with elements numbering the
+   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
+   the vector - from the perspective of the architecture.  This does not
+   line up with GCC's perspective on lane numbers, so we end up with
+   different masks depending on our target endian-ness.  The diagram
+   below may help.  We must draw the distinction when building masks
+   which select one half of the vector.  An instruction selecting
+   architectural low-lanes for a big-endian target, must be described using
+   a mask selecting GCC high-lanes.
+
+                 Big-Endian             Little-Endian
+
+GCC             0   1   2   3           3   2   1   0
+              | x | x | x | x |       | x | x | x | x |
+Architecture    3   2   1   0           3   2   1   0
+
+Low Mask:         { 2, 3 }                { 0, 1 }
+High Mask:        { 0, 1 }                { 2, 3 }
+*/
+
+rtx
+arm_simd_vect_par_cnst_half (machine_mode mode, bool high)
+{
+  int nunits = GET_MODE_NUNITS (mode);
+  rtvec v = rtvec_alloc (nunits / 2);
+  int high_base = nunits / 2;
+  int low_base = 0;
+  int base;
+  rtx t1;
+  int i;
+
+  if (BYTES_BIG_ENDIAN)
+    base = high ? low_base : high_base;
+  else
+    base = high ? high_base : low_base;
+
+  for (i = 0; i < nunits / 2; i++)
+    RTVEC_ELT (v, i) = GEN_INT (base + i);
+
+  t1 = gen_rtx_PARALLEL (mode, v);
+  return t1;
+}
+
+/* Check OP for validity as a PARALLEL RTX vector with elements
+   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
+   from the perspective of the architecture.  See the diagram above
+   arm_simd_vect_par_cnst_half_p for more details.  */
+
+bool
+arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
+				       bool high)
+{
+  rtx ideal = arm_simd_vect_par_cnst_half (mode, high);
+  HOST_WIDE_INT count_op = XVECLEN (op, 0);
+  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
+  int i = 0;
+
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  if (count_op != count_ideal)
+    return false;
+
+  for (i = 0; i < count_ideal; i++)
+    {
+      rtx elt_op = XVECEXP (op, 0, i);
+      rtx elt_ideal = XVECEXP (ideal, 0, i);
+
+      if (!CONST_INT_P (elt_op)
+	  || INTVAL (elt_ideal) != INTVAL (elt_op))
+	return false;
+    }
+  return true;
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e5a2b0f..bdb94db 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,57 @@
 
 ;; Widening operations
 
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+ 
+    p1  = arm_simd_vect_par_cnst_half (mode, false);
+    p2  = arm_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], 
+							 operands[1], 
+							 p1, 
+							 operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], 
+							 operands[1], 
+							 p2, 
+							 operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_ssum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1235,57 @@
   [(set_attr "type" "neon_add_widen")]
 )
 
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+ 
+    p1  = arm_simd_vect_par_cnst_half (mode, false);
+    p2  = arm_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], 
+							 operands[1], 
+							 p1, 
+							 operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], 
+							 operands[1], 
+							 p2, 
+							 operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_usum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (zero_extend:<V_widen>
@@ -5331,7 +5433,7 @@ if (BYTES_BIG_ENDIAN)
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
        (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
 			   (match_operand:VU 1 "register_operand" "w") 
-                           (match_operand:VU 2 "vect_par_constant_low" "")))
+					(match_operand:VU 2 "vect_par_constant_low" "")))
  		        (SE:<V_unpack> (vec_select:<V_HALF>
                            (match_operand:VU 3 "register_operand" "w") 
                            (match_dup 2)))))]
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 48e4ba8..372230a 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -605,59 +605,13 @@
 (define_special_predicate "vect_par_constant_high" 
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != (base/2) + i)
-       return false;
-   }
-  return true; 
+  return arm_simd_check_vect_par_cnst_half_p (op, mode, true);
 })
 
 (define_special_predicate "vect_par_constant_low"
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != i)
-       return false;
-   } 
-  return true; 
+  return arm_simd_check_vect_par_cnst_half_p (op, mode, false);
 })
 
 (define_predicate "const_double_vcvt_power_of_two_reciprocal"
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..96c657e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..1bfdc13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..4a72a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..9c9c68a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b543519..4deca1f 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3943,6 +3943,7 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
     } else {
         set et_vect_widen_sum_hi_to_si_pattern_saved 0
         if { [istarget powerpc*-*-*]
+	     || [check_effective_target_arm_neon_ok]
              || [istarget ia64-*-*] } {
             set et_vect_widen_sum_hi_to_si_pattern_saved 1
         }
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-11-30  6:59 Michael Collison
@ 2015-12-10 15:09 ` Kyrill Tkachov
  2015-12-17  0:02   ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Kyrill Tkachov @ 2015-12-10 15:09 UTC (permalink / raw)
  To: Michael Collison, gcc Patches, Ramana Radhakrishnan

Hi Michael,

A few comments while I look deeper into this patch...

On 30/11/15 01:18, Michael Collison wrote:
>
> This is a modified version of my previous patch that supports vector wide add. I added support for vaddw on big endian when generating the parallel operand for the vector select.
>
> There are four failing test cases on arm big endian with similar code. They are:
>
> gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test
>
>
> The failures occur without my patch and are related to a bug with vector loads using VUZP operations.
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68532
>
> Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>
> 2015-11-29  Michael Collison  <michael.collison@linaro.org>
>
>     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
>     mode is VQI to improve mixed mode vectorization.
>     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
>     define_insn to match low half of signed vaddw.
>     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
>     define_insn to match high half of signed vaddw.
>     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
>     define_insn to match low half of unsigned vaddw.
>     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
>     define_insn to match high half of unsigned vaddw.
>     * config/arm/arm.c (aarch32_simd_vect_par_cnst_half): New function.
>     (aarch32_simd_check_vect_par_cnst_half): Likewise.
>     * config/arm/arm-protos.h (aarch32_simd_vect_par_cnst_half): Prototype
>     for new function.
>     (aarch32_simd_check_vect_par_cnst_half): Likewise.
>     * config/arm/predicates.md (vect_par_constant_high): Support
>     big endian and simplify by calling
>     aarch32_simd_check_vect_par_cnst_half
>     (vect_par_constant_low): Likewise.
>     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>     * testsuite/lib/target-supports.exp
>     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>     that arm neon support vector widen sum of HImode TO SImode.
>
> Okay for trunk?
>

--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
  			      ATTRIBUTE_UNUSED);
  extern void arm_init_builtins (void);
  extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
-
+extern rtx aarch32_simd_vect_par_cnst_half (machine_mode mode, bool high);
+extern bool aarch32_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
+						   bool high);


Please use arm instead of aarch32 in the name to be consistent with the rest of the
backend. Also, for functions that return a bool without side-effects it's preferable
to finish their name with '_p'. So for the second one I'd drop the 'check' and call
it something like "arm_vector_of_lane_nums_p ", is that a more descriptive name?

+/* Check OP for validity as a PARALLEL RTX vector with elements
+   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
+   from the perspective of the architecture.  See the diagram above
+   aarch64_simd_vect_par_cnst_half for more details.  */
+

aarch64?

--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,51 @@
  
  ;; Widening operations
  
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+
+    p1  = aarch32_simd_vect_par_cnst_half (mode, false);
+    p2  = aarch32_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }

Please format these properly to avoid long lines.
Thanks,
Kyrill


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
@ 2015-11-30  6:59 Michael Collison
  2015-12-10 15:09 ` Kyrill Tkachov
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-11-30  6:59 UTC (permalink / raw)
  To: gcc Patches, Ramana Radhakrishnan, Kyrill Tkachov

[-- Attachment #1: Type: text/plain, Size: 2545 bytes --]


This is a modified version of my previous patch that supports vector 
wide add. I added support for vaddw on big endian when generating the 
parallel operand for the vector select.

There are four failing test cases on arm big endian with similar code. 
They are:

gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test


The failures occur without my patch and are related to a bug with vector 
loads using VUZP operations.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68532

Validated on arm-none-eabi, arm-none-linux-gnueabi, 
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.

2015-11-29  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns where
     mode is VQI to improve mixed mode vectorization.
     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of unsigned vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of unsigned vaddw.
     * config/arm/arm.c (aarch32_simd_vect_par_cnst_half): New function.
     (aarch32_simd_check_vect_par_cnst_half): Likewise.
     * config/arm/arm-protos.h (aarch32_simd_vect_par_cnst_half): Prototype
     for new function.
     (aarch32_simd_check_vect_par_cnst_half): Likewise.
     * config/arm/predicates.md (vect_par_constant_high): Support
     big endian and simplify by calling
     aarch32_simd_check_vect_par_cnst_half
     (vect_par_constant_low): Likewise.
     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
     * testsuite/lib/target-supports.exp
     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
     that arm neon support vector widen sum of HImode TO SImode.

Okay for trunk?

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org


[-- Attachment #2: tcwg-833-aarch32-upstream_nov29.patch --]
[-- Type: text/x-patch, Size: 13394 bytes --]

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index f9b1276..26fe370 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
 			      ATTRIBUTE_UNUSED);
 extern void arm_init_builtins (void);
 extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
-
+extern rtx aarch32_simd_vect_par_cnst_half (machine_mode mode, bool high);
+extern bool aarch32_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
+						   bool high);
 #ifdef RTX_CODE
 extern bool arm_vector_mode_supported_p (machine_mode);
 extern bool arm_small_register_classes_for_mode_p (machine_mode);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 61e2aa2..158c2e8 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30111,4 +30111,80 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
   *pri = tmp;
   return;
 }
+
+/* Construct and return a PARALLEL RTX vector with elements numbering the
+   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
+   the vector - from the perspective of the architecture.  This does not
+   line up with GCC's perspective on lane numbers, so we end up with
+   different masks depending on our target endian-ness.  The diagram
+   below may help.  We must draw the distinction when building masks
+   which select one half of the vector.  An instruction selecting
+   architectural low-lanes for a big-endian target, must be described using
+   a mask selecting GCC high-lanes.
+
+                 Big-Endian             Little-Endian
+
+GCC             0   1   2   3           3   2   1   0
+              | x | x | x | x |       | x | x | x | x |
+Architecture    3   2   1   0           3   2   1   0
+
+Low Mask:         { 2, 3 }                { 0, 1 }
+High Mask:        { 0, 1 }                { 2, 3 }
+*/
+
+rtx
+aarch32_simd_vect_par_cnst_half (machine_mode mode, bool high)
+{
+  int nunits = GET_MODE_NUNITS (mode);
+  rtvec v = rtvec_alloc (nunits / 2);
+  int high_base = nunits / 2;
+  int low_base = 0;
+  int base;
+  rtx t1;
+  int i;
+
+  if (BYTES_BIG_ENDIAN)
+    base = high ? low_base : high_base;
+  else
+    base = high ? high_base : low_base;
+
+  for (i = 0; i < nunits / 2; i++)
+    RTVEC_ELT (v, i) = GEN_INT (base + i);
+
+  t1 = gen_rtx_PARALLEL (mode, v);
+  return t1;
+}
+
+/* Check OP for validity as a PARALLEL RTX vector with elements
+   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
+   from the perspective of the architecture.  See the diagram above
+   aarch64_simd_vect_par_cnst_half for more details.  */
+
+bool
+aarch32_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
+				       bool high)
+{
+  rtx ideal = aarch32_simd_vect_par_cnst_half (mode, high);
+  HOST_WIDE_INT count_op = XVECLEN (op, 0);
+  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
+  int i = 0;
+
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  if (count_op != count_ideal)
+    return false;
+
+  for (i = 0; i < count_ideal; i++)
+    {
+      rtx elt_op = XVECEXP (op, 0, i);
+      rtx elt_ideal = XVECEXP (ideal, 0, i);
+
+      if (!CONST_INT_P (elt_op)
+	  || INTVAL (elt_ideal) != INTVAL (elt_op))
+	return false;
+    }
+  return true;
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e5a2b0f..8d311b7 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,51 @@
 
 ;; Widening operations
 
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+ 
+    p1  = aarch32_simd_vect_par_cnst_half (mode, false);
+    p2  = aarch32_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_s_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_ssum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1229,51 @@
   [(set_attr "type" "neon_add_widen")]
 )
 
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    machine_mode mode = GET_MODE (operands[1]);
+    rtx p1, p2;
+ 
+    p1  = aarch32_simd_vect_par_cnst_half (mode, false);
+    p2  = aarch32_simd_vect_par_cnst_half (mode, true);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+  return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %e1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
+    "vaddw.<V_u_elem>\t%q0, %q3, %f1";
+}
+  [(set_attr "type" "neon_add_widen")])
+
 (define_insn "widen_usum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (zero_extend:<V_widen>
@@ -5331,7 +5421,7 @@ if (BYTES_BIG_ENDIAN)
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
        (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
 			   (match_operand:VU 1 "register_operand" "w") 
-                           (match_operand:VU 2 "vect_par_constant_low" "")))
+					(match_operand:VU 2 "vect_par_constant_low" "")))
  		        (SE:<V_unpack> (vec_select:<V_HALF>
                            (match_operand:VU 3 "register_operand" "w") 
                            (match_dup 2)))))]
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 48e4ba8..e1c24bd 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -605,59 +605,13 @@
 (define_special_predicate "vect_par_constant_high" 
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != (base/2) + i)
-       return false;
-   }
-  return true; 
+  return aarch32_simd_check_vect_par_cnst_half (op, mode, true);
 })
 
 (define_special_predicate "vect_par_constant_low"
   (match_code "parallel")
 {
-  HOST_WIDE_INT count = XVECLEN (op, 0);
-  int i;
-  int base = GET_MODE_NUNITS (mode);
-
-  if ((count < 1)
-      || (count != base/2))
-    return false;
-    
-  if (!VECTOR_MODE_P (mode))
-    return false;
-
-  for (i = 0; i < count; i++)
-   {
-     rtx elt = XVECEXP (op, 0, i);
-     int val;
-
-     if (!CONST_INT_P (elt))
-       return false;
-
-     val = INTVAL (elt);
-     if (val != i)
-       return false;
-   } 
-  return true; 
+  return aarch32_simd_check_vect_par_cnst_half (op, mode, false);
 })
 
 (define_predicate "const_double_vcvt_power_of_two_reciprocal"
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..96c657e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..1bfdc13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..4a72a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..9c9c68a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b543519..4deca1f 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3943,6 +3943,7 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
     } else {
         set et_vect_widen_sum_hi_to_si_pattern_saved 0
         if { [istarget powerpc*-*-*]
+	     || [check_effective_target_arm_neon_ok]
              || [istarget ia64-*-*] } {
             set et_vect_widen_sum_hi_to_si_pattern_saved 1
         }
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-10-20  8:11       ` Michael Collison
@ 2015-10-21 15:14         ` Charles Baylis
  0 siblings, 0 replies; 15+ messages in thread
From: Charles Baylis @ 2015-10-21 15:14 UTC (permalink / raw)
  To: Michael Collison; +Cc: Kyrill Tkachov, GCC Patches, Ramana Radhakrishnan

On 20 October 2015 at 08:54, Michael Collison
<michael.collison@linaro.org> wrote:
> I want to ask a question about existing patterns in neon.md that utilize the
> vec_select and all the lanes as my example does: Why are the following
> pattern not matched if the target is big endian?

> (define_insn "neon_vec_unpack<US>_lo_<mode>"
>   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
>         (SE:<V_unpack> (vec_select:<V_HALF>
>               (match_operand:VU 1 "register_operand" "w")
>               (match_operand:VU 2 "vect_par_constant_low" ""))))]
>   "TARGET_NEON && !BYTES_BIG_ENDIAN"
>   "vmovl.<US><V_sz_elem> %q0, %e1"
>   [(set_attr "type" "neon_shift_imm_long")]
> )
>
> (define_insn "neon_vec_unpack<US>_hi_<mode>"
>   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
>         (SE:<V_unpack> (vec_select:<V_HALF>
>               (match_operand:VU 1 "register_operand" "w")
>               (match_operand:VU 2 "vect_par_constant_high" ""))))]
>   "TARGET_NEON && !BYTES_BIG_ENDIAN"
>   "vmovl.<US><V_sz_elem> %q0, %f1"
>   [(set_attr "type" "neon_shift_imm_long")]
>
> These patterns are similar to the new patterns I am adding and I am
> wondering if my patterns should exclude BYTES_BIG_ENDIAN?

These patterns use %e and %f to access the low and high part of the
input operand - so %e is used to match the use of _lo in the pattern
name, and vect_par_constant_low, and %f with _hi and
vect_par_constant_high. For big-endian, the use of %e and %f would
need to be swapped.

Looking at the patch you posted last month (possibly not the latest version?):

This is a pattern which is supposed to act on the low part of the
input vector, hence _lo in the name:
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW
(match_operand:VQI 1 "s_register_operand" "%w")
+   (match_operand:VQI 2 "vect_par_constant_low" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"

Here, using %e1 carries an implicit assumption that the low part of
the input vector is in the lowest numbered of the pair of D registers,
which is only true on little-endian.

This is a bit ugly (and untested) but perhaps something like this
would fix the problem
{
    return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
"vaddw.<V_s_elem>\t%q0, %q3, %e1";
}

+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)

Similarly, here. Pattern is _hi, register is %f1:

+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW
(match_operand:VQI 1 "s_register_operand" "%w")
+   (match_operand:VQI 2 "vect_par_constant_high" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)

However, as far as I can see, there isn't an endianness dependency in
widen_ssum<mode>3/widen_usum<mode>3 because both halves of the vector
are used and added together.


Hope this helps
Charles

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-10-08 11:02     ` Kyrill Tkachov
@ 2015-10-20  8:11       ` Michael Collison
  2015-10-21 15:14         ` Charles Baylis
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-10-20  8:11 UTC (permalink / raw)
  To: Kyrill Tkachov, GCC Patches, Ramana Radhakrishnan

Hi Kyrill,

Since your email I have done the following:

1. Added the ENDIAN_LANE_N to the define_expand patterns for big endian 
targets. The big endian patches produced no change in the test results. 
I still have several execution failures with targeting big endian with 
lto enabled.

2. I diff'd the rtl dumps from a big endian compiler with lto enabled 
and disabled. I also examined the assembly language and there no 
differences except for the .ascii directives.

I want to ask a question about existing patterns in neon.md that utilize 
the vec_select and all the lanes as my example does: Why are the 
following pattern not matched if the target is big endian?


(define_insn "neon_vec_unpack<US>_lo_<mode>"
   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
         (SE:<V_unpack> (vec_select:<V_HALF>
               (match_operand:VU 1 "register_operand" "w")
               (match_operand:VU 2 "vect_par_constant_low" ""))))]
   "TARGET_NEON && !BYTES_BIG_ENDIAN"
   "vmovl.<US><V_sz_elem> %q0, %e1"
   [(set_attr "type" "neon_shift_imm_long")]
)

(define_insn "neon_vec_unpack<US>_hi_<mode>"
   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
         (SE:<V_unpack> (vec_select:<V_HALF>
               (match_operand:VU 1 "register_operand" "w")
               (match_operand:VU 2 "vect_par_constant_high" ""))))]
   "TARGET_NEON && !BYTES_BIG_ENDIAN"
   "vmovl.<US><V_sz_elem> %q0, %f1"
   [(set_attr "type" "neon_shift_imm_long")]

These patterns are similar to the new patterns I am adding and I am 
wondering if my patterns should exclude BYTES_BIG_ENDIAN?

On 10/08/2015 04:02 AM, Kyrill Tkachov wrote:
> Hi Michael,
>
> On 01/10/15 11:05, Michael Collison wrote:
>> Kyrill,
>>
>> I have modified the patch to address your comments. I also modified
>> check_effective_target_vect_widen_sum_hi_to_si_pattern in
>> target-supports.exp to
>> indicate that arm neon supports vector widen sum of HImode to SImode.
>> This resolved
>> several test suite failures.
>>
>> Successfully tested on arm-none-eabi, arm-none-linux-gnueabihf. I have
>> four related execution failure
>> tests on armeb-non-linux-gnueabihf with -flto only.
>>
>> gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
>> gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test
>
> We'd want to get to the bottom of these before committing.
> Does codegen before and after the patch show anything?
> When it comes to big-endian and NEON, the fiddly parts are
> usually lane numbers. Do you need to select the proper lanes with
> ENDIAN_LANE_N like Charles in his patch at:
> https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00656.html?
>
> Thanks,
> Kyrill
>
>>
>> I am debugging but have not tracked down the root cause yet. Feedback?
>>
>> 2015-07-22  Michael Collison <michael.collison@linaro.org>
>>
>>       * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>       where mode is VQI to improve mixed mode vectorization.
>>       * config/arm/neon.md 
>> (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
>>       define_insn to match low half of signed vaddw.
>>       * config/arm/neon.md 
>> (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
>>       define_insn to match high half of signed vaddw.
>>       * config/arm/neon.md 
>> (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
>>       define_insn to match low half of unsigned vaddw.
>>       * config/arm/neon.md 
>> (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
>>       define_insn to match high half of unsigned vaddw.
>>       * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>>       * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>>       * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>>       * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>>       * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>>       * testsuite/lib/target-supports.exp
>>       (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>>       that arm neon support vector widen sum of HImode TO SImode.
>
> Note that the testsuite changes should have their own ChangeLog entry
> with the paths there starting relative to gcc/testsuite/
>
>>
>> On 09/23/2015 01:49 AM, Kyrill Tkachov wrote:
>>> Hi Michael,
>>>
>>> On 23/09/15 00:52, Michael Collison wrote:
>>>> This is a modified version of the previous patch that removes the
>>>> documentation and read-md.c fixes. These patches have been submitted
>>>> separately and approved.
>>>>
>>>> This patch is designed to address code that was not being 
>>>> vectorized due
>>>> to missing widening patterns in the ARM backend. Code such as:
>>>>
>>>> int t6(int len, void * dummy, short * __restrict x)
>>>> {
>>>>      len = len & ~31;
>>>>      int result = 0;
>>>>      __asm volatile ("");
>>>>      for (int i = 0; i < len; i++)
>>>>        result += x[i];
>>>>      return result;
>>>> }
>>>>
>>>> Validated on arm-none-eabi, arm-none-linux-gnueabi,
>>>> arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>>>
>>>> 2015-09-22  Michael Collison <michael.collison@linaro.org>
>>>>
>>>>        * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>>>        where mode is VQI to improve mixed mode add vectorization.
>>>>
>>> Please list all the new define_expands and define_insns
>>> in the changelog. Also, please add an ChangeLog entry for
>>> the testsuite additions.
>>>
>>> The approach looks ok to me with a few comments on some
>>> parts of the patch itself.
>>>
>>>
>>> +(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
>>> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
>>> +    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW
>>> (match_operand:VQI 1 "s_register_operand" "%w")
>>> +                           (match_operand:VQI 2
>>> "vect_par_constant_high" "")))
>>> +                (match_operand:<VW:V_widen> 3 "s_register_operand"
>>> "0")))]
>>> +  "TARGET_NEON"
>>> +  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
>>> +  [(set_attr "type" "neon_add_widen")
>>> +  (set_attr "length" "8")]
>>> +)
>>>
>>>
>>> This is a single instruction, and it has a length of 4, so no need to
>>> override the length attribute.
>>> Same with the other define_insns in this patch.
>>>
>>>
>>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>>> b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>>> new file mode 100644
>>> index 0000000..ed10669
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>>> @@ -0,0 +1,21 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_neon_hw } */
>>>
>>> The arm_neon_hw check is usually used when you want to run the tests.
>>> Since this is a compile-only tests you just need arm_neon_ok.
>>>
>>>   +/* { dg-add-options arm_neon_ok } */
>>> +/* { dg-options "-O3" } */
>>> +
>>> +
>>> +int
>>> +t6(int len, void * dummy, short * __restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  int result = 0;
>>> +  __asm volatile ("");
>>> +  for (int i = 0; i < len; i++)
>>> +    result += x[i];
>>> +  return result;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
>>> +
>>> +
>>> +
>>>
>>> Stray trailing newlines. Similar comments for the other testcases.
>>>
>>> Thanks,
>>> Kyrill
>>>
>

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-10-01 10:05   ` Michael Collison
@ 2015-10-08 11:02     ` Kyrill Tkachov
  2015-10-20  8:11       ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Kyrill Tkachov @ 2015-10-08 11:02 UTC (permalink / raw)
  To: Michael Collison, GCC Patches, Ramana Radhakrishnan

Hi Michael,

On 01/10/15 11:05, Michael Collison wrote:
> Kyrill,
>
> I have modified the patch to address your comments. I also modified
> check_effective_target_vect_widen_sum_hi_to_si_pattern in
> target-supports.exp to
> indicate that arm neon supports vector widen sum of HImode to SImode.
> This resolved
> several test suite failures.
>
> Successfully tested on arm-none-eabi, arm-none-linux-gnueabihf. I have
> four related execution failure
> tests on armeb-non-linux-gnueabihf with -flto only.
>
> gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
> gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test

We'd want to get to the bottom of these before committing.
Does codegen before and after the patch show anything?
When it comes to big-endian and NEON, the fiddly parts are
usually lane numbers. Do you need to select the proper lanes with
ENDIAN_LANE_N like Charles in his patch at:
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00656.html?

Thanks,
Kyrill

>
> I am debugging but have not tracked down the root cause yet. Feedback?
>
> 2015-07-22  Michael Collison  <michael.collison@linaro.org>
>
>       * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>       where mode is VQI to improve mixed mode vectorization.
>       * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
>       define_insn to match low half of signed vaddw.
>       * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
>       define_insn to match high half of signed vaddw.
>       * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
>       define_insn to match low half of unsigned vaddw.
>       * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
>       define_insn to match high half of unsigned vaddw.
>       * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
>       * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
>       * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
>       * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
>       * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
>       * testsuite/lib/target-supports.exp
>       (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
>       that arm neon support vector widen sum of HImode TO SImode.

Note that the testsuite changes should have their own ChangeLog entry
with the paths there starting relative to gcc/testsuite/

>
> On 09/23/2015 01:49 AM, Kyrill Tkachov wrote:
>> Hi Michael,
>>
>> On 23/09/15 00:52, Michael Collison wrote:
>>> This is a modified version of the previous patch that removes the
>>> documentation and read-md.c fixes. These patches have been submitted
>>> separately and approved.
>>>
>>> This patch is designed to address code that was not being vectorized due
>>> to missing widening patterns in the ARM backend. Code such as:
>>>
>>> int t6(int len, void * dummy, short * __restrict x)
>>> {
>>>      len = len & ~31;
>>>      int result = 0;
>>>      __asm volatile ("");
>>>      for (int i = 0; i < len; i++)
>>>        result += x[i];
>>>      return result;
>>> }
>>>
>>> Validated on arm-none-eabi, arm-none-linux-gnueabi,
>>> arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>>
>>> 2015-09-22  Michael Collison <michael.collison@linaro.org>
>>>
>>>        * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>>        where mode is VQI to improve mixed mode add vectorization.
>>>
>> Please list all the new define_expands and define_insns
>> in the changelog. Also, please add an ChangeLog entry for
>> the testsuite additions.
>>
>> The approach looks ok to me with a few comments on some
>> parts of the patch itself.
>>
>>
>> +(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
>> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
>> +    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW
>> (match_operand:VQI 1 "s_register_operand" "%w")
>> +                           (match_operand:VQI 2
>> "vect_par_constant_high" "")))
>> +                (match_operand:<VW:V_widen> 3 "s_register_operand"
>> "0")))]
>> +  "TARGET_NEON"
>> +  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
>> +  [(set_attr "type" "neon_add_widen")
>> +  (set_attr "length" "8")]
>> +)
>>
>>
>> This is a single instruction, and it has a length of 4, so no need to
>> override the length attribute.
>> Same with the other define_insns in this patch.
>>
>>
>> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> new file mode 100644
>> index 0000000..ed10669
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_neon_hw } */
>>
>> The arm_neon_hw check is usually used when you want to run the tests.
>> Since this is a compile-only tests you just need arm_neon_ok.
>>
>>   +/* { dg-add-options arm_neon_ok } */
>> +/* { dg-options "-O3" } */
>> +
>> +
>> +int
>> +t6(int len, void * dummy, short * __restrict x)
>> +{
>> +  len = len & ~31;
>> +  int result = 0;
>> +  __asm volatile ("");
>> +  for (int i = 0; i < len; i++)
>> +    result += x[i];
>> +  return result;
>> +}
>> +
>> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
>> +
>> +
>> +
>>
>> Stray trailing newlines. Similar comments for the other testcases.
>>
>> Thanks,
>> Kyrill
>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-09-23  8:59 ` Kyrill Tkachov
@ 2015-10-01 10:05   ` Michael Collison
  2015-10-08 11:02     ` Kyrill Tkachov
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-10-01 10:05 UTC (permalink / raw)
  To: Kyrill Tkachov, GCC Patches, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 4871 bytes --]

Kyrill,

I have modified the patch to address your comments. I also modified 
check_effective_target_vect_widen_sum_hi_to_si_pattern in 
target-supports.exp to
indicate that arm neon supports vector widen sum of HImode to SImode. 
This resolved
several test suite failures.

Successfully tested on arm-none-eabi, arm-none-linux-gnueabihf. I have 
four related execution failure
tests on armeb-non-linux-gnueabihf with -flto only.

gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test


I am debugging but have not tracked down the root cause yet. Feedback?

2015-07-22  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
     where mode is VQI to improve mixed mode vectorization.
     * config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of signed vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
     define_insn to match low half of unsigned vaddw.
     * config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
     define_insn to match high half of unsigned vaddw.
     * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
     * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
     * testsuite/lib/target-supports.exp
     (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
     that arm neon support vector widen sum of HImode TO SImode.

On 09/23/2015 01:49 AM, Kyrill Tkachov wrote:
> Hi Michael,
>
> On 23/09/15 00:52, Michael Collison wrote:
>> This is a modified version of the previous patch that removes the
>> documentation and read-md.c fixes. These patches have been submitted
>> separately and approved.
>>
>> This patch is designed to address code that was not being vectorized due
>> to missing widening patterns in the ARM backend. Code such as:
>>
>> int t6(int len, void * dummy, short * __restrict x)
>> {
>>     len = len & ~31;
>>     int result = 0;
>>     __asm volatile ("");
>>     for (int i = 0; i < len; i++)
>>       result += x[i];
>>     return result;
>> }
>>
>> Validated on arm-none-eabi, arm-none-linux-gnueabi,
>> arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>>
>> 2015-09-22  Michael Collison <michael.collison@linaro.org>
>>
>>       * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>>       where mode is VQI to improve mixed mode add vectorization.
>>
>
> Please list all the new define_expands and define_insns
> in the changelog. Also, please add an ChangeLog entry for
> the testsuite additions.
>
> The approach looks ok to me with a few comments on some
> parts of the patch itself.
>
>
> +(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
> +  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
> +    (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW 
> (match_operand:VQI 1 "s_register_operand" "%w")
> +                           (match_operand:VQI 2 
> "vect_par_constant_high" "")))
> +                (match_operand:<VW:V_widen> 3 "s_register_operand" 
> "0")))]
> +  "TARGET_NEON"
> +  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
> +  [(set_attr "type" "neon_add_widen")
> +  (set_attr "length" "8")]
> +)
>
>
> This is a single instruction, and it has a length of 4, so no need to 
> override the length attribute.
> Same with the other define_insns in this patch.
>
>
> diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c 
> b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> new file mode 100644
> index 0000000..ed10669
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_hw } */
>
> The arm_neon_hw check is usually used when you want to run the tests.
> Since this is a compile-only tests you just need arm_neon_ok.
>
>  +/* { dg-add-options arm_neon_ok } */
> +/* { dg-options "-O3" } */
> +
> +
> +int
> +t6(int len, void * dummy, short * __restrict x)
> +{
> +  len = len & ~31;
> +  int result = 0;
> +  __asm volatile ("");
> +  for (int i = 0; i < len; i++)
> +    result += x[i];
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler "vaddw\.s16" } } */
> +
> +
> +
>
> Stray trailing newlines. Similar comments for the other testcases.
>
> Thanks,
> Kyrill
>

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org


[-- Attachment #2: tcwg-833-aarc32-upstream.patch --]
[-- Type: text/x-patch, Size: 8791 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..b3485f1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,55 @@
 
 ;; Widening operations
 
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")]
+)
+
 (define_insn "widen_ssum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1233,55 @@
   [(set_attr "type" "neon_add_widen")]
 )
 
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")]
+)
+
 (define_insn "widen_usum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (zero_extend:<V_widen>
@@ -5347,7 +5445,7 @@
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
        (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
 			   (match_operand:VU 1 "register_operand" "w") 
-                           (match_operand:VU 2 "vect_par_constant_low" "")))
+					(match_operand:VU 2 "vect_par_constant_low" "")))
  		        (SE:<V_unpack> (vec_select:<V_HALF>
                            (match_operand:VU 3 "register_operand" "w") 
                            (match_dup 2)))))]
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..96c657e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..1bfdc13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..4a72a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..9c9c68a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 1988301..5530edc 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3838,6 +3838,7 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
     } else {
         set et_vect_widen_sum_hi_to_si_pattern_saved 0
         if { [istarget powerpc*-*-*]
+	     || [check_effective_target_arm_neon_ok]
              || [istarget ia64-*-*] } {
             set et_vect_widen_sum_hi_to_si_pattern_saved 1
         }
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
  2015-09-23  2:40 Michael Collison
@ 2015-09-23  8:59 ` Kyrill Tkachov
  2015-10-01 10:05   ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Kyrill Tkachov @ 2015-09-23  8:59 UTC (permalink / raw)
  To: Michael Collison, GCC Patches, Ramana Radhakrishnan

Hi Michael,

On 23/09/15 00:52, Michael Collison wrote:
> This is a modified version of the previous patch that removes the
> documentation and read-md.c fixes. These patches have been submitted
> separately and approved.
>
> This patch is designed to address code that was not being vectorized due
> to missing widening patterns in the ARM backend. Code such as:
>
> int t6(int len, void * dummy, short * __restrict x)
> {
>     len = len & ~31;
>     int result = 0;
>     __asm volatile ("");
>     for (int i = 0; i < len; i++)
>       result += x[i];
>     return result;
> }
>
> Validated on arm-none-eabi, arm-none-linux-gnueabi,
> arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
>
> 2015-09-22  Michael Collison  <michael.collison@linaro.org>
>
>       * config/arm/neon.md (widen_<us>sum<mode>): New patterns
>       where mode is VQI to improve mixed mode add vectorization.
>

Please list all the new define_expands and define_insns
in the changelog. Also, please add an ChangeLog entry for
the testsuite additions.

The approach looks ok to me with a few comments on some
parts of the patch itself.


+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)


This is a single instruction, and it has a length of 4, so no need to override the length attribute.
Same with the other define_insns in this patch.


diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */

The arm_neon_hw check is usually used when you want to run the tests.
Since this is a compile-only tests you just need arm_neon_ok.

  +/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+

Stray trailing newlines. Similar comments for the other testcases.

Thanks,
Kyrill

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [ARM] Use vector wide add for mixed-mode adds
@ 2015-09-23  2:40 Michael Collison
  2015-09-23  8:59 ` Kyrill Tkachov
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2015-09-23  2:40 UTC (permalink / raw)
  To: GCC Patches, Ramana.Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 874 bytes --]

This is a modified version of the previous patch that removes the 
documentation and read-md.c fixes. These patches have been submitted 
separately and approved.

This patch is designed to address code that was not being vectorized due 
to missing widening patterns in the ARM backend. Code such as:

int t6(int len, void * dummy, short * __restrict x)
{
   len = len & ~31;
   int result = 0;
   __asm volatile ("");
   for (int i = 0; i < len; i++)
     result += x[i];
   return result;
}

Validated on arm-none-eabi, arm-none-linux-gnueabi, 
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.

2015-09-22  Michael Collison  <michael.collison@linaro.org>

     * config/arm/neon.md (widen_<us>sum<mode>): New patterns
     where mode is VQI to improve mixed mode add vectorization.

-- 
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org


[-- Attachment #2: tcwg-833-aarch32-v8.patch --]
[-- Type: text/x-patch, Size: 7719 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..54623fe 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,57 @@
 
 ;; Widening operations
 
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
 (define_insn "widen_ssum<mode>3"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
 	(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,4 +1235,55 @@
   [(set_attr "type" "neon_add_widen")]
 )
 
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+	(plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+			       (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    int i;
+    int half_elem = <V_mode_nunits>/2;
+    rtvec v1 = rtvec_alloc (half_elem);
+    rtvec v2 = rtvec_alloc (half_elem);
+    rtx p1, p2;
+
+    for (i = 0; i < half_elem; i++)
+      RTVEC_ELT (v1, i) = GEN_INT (i);
+    p1 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v1);
+
+    for (i = half_elem; i < <V_mode_nunits>; i++)
+      RTVEC_ELT (v2, i - half_elem) = GEN_INT (i);
+    p2 = gen_rtx_PARALLEL (GET_MODE (operands[1]), v2);
+
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_low" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+	(plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+						   (match_operand:VQI 2 "vect_par_constant_high" "")))
+		        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)
+
 
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..94bf0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..2e9af56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int 
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+  len = len & ~31;
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..de2ad8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int 
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
+
+
+
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2016-02-15  6:32 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-18  8:02 [ARM] Use vector wide add for mixed-mode adds Michael Collison
2015-08-18 13:46 ` Ramana Radhakrishnan
2015-08-23  4:16   ` Michael Collison
2015-08-24  8:37     ` Ramana Radhakrishnan
2015-09-23  2:40 Michael Collison
2015-09-23  8:59 ` Kyrill Tkachov
2015-10-01 10:05   ` Michael Collison
2015-10-08 11:02     ` Kyrill Tkachov
2015-10-20  8:11       ` Michael Collison
2015-10-21 15:14         ` Charles Baylis
2015-11-30  6:59 Michael Collison
2015-12-10 15:09 ` Kyrill Tkachov
2015-12-17  0:02   ` Michael Collison
2016-02-09 16:27     ` Kyrill Tkachov
2016-02-15  6:32       ` Michael Collison

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).