public inbox for gcc-patches@gcc.gnu.org
* [PATCH][AArch64] Make use of FADDP in simple reductions
@ 2019-05-08 13:36 Elen Kalda
  2019-05-30 14:03 ` Sudakshina Das
  0 siblings, 1 reply; 7+ messages in thread
From: Elen Kalda @ 2019-05-08 13:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 762 bytes --]

Hi,

This patch adds a pattern to support the FADDP (scalar) instruction.

Before the patch, the C code

typedef double v2df __attribute__((vector_size (16)));

double
foo (v2df x)
{
  return x[1] + x[0];
}

generated:
foo:
        dup     d1, v0.d[0]
        dup     d0, v0.d[1]
        fadd    d0, d1, d0
        ret

After the patch:
foo:
	faddp	d0, v0.2d
	ret
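
Since the pattern uses the VHSDF mode iterator, the same transformation
covers single- and half-precision elements as well.  For example (this
mirrors the float case in the new test; the function name is just for
illustration):

typedef float v4sf __attribute__((vector_size (16)));

float
sum_low_pair (v4sf x)
{
  /* Expected to become a single "faddp s0, v0.2s".  */
  return x[0] + x[1];
}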


Bootstrapped and regression tested on aarch64-none-linux-gnu -
no issues found.

Best wishes,
Elen


gcc/ChangeLog:

2019-04-24  Elen Kalda  <elen.kalda@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_faddp<mode>): New.

gcc/testsuite/ChangeLog:

2019-04-24  Elen Kalda  <elen.kalda@arm.com>

	* gcc.target/aarch64/simd/scalar_faddp.c: New test.

[-- Attachment #2: rb11117.patch --]
[-- Type: text/x-patch; name="rb11117.patch", Size: 2229 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e3852c5d182b70978d7603225fce55c0b8ee2894..89fedc6cb3f0c6eb74c6f8d0b21cedb5ae20a095 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2372,6 +2372,21 @@
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
+(define_insn "*aarch64_faddp<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+    (plus:<VEL>
+      (vec_select:<VEL> (match_operand:VHSDF 1 "register_operand" "w")
+		(parallel[(match_operand 2 "const_int_operand" "n")]))
+      (vec_select:<VEL> (match_dup:VHSDF 1)
+		(parallel[(match_operand 3 "const_int_operand" "n")]))))]
+  "TARGET_SIMD
+  && ((INTVAL (operands[2]) == 0 && INTVAL (operands[3]) == 1)
+    || (INTVAL (operands[2]) == 1 && INTVAL (operands[3]) == 0))"
+  "faddp\t%<Vetype>0, %1.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2396286d483c16c8b70b16fa08bffeb15f034a93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { scan-assembler-not "dup" } } */
+
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */
+
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */
+
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */

* [PATCH]AArch64 Make use of FADDP in simple reductions.
@ 2021-09-01  9:59 Tamar Christina
  2021-09-08 12:38 ` Tamar Christina
  2021-10-08 16:24 ` Richard Sandiford
  0 siblings, 2 replies; 7+ messages in thread
From: Tamar Christina @ 2021-09-01  9:59 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 6908 bytes --]

Hi All,

This is a respin of an older patch that never got reviewed upstream by a
maintainer.  It has been updated to fit the current GCC codegen.

This patch adds a pattern to support the (F)ADDP (scalar) instruction.

Before the patch, the C code

typedef float v4sf __attribute__((vector_size (16)));

float
foo1 (v4sf x)
{
  return x[0] + x[1];
}

generated:

foo1:
	dup     s1, v0.s[1]
	fadd    s0, s1, s0
	ret

After the patch:
foo1:
	faddp   s0, v0.2s
	ret

The double case is now handled by SLP, but the remaining cases still need help
from combine.  I have kept the integer and floating-point patterns separate
because the integer one only supports V2DI, and sharing it with the
floating-point patterns would have required defining a few new iterators for
just a single use.
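
To make the integer case concrete, this is the kind of V2DI reduction the new
addp pattern is meant to catch (it mirrors the scalar_addp.c test added below;
the function name is just for illustration):

typedef long long v2di __attribute__((vector_size (16)));

long long
sum_pair (v2di x)
{
  /* Expected to become a single "addp d0, v0.2d".  */
  return x[1] + x[0];
}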

The case where both elements are subregs is handled by a separate pattern,
since there is no way to tell reload through constraints alone that the two
registers must be equal.
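
For reference, a rough sketch of the RTL shape the single-subreg pattern is
intended to match for V2SF on a little-endian target (hypothetical
pseudo-register numbers, not taken from an actual combine dump):

  (set (reg:SF 96)
       (plus:SF
	 (vec_select:SF (reg:V2SF 95)
			(parallel [(const_int 1)]))
	 (subreg:SF (reg:V2SF 95) 0)))

When both addends are lowpart subregs of the same vector register, the
separate "scalar2" patterns with the match_dup handle it instead.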

Bootstrapped and regression tested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_faddp_scalar<mode>,
	*aarch64_addp_scalarv2di, *aarch64_faddp_scalar2<mode>,
	*aarch64_addp_scalar2v2di): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
	* gcc.target/aarch64/simd/scalar_addp.c: New test.

Co-authored-by: Tamar Christina <tamar.christina@arm.com>

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6814dae079c9ff40aaa2bb625432bf9eb8906b73..b49f8b79b11cbb1888c503d9a9384424f44bde05 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3414,6 +3414,70 @@ (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_faddp_scalar2<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operator:<VEL> 1 "subreg_lowpart_operator"
+	      [(match_operand:VHSDF 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:<VEL> 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
+  "faddp\t%<Vetype>0, %2.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+(define_insn "*aarch64_faddp_scalar<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operand:VHSDF 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:<VEL> 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "faddp\t%<Vetype>0, %1.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_addp_scalar2v2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operator:DI 1 "subreg_lowpart_operator"
+	      [(match_operand:V2DI 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:DI 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1"
+  "addp\t%d0, %2.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
+(define_insn "*aarch64_addp_scalarv2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operand:V2DI 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:DI 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "addp\t%d0, %1.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e6b76c0716344ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,11 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc61426ba7b9ef91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,44 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { scan-assembler-times "dup" 4 } } */
+
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */
+
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */
+
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */
+
+float
+foo3 (v4sf x)
+{
+  return x[2] + x[3];
+}
+
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ecefdc0dbccb16bd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times "faddp" 1 } } */
+




Thread overview: 7+ messages
2019-05-08 13:36 [PATCH][AArch64] Make use of FADDP in simple reductions Elen Kalda
2019-05-30 14:03 ` Sudakshina Das
2021-09-01  9:59 [PATCH]AArch64 " Tamar Christina
2021-09-08 12:38 ` Tamar Christina
2021-10-08 16:24 ` Richard Sandiford
2021-11-01  7:39   ` Tamar Christina
2021-11-01  8:49     ` Richard Sandiford
