[PATCH 3/9][GCC][AArch64] Add autovectorization support for Complex instructions

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Tamar Christina <Tamar.Christina@arm.com>
To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>
Cc: nd <nd@arm.com>, James Greenhalgh <James.Greenhalgh@arm.com>,
	Richard Earnshaw <Richard.Earnshaw@arm.com>,
	Marcus Shawcroft	<Marcus.Shawcroft@arm.com>
Subject: [PATCH 3/9][GCC][AArch64] Add autovectorization support for Complex instructions
Date: Sun, 11 Nov 2018 10:26:00 -0000	[thread overview]
Message-ID: <20181111102628.GA4529@arm.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 2279 bytes --]

Hi All,

This patch adds the expander support for supporting autovectorization of complex number operations
such as Complex addition with a rotation along the Argand plane.  This also adds support for complex
FMA.

The instructions are described in the ArmARM [1] and are available from Armv8.3-a onwards.

Concretely, this generates

f90:
	mov	x3, 0
	.p2align 3,,7
.L2:
	ldr	q0, [x0, x3]
	ldr	q1, [x1, x3]
	fcadd	v0.2d, v0.2d, v1.2d, #90
	str	q0, [x2, x3]
	add	x3, x3, 16
	cmp	x3, 3200
	bne	.L2
	ret

now instead of

f90:
	mov	x4, x1
	mov	x1, x2
	add	x3, x4, 31
	add	x2, x0, 31
	sub	x3, x3, x1
	sub	x2, x2, x1
	cmp	x3, 62
	mov	x3, 62
	ccmp	x2, x3, 0, hi
	bls	.L5
	mov	x2, x4
	add	x3, x0, 3200
	.p2align 3,,7
.L3:
	ld2	{v2.2d - v3.2d}, [x0], 32
	ld2	{v4.2d - v5.2d}, [x2], 32
	cmp	x0, x3
	fsub	v0.2d, v2.2d, v5.2d
	fadd	v1.2d, v4.2d, v3.2d
	st2	{v0.2d - v1.2d}, [x1], 32
	bne	.L3
	ret
.L5:
	add	x6, x0, 8
	add	x5, x4, 8
	add	x2, x1, 8
	mov	x3, 0
	.p2align 3,,7
.L2:
	ldr	d1, [x0, x3]
	ldr	d3, [x5, x3]
	ldr	d0, [x6, x3]
	ldr	d2, [x4, x3]
	fsub	d1, d1, d3
	fadd	d0, d0, d2
	str	d1, [x1, x3]
	str	d0, [x2, x3]
	add	x3, x3, 16
	cmp	x3, 3200
	bne	.L2
	ret

For complex additions with a 90* rotation along the Argand plane.

[1] https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Bootstrap and Regtest on aarch64-none-linux-gnu, arm-none-gnueabihf and x86_64-pc-linux-gnu
are still on going but previous patch showed no regressions.

The instructions have also been tested on aarch64-none-elf and arm-none-eabi on a Armv8.3-a model
and -march=Armv8.3-a+fp16 and all tests pass.

Ok for trunk?

Thanks,
Tamar

gcc/ChangeLog:

2018-11-11  Tamar Christina  <tamar.christina@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_fcadd<rot><mode>,
	fcadd<rot><mode>3, aarch64_fcmla<rot><mode>,
	fcmla<rot><mode>4): New.
	* config/aarch64/aarch64.h (TARGET_COMPLEX): New.
	* config/aarch64/iterators.md (UNSPEC_FCADD90, UNSPEC_FCADD270,
	UNSPEC_FCMLA, UNSPEC_FCMLA90, UNSPEC_FCMLA180, UNSPEC_FCMLA270): New.
	(FCADD, FCMLA): New.
	(rot, rotsplit1, rotsplit2): New.
	* config/arm/types.md (neon_fcadd, neon_fcmla): New.

-- 

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb10278.patch --]
[-- Type: text/x-diff; name="rb10278.patch", Size: 7074 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c4be3101fdec930707918106cd7c53cf7584553e..12a91183a98ea23015860c77a97955cb1b30bfbb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -419,6 +419,63 @@
 }
 )
 
+;; The fcadd and fcmla patterns are made UNSPEC for the explicitly due to the
+;; fact that their usage need to guarantee that the source vectors are
+;; contiguous.  It would be wrong to describe the operation without being able
+;; to describe the permute that is also required, but even if that is done
+;; the permute would have been created as a LOAD_LANES which means the values
+;; in the registers are in the wrong order.
+(define_insn "aarch64_fcadd<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+		       (match_operand:VHSDF 2 "register_operand" "w")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+  "fcadd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcadd")]
+)
+
+(define_expand "fcadd<rot><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcadd<rot><mode> (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+(define_insn "aarch64_fcmla<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+				   (match_operand:VHSDF 3 "register_operand" "w")]
+				   FCMLA)))]
+  "TARGET_COMPLEX"
+  "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcmla")]
+)
+
+;; The complex mla operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "fcmla<rot><mode>4"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+				   (match_operand:VHSDF 3 "register_operand")]
+				   FCMLA)))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], operands[1],
+						 operands[2], operands[3]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[2], operands[3]));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
@@ -3026,22 +3083,22 @@
     operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]";
   }
-  [(set_attr "type" "neon_to_gp<q>")]
-)
-
-(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
-  [(set (match_operand:GPI 0 "register_operand" "=r")
-	(zero_extend:GPI
-	  (vec_select:<VEL>
-	    (match_operand:VDQQH 1 "register_operand" "w")
-	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
-  "TARGET_SIMD"
-  {
-    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
-					   INTVAL (operands[2]));
-    return "umov\\t%w0, %1.<Vetype>[%2]";
-  }
-  [(set_attr "type" "neon_to_gp<q>")]
+  [(set_attr "type" "neon_to_gp<q>")]
+)
+
+(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(zero_extend:GPI
+	  (vec_select:<VEL>
+	    (match_operand:VDQQH 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
+  "TARGET_SIMD"
+  {
+    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
+					   INTVAL (operands[2]));
+    return "umov\\t%w0, %1.<Vetype>[%2]";
+  }
+  [(set_attr "type" "neon_to_gp<q>")]
 )
 
 ;; Lane extraction of a value, neither sign nor zero extension
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 0c833a8fdfde0fde0e7c1f01cbdbdef4c2fb0009..445fc939f56dfb51b454acea03bb73658f58ab38 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -254,6 +254,9 @@ extern unsigned aarch64_architecture_version;
 /* ARMv8.3-A features.  */
 #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3)
 
+/* Armv8.3-a Complex number extension to AdvSIMD extensions.  */
+#define TARGET_COMPLEX (TARGET_SIMD && TARGET_ARMV8_3)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 524e4e6929bc9a7136966987600de2513748c20b..2ff0ca04f5970dffca0ba831fb39cbff96b2cdf6 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -485,6 +485,12 @@
     UNSPEC_COND_GE	; Used in aarch64-sve.md.
     UNSPEC_COND_GT	; Used in aarch64-sve.md.
     UNSPEC_LASTB	; Used in aarch64-sve.md.
+    UNSPEC_FCADD90	; Used in aarch64-simd.md.
+    UNSPEC_FCADD270	; Used in aarch64-simd.md.
+    UNSPEC_FCMLA	; Used in aarch64-simd.md.
+    UNSPEC_FCMLA90	; Used in aarch64-simd.md.
+    UNSPEC_FCMLA180	; Used in aarch64-simd.md.
+    UNSPEC_FCMLA270	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -1580,6 +1586,14 @@
 				      UNSPEC_COND_EQ UNSPEC_COND_NE
 				      UNSPEC_COND_GE UNSPEC_COND_GT])
 
+(define_int_iterator FCADD [UNSPEC_FCADD90
+			    UNSPEC_FCADD270])
+
+(define_int_iterator FCMLA [UNSPEC_FCMLA
+			    UNSPEC_FCMLA90
+			    UNSPEC_FCMLA180
+			    UNSPEC_FCMLA270])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -1857,3 +1871,20 @@
 			      (UNSPEC_COND_DIV "false")
 			      (UNSPEC_COND_MIN "true")
 			      (UNSPEC_COND_MAX "true")])
+
+(define_int_attr rot [(UNSPEC_FCADD90 "90")
+		      (UNSPEC_FCADD270 "270")
+		      (UNSPEC_FCMLA "0")
+		      (UNSPEC_FCMLA90 "90")
+		      (UNSPEC_FCMLA180 "180")
+		      (UNSPEC_FCMLA270 "270")])
+
+(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
+			    (UNSPEC_FCMLA90 "0")
+			    (UNSPEC_FCMLA180 "180")
+			    (UNSPEC_FCMLA270 "180")])
+
+(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
+			    (UNSPEC_FCMLA90 "270")
+			    (UNSPEC_FCMLA180 "270")
+			    (UNSPEC_FCMLA270 "90")])
\ No newline at end of file
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 055cb3e7d9414b461a9cf8da2e63f22025c2c901..18020387998c32b6dcc5ba68cc65f8487f719fd8 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -763,6 +763,9 @@
   neon_sub_halve,\
   neon_sub_halve_q,\
   neon_sub_halve_narrow_q,\
+\
+  neon_fcadd,\
+  neon_fcmla,\
 \
   neon_abs,\
   neon_abs_q,\

next             reply	other threads:[~2018-11-11 10:26 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-11 10:26 Tamar Christina [this message]
2018-11-12 12:17 ` Kyrill Tkachov
2018-11-12 12:31   ` Tamar Christina
2018-11-28 17:16     ` James Greenhalgh
2022-11-16 10:50       ` Ramana Radhakrishnan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181111102628.GA4529@arm.com \
    --to=tamar.christina@arm.com \
    --cc=James.Greenhalgh@arm.com \
    --cc=Marcus.Shawcroft@arm.com \
    --cc=Richard.Earnshaw@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=nd@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).