[Neon intrinsics] Literal vector construction through vcombine is poor

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [Neon intrinsics] Literal vector construction through vcombine is poor
@ 2017-06-16 21:08 Michael Collison
  2017-06-19 13:37 ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 6+ messages in thread
From: Michael Collison @ 2017-06-16 21:08 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 975 bytes --]

This patch improves code generation for literal vector construction by expanding the pattern early and thereby exposing it to the RTL optimizers. The current implementation delays splitting the pattern until after reload, which results in poor code generation for the following code:


#include "arm_neon.h"

int16x8_t
foo ()
{
  return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8));
}

Trunk generates:

foo:
	movi	v1.2s, 0
	movi	v0.4h, 0x8
	dup	d2, v1.d[0]
	ins	v2.d[1], v0.d[0]
	orr	v0.16b, v2.16b, v2.16b
	ret

With the patch we now generate:

foo:
	movi	v1.4h, 0x8
	movi	v0.4s, 0
	ins	v0.d[1], v1.d[0]
	ret

Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?

2017-06-15  Michael Collison  <michael.collison@arm.com>

	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
	Convert from define_insn_and_split into define_expand
	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
	Allow register and subreg operands.

[-- Attachment #2: pr7057.patch --]
[-- Type: application/octet-stream, Size: 1544 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c462164..4a253a9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2807,27 +2807,11 @@
       op1 = operands[1];
       op2 = operands[2];
     }
-  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
-  DONE;
-}
-)
 
-(define_insn_and_split "aarch64_combine_internal<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
-        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
-			   (match_operand:VDC 2 "register_operand" "w")))]
-  "TARGET_SIMD"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  if (BYTES_BIG_ENDIAN)
-    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
-  else
-    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
+  aarch64_split_simd_combine (operands[0], op1, op2);
+
   DONE;
 }
-[(set_attr "type" "multiple")]
 )
 
 (define_expand "aarch64_simd_combine<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..46bd78b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
 
   gcc_assert (VECTOR_MODE_P (dst_mode));
 
-  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
+  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
+      && register_operand (src2, src_mode))
     {
       rtx (*gen) (rtx, rtx, rtx);
 
-- 
1.9.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Neon intrinsics] Literal vector construction through vcombine is poor
  2017-06-16 21:08 [Neon intrinsics] Literal vector construction through vcombine is poor Michael Collison
@ 2017-06-19 13:37 ` Richard Earnshaw (lists)
  2017-06-22 23:10   ` Michael Collison
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Earnshaw (lists) @ 2017-06-19 13:37 UTC (permalink / raw)
  To: Michael Collison, GCC Patches; +Cc: nd

On 16/06/17 22:08, Michael Collison wrote:
> This patch improves code generation for literal vector construction by expanding and exposing the pattern to rtl optimization earlier. The current implementation delays splitting the pattern until after reload which results in poor code generation for the following code:
> 
> 
> #include "arm_neon.h"
> 
> int16x8_t
> foo ()
> {
>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8));
> }
> 
> Trunk generates:
> 
> foo:
> 	movi	v1.2s, 0
> 	movi	v0.4h, 0x8
> 	dup	d2, v1.d[0]
> 	ins	v2.d[1], v0.d[0]
> 	orr	v0.16b, v2.16b, v2.16b
> 	ret
> 
> With the patch we now generate:
> 
> foo:
> 	movi	v1.4h, 0x8
> 	movi	v0.4s, 0
> 	ins	v0.d[1], v1.d[0]
> 	ret
> 
> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
> 
> 2017-06-15  Michael Collison  <michael.collison@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
> 	Convert from define_insn_and_split into define_expand
> 	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
> 	Allow register and subreg operands.
> 

Your changelog entry is confusing.  You've deleted the
aarch64_combine_internal<mode> pattern entirely, having merged some of
its functionality directly into its caller (aarch64_combine<mode>).

So I think it should read:

* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call
aarch64_split_simd_combine.
(aarch64_combine_internal<mode>): Delete pattern.
* ...

Note also there should be a space between the file name and the open
bracket for the first function name.

Why don't you need the big-endian code path any more?

R.

> 
> pr7057.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c462164..4a253a9 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2807,27 +2807,11 @@
>        op1 = operands[1];
>        op2 = operands[2];
>      }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
> -  DONE;
> -}
> -)
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -			   (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
> +  aarch64_split_simd_combine (operands[0], op1, op2);
> +
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..46bd78b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))
>      {
>        rtx (*gen) (rtx, rtx, rtx);
>  
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [Neon intrinsics] Literal vector construction through vcombine is poor
  2017-06-19 13:37 ` Richard Earnshaw (lists)
@ 2017-06-22 23:10   ` Michael Collison
  2017-06-23  9:08     ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 6+ messages in thread
From: Michael Collison @ 2017-06-22 23:10 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 4276 bytes --]

Richard,

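I reworked the patch and retested on big-endian as well as little-endian. The original code swapped the operands twice in the big-endian case (once in the aarch64_combine<mode> expander and again in the aarch64_combine_internal<mode> splitter), which works out to no swap at all, so the big-endian code path is no longer needed.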

I also updated the ChangeLog per your comments. Okay for trunk?

2017-06-19  Michael Collison  <michael.collison@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly
	call aarch64_split_simd_combine.
	(aarch64_combine_internal<mode>): Delete pattern.
	* config/aarch64/aarch64.c (aarch64_split_simd_combine):
	Allow register and subreg operands.

-----Original Message-----
From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
Sent: Monday, June 19, 2017 6:37 AM
To: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
Cc: nd <nd@arm.com>
Subject: Re: [Neon intrinsics] Literal vector construction through vcombine is poor

On 16/06/17 22:08, Michael Collison wrote:
> This patch improves code generation for literal vector construction by expanding and exposing the pattern to rtl optimization earlier. The current implementation delays splitting the pattern until after reload which results in poor code generation for the following code:
> 
> 
> #include "arm_neon.h"
> 
> int16x8_t
> foo ()
> {
>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8)); }
> 
> Trunk generates:
> 
> foo:
> 	movi	v1.2s, 0
> 	movi	v0.4h, 0x8
> 	dup	d2, v1.d[0]
> 	ins	v2.d[1], v0.d[0]
> 	orr	v0.16b, v2.16b, v2.16b
> 	ret
> 
> With the patch we now generate:
> 
> foo:
> 	movi	v1.4h, 0x8
> 	movi	v0.4s, 0
> 	ins	v0.d[1], v1.d[0]
> 	ret
> 
> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
> 
> 2017-06-15  Michael Collison  <michael.collison@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
> 	Convert from define_insn_and_split into define_expand
> 	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
> 	Allow register and subreg operands.
> 

Your changelog entry is confusing.  You've deleted the aarch64_combine_internal<mode> pattern entirely, having merged some of its functionality directly into its caller (aarch64_combine<mode>).

So I think it should read:

* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call aarch64_split_simd_combine.
(aarch64_combine_internal<mode>): Delete pattern.
* ...

Note also there should be a space between the file name and the open bracket for the first function name.

Why don't you need the big-endian code path any more?

R.

> 
> pr7057.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index c462164..4a253a9 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2807,27 +2807,11 @@
>        op1 = operands[1];
>        op2 = operands[2];
>      }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
> op2));
> -  DONE;
> -}
> -)
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -			   (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
> +  aarch64_split_simd_combine (operands[0], op1, op2);
> +
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c 
> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
> rtx src2)
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))
>      {
>        rtx (*gen) (rtx, rtx, rtx);
>  
> 


[-- Attachment #2: pr7057v4.patch --]
[-- Type: application/octet-stream, Size: 1747 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c462164..3043f81 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2796,38 +2796,10 @@
    (match_operand:VDC 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx op1, op2;
-  if (BYTES_BIG_ENDIAN)
-    {
-      op1 = operands[2];
-      op2 = operands[1];
-    }
-  else
-    {
-      op1 = operands[1];
-      op2 = operands[2];
-    }
-  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
-  DONE;
-}
-)
+  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
 
-(define_insn_and_split "aarch64_combine_internal<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
-        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
-			   (match_operand:VDC 2 "register_operand" "w")))]
-  "TARGET_SIMD"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  if (BYTES_BIG_ENDIAN)
-    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
-  else
-    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
   DONE;
 }
-[(set_attr "type" "multiple")]
 )
 
 (define_expand "aarch64_simd_combine<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..46bd78b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
 
   gcc_assert (VECTOR_MODE_P (dst_mode));
 
-  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
+  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
+      && register_operand (src2, src_mode))
     {
       rtx (*gen) (rtx, rtx, rtx);
 
-- 
1.9.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Neon intrinsics] Literal vector construction through vcombine is poor
  2017-06-22 23:10   ` Michael Collison
@ 2017-06-23  9:08     ` Richard Earnshaw (lists)
  2017-06-27  6:14       ` Michael Collison
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Earnshaw (lists) @ 2017-06-23  9:08 UTC (permalink / raw)
  To: Michael Collison, GCC Patches; +Cc: nd

On 23/06/17 00:10, Michael Collison wrote:
> Richard,
> 
> I reworked the patch and retested on big endian as well as little. The original code was performing two swaps in the big endian case which works out to no swaps at all.
> 
> I also updated the ChangeLog per your comments. Okay for trunk?
> 
> 2017-06-19  Michael Collison  <michael.collison@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly
> 	call aarch64_split_simd_combine.
> 	* (aarch64_combine_internal<mode>): Delete pattern.
> 	* config/aarch64/aarch64.c (aarch64_split_simd_combine):
> 	Allow register and subreg operands.
> 
> -----Original Message-----
> From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
> Sent: Monday, June 19, 2017 6:37 AM
> To: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: nd <nd@arm.com>
> Subject: Re: [Neon intrinsics] Literal vector construction through vcombine is poor
> 
> On 16/06/17 22:08, Michael Collison wrote:
>> This patch improves code generation for literal vector construction by expanding and exposing the pattern to rtl optimization earlier. The current implementation delays splitting the pattern until after reload which results in poor code generation for the following code:
>>
>>
>> #include "arm_neon.h"
>>
>> int16x8_t
>> foo ()
>> {
>>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8)); }
>>
>> Trunk generates:
>>
>> foo:
>> 	movi	v1.2s, 0
>> 	movi	v0.4h, 0x8
>> 	dup	d2, v1.d[0]
>> 	ins	v2.d[1], v0.d[0]
>> 	orr	v0.16b, v2.16b, v2.16b
>> 	ret
>>
>> With the patch we now generate:
>>
>> foo:
>> 	movi	v1.4h, 0x8
>> 	movi	v0.4s, 0
>> 	ins	v0.d[1], v1.d[0]
>> 	ret
>>
>> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
>>
>> 2017-06-15  Michael Collison  <michael.collison@arm.com>
>>
>> 	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
>> 	Convert from define_insn_and_split into define_expand
>> 	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
>> 	Allow register and subreg operands.
>>
> 
> Your changelog entry is confusing.  You've deleted the aarch64_combine_internal<mode> pattern entirely, having merged some of its functionality directly into its caller (aarch64_combine<mode>).
> 
> So I think it should read:
> 
> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call aarch64_split_simd_combine.
> (aarch64_combine_internal<mode>): Delete pattern.
> * ...
> 
> Note also there should be a space between the file name and the open bracket for the first function name.
> 
> Why don't you need the big-endian code path any more?
> 
> R.
> 
>>
>> pr7057.patch
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>> b/gcc/config/aarch64/aarch64-simd.md
>> index c462164..4a253a9 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -2807,27 +2807,11 @@
>>        op1 = operands[1];
>>        op2 = operands[2];
>>      }
>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
>> op2));
>> -  DONE;
>> -}
>> -)
>>  
>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>> -			   (match_operand:VDC 2 "register_operand" "w")))]
>> -  "TARGET_SIMD"
>> -  "#"
>> -  "&& reload_completed"
>> -  [(const_int 0)]
>> -{
>> -  if (BYTES_BIG_ENDIAN)
>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>> -  else
>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>> +  aarch64_split_simd_combine (operands[0], op1, op2);
>> +
>>    DONE;
>>  }
>> -[(set_attr "type" "multiple")]
>>  )
>>  
>>  (define_expand "aarch64_simd_combine<mode>"
>> diff --git a/gcc/config/aarch64/aarch64.c 
>> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
>> rtx src2)
>>  
>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>  
>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>> +      && register_operand (src2, src_mode))
>>      {
>>        rtx (*gen) (rtx, rtx, rtx);
>>  
>>
> 
> 
> pr7057v4.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c462164..3043f81 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2796,38 +2796,10 @@
>     (match_operand:VDC 2 "register_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx op1, op2;
> -  if (BYTES_BIG_ENDIAN)
> -    {
> -      op1 = operands[2];
> -      op2 = operands[1];
> -    }
> -  else
> -    {
> -      op1 = operands[1];
> -      op2 = operands[2];
> -    }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
> -  DONE;
> -}
> -)
> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -			   (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..46bd78b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))
>      {
>        rtx (*gen) (rtx, rtx, rtx);
>  
> 

As far as I can see aarch64_split_simd_combine is only called from this
one expand pattern and the predicates on the pattern enforce all the
operands being registers.  Furthermore, aarch64_split_simd_combine does
nothing if they aren't all registers, which would obviously result in
wrong code generation.

So convert

> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))

into an assertion and make the following code unconditional.

OK with that change.

R.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [Neon intrinsics] Literal vector construction through vcombine is poor
  2017-06-23  9:08     ` Richard Earnshaw (lists)
@ 2017-06-27  6:14       ` Michael Collison
  2017-06-27 13:50         ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 6+ messages in thread
From: Michael Collison @ 2017-06-27  6:14 UTC (permalink / raw)
  To: Richard Earnshaw, GCC Patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 7430 bytes --]

Richard,

I reworked the patch using an assert as you suggested. Bootstrapped and retested. Okay for trunk?


-----Original Message-----
From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
Sent: Friday, June 23, 2017 2:09 AM
To: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
Cc: nd <nd@arm.com>
Subject: Re: [Neon intrinsics] Literal vector construction through vcombine is poor

On 23/06/17 00:10, Michael Collison wrote:
> Richard,
> 
> I reworked the patch and retested on big endian as well as little. The original code was performing two swaps in the big endian case which works out to no swaps at all.
> 
> I also updated the ChangeLog per your comments. Okay for trunk?
> 
> 2017-06-19  Michael Collison  <michael.collison@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly
> 	call aarch64_split_simd_combine.
> 	* (aarch64_combine_internal<mode>): Delete pattern.
> 	* config/aarch64/aarch64.c (aarch64_split_simd_combine):
> 	Allow register and subreg operands.
> 
> -----Original Message-----
> From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com]
> Sent: Monday, June 19, 2017 6:37 AM
> To: Michael Collison <Michael.Collison@arm.com>; GCC Patches 
> <gcc-patches@gcc.gnu.org>
> Cc: nd <nd@arm.com>
> Subject: Re: [Neon intrinsics] Literal vector construction through 
> vcombine is poor
> 
> On 16/06/17 22:08, Michael Collison wrote:
>> This patch improves code generation for literal vector construction by expanding and exposing the pattern to rtl optimization earlier. The current implementation delays splitting the pattern until after reload which results in poor code generation for the following code:
>>
>>
>> #include "arm_neon.h"
>>
>> int16x8_t
>> foo ()
>> {
>>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8)); }
>>
>> Trunk generates:
>>
>> foo:
>> 	movi	v1.2s, 0
>> 	movi	v0.4h, 0x8
>> 	dup	d2, v1.d[0]
>> 	ins	v2.d[1], v0.d[0]
>> 	orr	v0.16b, v2.16b, v2.16b
>> 	ret
>>
>> With the patch we now generate:
>>
>> foo:
>> 	movi	v1.4h, 0x8
>> 	movi	v0.4s, 0
>> 	ins	v0.d[1], v1.d[0]
>> 	ret
>>
>> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
>>
>> 2017-06-15  Michael Collison  <michael.collison@arm.com>
>>
>> 	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
>> 	Convert from define_insn_and_split into define_expand
>> 	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
>> 	Allow register and subreg operands.
>>
> 
> Your changelog entry is confusing.  You've deleted the aarch64_combine_internal<mode> pattern entirely, having merged some of its functionality directly into its caller (aarch64_combine<mode>).
> 
> So I think it should read:
> 
> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call aarch64_split_simd_combine.
> (aarch64_combine_internal<mode>): Delete pattern.
> * ...
> 
> Note also there should be a space between the file name and the open bracket for the first function name.
> 
> Why don't you need the big-endian code path any more?
> 
> R.
> 
>>
>> pr7057.patch
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> index c462164..4a253a9 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -2807,27 +2807,11 @@
>>        op1 = operands[1];
>>        op2 = operands[2];
>>      }
>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
>> op2));
>> -  DONE;
>> -}
>> -)
>>  
>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>> -			   (match_operand:VDC 2 "register_operand" "w")))]
>> -  "TARGET_SIMD"
>> -  "#"
>> -  "&& reload_completed"
>> -  [(const_int 0)]
>> -{
>> -  if (BYTES_BIG_ENDIAN)
>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>> -  else
>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>> +  aarch64_split_simd_combine (operands[0], op1, op2);
>> +
>>    DONE;
>>  }
>> -[(set_attr "type" "multiple")]
>>  )
>>  
>>  (define_expand "aarch64_simd_combine<mode>"
>> diff --git a/gcc/config/aarch64/aarch64.c 
>> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
>> rtx src2)
>>  
>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>  
>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>> +      && register_operand (src2, src_mode))
>>      {
>>        rtx (*gen) (rtx, rtx, rtx);
>>  
>>
> 
> 
> pr7057v4.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index c462164..3043f81 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2796,38 +2796,10 @@
>     (match_operand:VDC 2 "register_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx op1, op2;
> -  if (BYTES_BIG_ENDIAN)
> -    {
> -      op1 = operands[2];
> -      op2 = operands[1];
> -    }
> -  else
> -    {
> -      op1 = operands[1];
> -      op2 = operands[2];
> -    }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
> op2));
> -  DONE;
> -}
> -)
> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -			   (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c 
> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
> rtx src2)
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))
>      {
>        rtx (*gen) (rtx, rtx, rtx);
>  
> 

As far as I can see aarch64_split_simd_combine is only called from this one expand pattern and the predicates on the pattern enforce all the operands being registers.  Furthermore, aarch64_split_simd_combine does nothing if they aren't all registers, which would obviously result in wrong code generation.

So convert

> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
> +      && register_operand (src2, src_mode))

into an assertion and make the following code unconditional.

OK with that change.

R.


[-- Attachment #2: pr7057v5.patch --]
[-- Type: application/octet-stream, Size: 3134 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c462164..3043f81 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2796,38 +2796,10 @@
    (match_operand:VDC 2 "register_operand")]
   "TARGET_SIMD"
 {
-  rtx op1, op2;
-  if (BYTES_BIG_ENDIAN)
-    {
-      op1 = operands[2];
-      op2 = operands[1];
-    }
-  else
-    {
-      op1 = operands[1];
-      op2 = operands[2];
-    }
-  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
-  DONE;
-}
-)
+  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
 
-(define_insn_and_split "aarch64_combine_internal<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
-        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
-			   (match_operand:VDC 2 "register_operand" "w")))]
-  "TARGET_SIMD"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  if (BYTES_BIG_ENDIAN)
-    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
-  else
-    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
   DONE;
 }
-[(set_attr "type" "multiple")]
 )
 
 (define_expand "aarch64_simd_combine<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..12ae238 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1649,41 +1649,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
   machine_mode dst_mode = GET_MODE (dst);
 
   gcc_assert (VECTOR_MODE_P (dst_mode));
+  gcc_assert (register_operand (dst, dst_mode)
+	      && register_operand (src1, src_mode)
+	      && register_operand (src2, src_mode));
 
-  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
-    {
-      rtx (*gen) (rtx, rtx, rtx);
-
-      switch (src_mode)
-	{
-	case V8QImode:
-	  gen = gen_aarch64_simd_combinev8qi;
-	  break;
-	case V4HImode:
-	  gen = gen_aarch64_simd_combinev4hi;
-	  break;
-	case V2SImode:
-	  gen = gen_aarch64_simd_combinev2si;
-	  break;
-	case V4HFmode:
-	  gen = gen_aarch64_simd_combinev4hf;
-	  break;
-	case V2SFmode:
-	  gen = gen_aarch64_simd_combinev2sf;
-	  break;
-	case DImode:
-	  gen = gen_aarch64_simd_combinedi;
-	  break;
-	case DFmode:
-	  gen = gen_aarch64_simd_combinedf;
-	  break;
-	default:
-	  gcc_unreachable ();
-	}
+  rtx (*gen) (rtx, rtx, rtx);
 
-      emit_insn (gen (dst, src1, src2));
-      return;
+  switch (src_mode)
+    {
+    case V8QImode:
+      gen = gen_aarch64_simd_combinev8qi;
+      break;
+    case V4HImode:
+      gen = gen_aarch64_simd_combinev4hi;
+      break;
+    case V2SImode:
+      gen = gen_aarch64_simd_combinev2si;
+      break;
+    case V4HFmode:
+      gen = gen_aarch64_simd_combinev4hf;
+      break;
+    case V2SFmode:
+      gen = gen_aarch64_simd_combinev2sf;
+      break;
+    case DImode:
+      gen = gen_aarch64_simd_combinedi;
+      break;
+    case DFmode:
+      gen = gen_aarch64_simd_combinedf;
+      break;
+    default:
+      gcc_unreachable ();
     }
+
+  emit_insn (gen (dst, src1, src2));
+  return;
 }
 
 /* Split a complex SIMD move.  */
-- 
1.9.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Neon intrinsics] Literal vector construction through vcombine is poor
  2017-06-27  6:14       ` Michael Collison
@ 2017-06-27 13:50         ` Richard Earnshaw (lists)
  0 siblings, 0 replies; 6+ messages in thread
From: Richard Earnshaw (lists) @ 2017-06-27 13:50 UTC (permalink / raw)
  To: Michael Collison, GCC Patches; +Cc: nd

On 27/06/17 07:13, Michael Collison wrote:
> Richard,
> 
> I reworked the patch using an assert as you suggested. Bootstrapped and retested. Okay for trunk?
> 

Yes, fine thanks.

R.

> 
> -----Original Message-----
> From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
> Sent: Friday, June 23, 2017 2:09 AM
> To: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: nd <nd@arm.com>
> Subject: Re: [Neon intrinsics] Literal vector construction through vcombine is poor
> 
> On 23/06/17 00:10, Michael Collison wrote:
>> Richard,
>>
>> I reworked the patch and retested on big endian as well as little. The original code was performing two swaps in the big endian case which works out to no swaps at all.
>>
>> I also updated the ChangeLog per your comments. Okay for trunk?
>>
>> 2017-06-19  Michael Collison  <michael.collison@arm.com>
>>
>> 	* config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly
>> 	call aarch64_split_simd_combine.
>> 	* (aarch64_combine_internal<mode>): Delete pattern.
>> 	* config/aarch64/aarch64.c (aarch64_split_simd_combine):
>> 	Allow register and subreg operands.
>>
>> -----Original Message-----
>> From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com]
>> Sent: Monday, June 19, 2017 6:37 AM
>> To: Michael Collison <Michael.Collison@arm.com>; GCC Patches 
>> <gcc-patches@gcc.gnu.org>
>> Cc: nd <nd@arm.com>
>> Subject: Re: [Neon intrinsics] Literal vector construction through 
>> vcombine is poor
>>
>> On 16/06/17 22:08, Michael Collison wrote:
>>> This patch improves code generation for literal vector construction by expanding and exposing the pattern to rtl optimization earlier. The current implementation delays splitting the pattern until after reload which results in poor code generation for the following code:
>>>
>>>
>>> #include "arm_neon.h"
>>>
>>> int16x8_t
>>> foo ()
>>> {
>>>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8));
>>> }
>>>
>>> Trunk generates:
>>>
>>> foo:
>>> 	movi	v1.2s, 0
>>> 	movi	v0.4h, 0x8
>>> 	dup	d2, v1.d[0]
>>> 	ins	v2.d[1], v0.d[0]
>>> 	orr	v0.16b, v2.16b, v2.16b
>>> 	ret
>>>
>>> With the patch we now generate:
>>>
>>> foo:
>>> 	movi	v1.4h, 0x8
>>> 	movi	v0.4s, 0
>>> 	ins	v0.d[1], v1.d[0]
>>> 	ret
>>>
>>> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
>>>
>>> 2017-06-15  Michael Collison  <michael.collison@arm.com>
>>>
>>> 	* config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
>>> 	Convert from define_insn_and_split into define_expand
>>> 	* config/aarch64/aarch64.c(aarch64_split_simd_combine):
>>> 	Allow register and subreg operands.
>>>
>>
>> Your changelog entry is confusing.  You've deleted the aarch64_combine_internal<mode> pattern entirely, having merged some of its functionality directly into its caller (aarch64_combine<mode>).
>>
>> So I think it should read:
>>
>> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call aarch64_split_simd_combine.
>> (aarch64_combine_internal<mode>): Delete pattern.
>> * ...
>>
>> Note also there should be a space between the file name and the open bracket for the first function name.
>>
>> Why don't you need the big-endian code path any more?
>>
>> R.
>>
>>>
>>> pr7057.patch
>>>
>>>
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>> index c462164..4a253a9 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -2807,27 +2807,11 @@
>>>        op1 = operands[1];
>>>        op2 = operands[2];
>>>      }
>>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
>>> -  DONE;
>>> -}
>>> -)
>>>  
>>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>>> -			   (match_operand:VDC 2 "register_operand" "w")))]
>>> -  "TARGET_SIMD"
>>> -  "#"
>>> -  "&& reload_completed"
>>> -  [(const_int 0)]
>>> -{
>>> -  if (BYTES_BIG_ENDIAN)
>>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>>> -  else
>>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>> +  aarch64_split_simd_combine (operands[0], op1, op2);
>>> +
>>>    DONE;
>>>  }
>>> -[(set_attr "type" "multiple")]
>>>  )
>>>  
>>>  (define_expand "aarch64_simd_combine<mode>"
>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>>> index 2e385c4..46bd78b 100644
>>> --- a/gcc/config/aarch64/aarch64.c
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
>>>  
>>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>>  
>>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>>> +      && register_operand (src2, src_mode))
>>>      {
>>>        rtx (*gen) (rtx, rtx, rtx);
>>>  
>>>
>>
>>
>> pr7057v4.patch
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index c462164..3043f81 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -2796,38 +2796,10 @@
>>     (match_operand:VDC 2 "register_operand")]
>>    "TARGET_SIMD"
>>  {
>> -  rtx op1, op2;
>> -  if (BYTES_BIG_ENDIAN)
>> -    {
>> -      op1 = operands[2];
>> -      op2 = operands[1];
>> -    }
>> -  else
>> -    {
>> -      op1 = operands[1];
>> -      op2 = operands[2];
>> -    }
>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
>> -  DONE;
>> -}
>> -)
>> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>  
>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>> -			   (match_operand:VDC 2 "register_operand" "w")))]
>> -  "TARGET_SIMD"
>> -  "#"
>> -  "&& reload_completed"
>> -  [(const_int 0)]
>> -{
>> -  if (BYTES_BIG_ENDIAN)
>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>> -  else
>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>    DONE;
>>  }
>> -[(set_attr "type" "multiple")]
>>  )
>>  
>>  (define_expand "aarch64_simd_combine<mode>"
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 2e385c4..46bd78b 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
>>  
>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>  
>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>> +      && register_operand (src2, src_mode))
>>      {
>>        rtx (*gen) (rtx, rtx, rtx);
>>  
>>
> 
> As far as I can see aarch64_split_simd_combine is only called from this one expand pattern and the predicates on the pattern enforce all the operands being registers.  Furthermore, aarch64_split_simd_combine does nothing if they aren't all registers, which would obviously result in wrong code generation.
> 
> So convert
> 
>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>> +      && register_operand (src2, src_mode))
> 
> into an assertion and make the following code unconditional.
> 
> OK with that change.
> 
> R.
> 
> 
> pr7057v5.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c462164..3043f81 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2796,38 +2796,10 @@
>     (match_operand:VDC 2 "register_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx op1, op2;
> -  if (BYTES_BIG_ENDIAN)
> -    {
> -      op1 = operands[2];
> -      op2 = operands[1];
> -    }
> -  else
> -    {
> -      op1 = operands[1];
> -      op2 = operands[2];
> -    }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
> -  DONE;
> -}
> -)
> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -			   (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..12ae238 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1649,41 +1649,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
>    machine_mode dst_mode = GET_MODE (dst);
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
> +  gcc_assert (register_operand (dst, dst_mode)
> +	      && register_operand (src1, src_mode)
> +	      && register_operand (src2, src_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> -    {
> -      rtx (*gen) (rtx, rtx, rtx);
> -
> -      switch (src_mode)
> -	{
> -	case V8QImode:
> -	  gen = gen_aarch64_simd_combinev8qi;
> -	  break;
> -	case V4HImode:
> -	  gen = gen_aarch64_simd_combinev4hi;
> -	  break;
> -	case V2SImode:
> -	  gen = gen_aarch64_simd_combinev2si;
> -	  break;
> -	case V4HFmode:
> -	  gen = gen_aarch64_simd_combinev4hf;
> -	  break;
> -	case V2SFmode:
> -	  gen = gen_aarch64_simd_combinev2sf;
> -	  break;
> -	case DImode:
> -	  gen = gen_aarch64_simd_combinedi;
> -	  break;
> -	case DFmode:
> -	  gen = gen_aarch64_simd_combinedf;
> -	  break;
> -	default:
> -	  gcc_unreachable ();
> -	}
> +  rtx (*gen) (rtx, rtx, rtx);
>  
> -      emit_insn (gen (dst, src1, src2));
> -      return;
> +  switch (src_mode)
> +    {
> +    case V8QImode:
> +      gen = gen_aarch64_simd_combinev8qi;
> +      break;
> +    case V4HImode:
> +      gen = gen_aarch64_simd_combinev4hi;
> +      break;
> +    case V2SImode:
> +      gen = gen_aarch64_simd_combinev2si;
> +      break;
> +    case V4HFmode:
> +      gen = gen_aarch64_simd_combinev4hf;
> +      break;
> +    case V2SFmode:
> +      gen = gen_aarch64_simd_combinev2sf;
> +      break;
> +    case DImode:
> +      gen = gen_aarch64_simd_combinedi;
> +      break;
> +    case DFmode:
> +      gen = gen_aarch64_simd_combinedf;
> +      break;
> +    default:
> +      gcc_unreachable ();
>      }
> +
> +  emit_insn (gen (dst, src1, src2));
> +  return;
>  }
>  
>  /* Split a complex SIMD move.  */
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2017-06-27 13:50 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-16 21:08 [Neon intrinsics] Literal vector construction through vcombine is poor Michael Collison
2017-06-19 13:37 ` Richard Earnshaw (lists)
2017-06-22 23:10   ` Michael Collison
2017-06-23  9:08     ` Richard Earnshaw (lists)
2017-06-27  6:14       ` Michael Collison
2017-06-27 13:50         ` Richard Earnshaw (lists)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).