[PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
@ 2020-06-17  9:09 Dmitrij Pochepko
  2020-06-23 15:55 ` Tamar Christina
  2020-06-23 17:10 ` Richard Sandiford
  0 siblings, 2 replies; 8+ messages in thread
From: Dmitrij Pochepko @ 2020-06-17  9:09 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 829 bytes --]

The following patch enables vector permutations optimization by trying to use ins instruction instead of slow and generic tbl.

example:
#define vector __attribute__((vector_size(4*sizeof(float))))

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}


was compiled into:
...
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
        ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions


This patch was initially introduced by me with Andrew Pinksi <apinski@marvell.com> being involved later.

Please note that test in this patch depends on another commit (PR82199), which I sent not long ago.

(I have no write access to repo)

Thanks,
Dmitrij

[-- Attachment #2: 0001-vector-creation-from-two-parts-of-two-vectors-produc.patch --]
[-- Type: text/x-diff, Size: 6807 bytes --]

From d4ccbcdf67648a095706213a0fe0ac856bb077bb Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com>
Date: Wed, 17 Jun 2020 12:04:10 +0300
Subject: [PATCH] vector creation from two parts of two vectors produces TBL
 rather than ins (PR93720)

The following patch enables vector permutations optimization by trying to use ins instruction instead of slow and generic tbl.

example:

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}

was compiled into:
...
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
        ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions

This patch was initially introduced by me with Andrew Pinksi <apinski@marvell.com> being involved later.
---
 gcc/config/aarch64/aarch64.c              | 85 +++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vins-1.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-2.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-3.c | 23 +++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-3.c

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ab7b39e..e0bde6d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20516,6 +20516,89 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the INS instructions.  */
+static bool
+aarch64_evpc_ins (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    if (!d->perm[i].is_constant ())
+      return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+  rtx insv = d->op0;
+
+  HOST_WIDE_INT idx = -1;
+
+  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+    {
+      if (d->perm[i].to_constant () == (HOST_WIDE_INT) i)
+	continue;
+      if (idx != -1)
+	{
+	  idx = -1;
+	  break;
+	}
+      idx = i;
+    }
+
+  if (idx == -1)
+    {
+      insv = d->op1;
+      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+	{
+	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
+	    continue;
+	  if (idx != -1)
+	    return false;
+	  idx = i;
+	}
+
+      if (idx == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (idx != -1);
+
+  unsigned extractindex = d->perm[idx].to_constant ();
+  rtx extractv = d->op0;
+  if (extractindex >= nelt)
+    {
+      extractv = d->op1;
+      extractindex -= nelt;
+    }
+  gcc_assert (extractindex < nelt);
+
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+
+  enum insn_code inscode = optab_handler (vec_set_optab, mode);
+  gcc_assert (inscode != CODE_FOR_nothing);
+  enum insn_code iextcode = convert_optab_handler (vec_extract_optab, mode,
+						   inner_mode);
+  gcc_assert (iextcode != CODE_FOR_nothing);
+  rtx tempinner = gen_reg_rtx (inner_mode);
+  emit_insn (GEN_FCN (iextcode) (tempinner, extractv, GEN_INT (extractindex)));
+
+  rtx temp = gen_reg_rtx (mode);
+  emit_move_insn (temp, insv);
+  emit_insn (GEN_FCN (inscode) (temp, tempinner, GEN_INT (idx)));
+
+  emit_move_insn (d->target, temp);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -20550,6 +20633,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_sel (d))
 	return true;
+      else if (aarch64_evpc_ins (d))
+	return true;
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-1.c b/gcc/testsuite/gcc.target/aarch64/vins-1.c
new file mode 100644
index 0000000..8e3725b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 0, 2, 3});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 3});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 2, 0});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-2.c b/gcc/testsuite/gcc.target/aarch64/vins-2.c
new file mode 100644
index 0000000..c50aa5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(8*sizeof(short))))
+
+vector short f0(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){6, 1, 2, 3, 4, 5, 6, 7});
+}
+vector short f2(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 1, 4, 5, 6, 7});
+}
+vector short f4(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 0, 5, 6, 7});
+}
+vector short f6(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 4, 5, 6, 1});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-3.c b/gcc/testsuite/gcc.target/aarch64/vins-3.c
new file mode 100644
index 0000000..e02d58c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-3.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){0, 5, 6, 7});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 0, 6, 7});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 0, 7});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 6, 0});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-06-17  9:09 [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720) Dmitrij Pochepko
@ 2020-06-23 15:55 ` Tamar Christina
  2020-06-23 17:10 ` Richard Sandiford
  1 sibling, 0 replies; 8+ messages in thread
From: Tamar Christina @ 2020-06-23 15:55 UTC (permalink / raw)
  To: Dmitrij Pochepko, gcc-patches

Adding AArch64 maintainers,
Also Dmitrij the patch lacks a changelog.

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Dmitrij
> Pochepko
> Sent: Wednesday, June 17, 2020 10:09 AM
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH][RFC] vector creation from two parts of two vectors
> produces TBL rather than ins (PR93720)
> 
> The following patch enables vector permutations optimization by trying to
> use ins instruction instead of slow and generic tbl.
> 
> example:
> #define vector __attribute__((vector_size(4*sizeof(float))))
> 
> vector float f0(vector float a, vector float b) {
>   return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3}); }
> 
> 
> was compiled into:
> ...
>         adrp    x0, .LC0
>         ldr     q1, [x0, #:lo12:.LC0]
>         tbl     v0.16b, {v0.16b}, v1.16b
> ...
> 
> and after patch:
> ...
>         ins     v0.s[0], v0.s[3]
> ...
> 
> bootstrapped and tested on aarch64-linux-gnu with no regressions
> 
> 
> This patch was initially introduced by me with Andrew Pinksi
> <apinski@marvell.com> being involved later.
> 
> Please note that test in this patch depends on another commit (PR82199),
> which I sent not long ago.
> 
> (I have no write access to repo)
> 
> Thanks,
> Dmitrij

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-06-17  9:09 [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720) Dmitrij Pochepko
  2020-06-23 15:55 ` Tamar Christina
@ 2020-06-23 17:10 ` Richard Sandiford
  2020-07-10 17:42   ` Dmitrij Pochepko
  1 sibling, 1 reply; 8+ messages in thread
From: Richard Sandiford @ 2020-06-23 17:10 UTC (permalink / raw)
  To: Dmitrij Pochepko; +Cc: gcc-patches

Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> writes:
> +  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
> +  for (unsigned int i = 0; i < encoded_nelts; ++i)
> +    if (!d->perm[i].is_constant ())
> +      return false;

I think it would be better to test this as part of the loop below.

> +
> +  /* to_constant is safe since this routine is specific to Advanced SIMD
> +     vectors.  */
> +  nelt = d->perm.length ().to_constant ();
> +  rtx insv = d->op0;
> +
> +  HOST_WIDE_INT idx = -1;
> +
> +  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
> +    {
> +      if (d->perm[i].to_constant () == (HOST_WIDE_INT) i)
> +	continue;
> +      if (idx != -1)
> +	{
> +	  idx = -1;
> +	  break;
> +	}
> +      idx = i;
> +    }
> +
> +  if (idx == -1)
> +    {
> +      insv = d->op1;
> +      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
> +	{
> +	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
> +	    continue;
> +	  if (idx != -1)
> +	    return false;
> +	  idx = i;
> +	}
> +
> +      if (idx == -1)
> +	return false;
> +    }
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  gcc_assert (idx != -1);
> +
> +  unsigned extractindex = d->perm[idx].to_constant ();
> +  rtx extractv = d->op0;
> +  if (extractindex >= nelt)
> +    {
> +      extractv = d->op1;
> +      extractindex -= nelt;
> +    }
> +  gcc_assert (extractindex < nelt);
> +
> +  machine_mode inner_mode = GET_MODE_INNER (mode);
> +
> +  enum insn_code inscode = optab_handler (vec_set_optab, mode);
> +  gcc_assert (inscode != CODE_FOR_nothing);
> +  enum insn_code iextcode = convert_optab_handler (vec_extract_optab, mode,
> +						   inner_mode);
> +  gcc_assert (iextcode != CODE_FOR_nothing);
> +  rtx tempinner = gen_reg_rtx (inner_mode);
> +  emit_insn (GEN_FCN (iextcode) (tempinner, extractv, GEN_INT (extractindex)));
> +
> +  rtx temp = gen_reg_rtx (mode);
> +  emit_move_insn (temp, insv);
> +  emit_insn (GEN_FCN (inscode) (temp, tempinner, GEN_INT (idx)));
> +
> +  emit_move_insn (d->target, temp);

I think it'd be better to generate the target instruction directly.
We can do that by replacing:

(define_insn "*aarch64_simd_vec_copy_lane<mode>"

with:

(define_insn "@aarch64_simd_vec_copy_lane<mode>"

then using the expand_insn interface to create an instance of
code_for_aarch64_simd_vec_copy_lane (mode).

> +/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */

Same comment as the other patch about using {…} regexp quoting.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-06-23 17:10 ` Richard Sandiford
@ 2020-07-10 17:42   ` Dmitrij Pochepko
  2020-07-11  8:52     ` Richard Sandiford
  0 siblings, 1 reply; 8+ messages in thread
From: Dmitrij Pochepko @ 2020-07-10 17:42 UTC (permalink / raw)
  To: gcc-patches, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 786 bytes --]

Hi,

thank you for reviewing it.

Please check updated version(attached) with all comments addressed.

Thanks,
Dmitrij

On Tue, Jun 23, 2020 at 06:10:52PM +0100, Richard Sandiford wrote:
...
> 
> I think it would be better to test this as part of the loop below.
> 
done

...
> I think it'd be better to generate the target instruction directly.
> We can do that by replacing:
> 
> (define_insn "*aarch64_simd_vec_copy_lane<mode>"
> 
> with:
> 
> (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> 
> then using the expand_insn interface to create an instance of
> code_for_aarch64_simd_vec_copy_lane (mode).
done

...
> 
> > +/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
> 
> Same comment as the other patch about using {…} regexp quoting.
> 
done

[-- Attachment #2: 0001-vector-creation-from-two-parts-of-two-vectors-produc.patch --]
[-- Type: text/x-diff, Size: 7688 bytes --]

From 8e7cfa2da407171a30d1e152f0e0f4be399d571e Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com>
Date: Fri, 10 Jul 2020 20:37:17 +0300
Subject: [PATCH] vector creation from two parts of two vectors produces TBL
 rather than ins (PR 93720)

The following patch enables vector permutations optimization by trying to use ins instruction instead of slow and generic tbl.

example:

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}

was compiled into:
...
	adrp    x0, .LC0
	ldr     q1, [x0, #:lo12:.LC0]
	tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
	ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions

gcc/ChangeLog:

2020-07-10     Andrew Pinski   <apinksi@marvell.com>

	PR gcc/93720

	* gcc/config/aarch64/aarch64.c (aarch64_evpc_ins): New function
	* gcc/config/aarch64/aarch64-simd.md (aarch64_simd_vec_copy_lane): changed name prefix

gcc/testsuite/ChangeLog:

2020-07-10      Andrew Pinski   <apinksi@marvell.com>

	PR gcc/93720

	* gcc/testsuite/gcc.target/aarch64/vins-1.c: New test
	* gcc/testsuite/gcc.target/aarch64/vins-2.c: New test
	* gcc/testsuite/gcc.target/aarch64/vins-3.c: New test

Co-Authored-By: Dmitrij Pochepko        <dmitrij.pochepko@bell-sw.com>
---
 gcc/config/aarch64/aarch64-simd.md        |  2 +-
 gcc/config/aarch64/aarch64.c              | 82 +++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vins-1.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-2.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-3.c | 23 +++++++++
 5 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-3.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd..11ebf5b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -958,7 +958,7 @@
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
-(define_insn "*aarch64_simd_vec_copy_lane<mode>"
+(define_insn "@aarch64_simd_vec_copy_lane<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_merge:VALL_F16
 	    (vec_duplicate:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9b31743..7544fd8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20594,6 +20594,86 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the INS instructions.  */
+static bool
+aarch64_evpc_ins (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    if (!d->perm[i].is_constant ())
+      return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+  rtx insv = d->op0;
+
+  HOST_WIDE_INT idx = -1;
+
+  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+    {
+      poly_int64 elt = d->perm[i];
+      if (!elt.is_constant ())
+	return false;
+      if (elt.to_constant () == (HOST_WIDE_INT) i)
+	continue;
+      if (idx != -1)
+	{
+	  idx = -1;
+	  break;
+	}
+      idx = i;
+    }
+
+  if (idx == -1)
+    {
+      insv = d->op1;
+      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+	{
+	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
+	    continue;
+	  if (idx != -1)
+	    return false;
+	  idx = i;
+	}
+
+      if (idx == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (idx != -1);
+
+  unsigned extractindex = d->perm[idx].to_constant ();
+  rtx extractv = d->op0;
+  if (extractindex >= nelt)
+    {
+      extractv = d->op1;
+      extractindex -= nelt;
+    }
+  gcc_assert (extractindex < nelt);
+
+  emit_move_insn (d->target, insv);
+  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+  expand_operand ops[5];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], d->target, mode);
+  create_integer_operand (&ops[2], 1 << idx);
+  create_input_operand (&ops[3], extractv, mode);
+  create_integer_operand (&ops[4], extractindex);
+  expand_insn (icode, 5, ops);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -20628,6 +20708,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_sel (d))
 	return true;
+      else if (aarch64_evpc_ins (d))
+	return true;
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-1.c b/gcc/testsuite/gcc.target/aarch64/vins-1.c
new file mode 100644
index 0000000..eefb93c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 0, 2, 3});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 3});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 2, 0});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.s} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-2.c b/gcc/testsuite/gcc.target/aarch64/vins-2.c
new file mode 100644
index 0000000..1501b60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(8*sizeof(short))))
+
+vector short f0(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){6, 1, 2, 3, 4, 5, 6, 7});
+}
+vector short f2(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 1, 4, 5, 6, 7});
+}
+vector short f4(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 0, 5, 6, 7});
+}
+vector short f6(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 4, 5, 6, 1});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.h} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-3.c b/gcc/testsuite/gcc.target/aarch64/vins-3.c
new file mode 100644
index 0000000..3b08701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-3.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){0, 5, 6, 7});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 0, 6, 7});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 0, 7});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 6, 0});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.s} 4 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-07-10 17:42   ` Dmitrij Pochepko
@ 2020-07-11  8:52     ` Richard Sandiford
  2020-07-14 12:52       ` Dmitrij Pochepko
  0 siblings, 1 reply; 8+ messages in thread
From: Richard Sandiford @ 2020-07-11  8:52 UTC (permalink / raw)
  To: Dmitrij Pochepko; +Cc: gcc-patches

Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> writes:
> Hi,
>
> thank you for reviewing it.
>
> Please check updated version(attached) with all comments addressed.
>
> Thanks,
> Dmitrij
>
> On Tue, Jun 23, 2020 at 06:10:52PM +0100, Richard Sandiford wrote:
> ...
>> 
>> I think it would be better to test this as part of the loop below.
>> 
> done

For this point, I meant that we should remove the first loop too.  I.e.:

> +  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
> +  for (unsigned int i = 0; i < encoded_nelts; ++i)
> +    if (!d->perm[i].is_constant ())
> +      return false;

is now redundant with the later:

> +  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
> +    {
> +      poly_int64 elt = d->perm[i];
> +      if (!elt.is_constant ())
> +	return false;
> +      if (elt.to_constant () == (HOST_WIDE_INT) i)
> +	continue;

However, a more canonical way to write the condition above is:

  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
    {
      HOST_WIDE_INT elt;
      if (!elt.is_constant (&elt))
        return false;
      if (elt == (HOST_WIDE_INT) i)
        continue;

Very minor, but the coding conventions don't put a space before “++”.
So:

> +      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)

…this should be “i++” too.

Looks good otherwise, thanks.

Richard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-07-11  8:52     ` Richard Sandiford
@ 2020-07-14 12:52       ` Dmitrij Pochepko
  2020-07-17  9:25         ` Richard Sandiford
  0 siblings, 1 reply; 8+ messages in thread
From: Dmitrij Pochepko @ 2020-07-14 12:52 UTC (permalink / raw)
  To: gcc-patches, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 780 bytes --]

Hi,

please take a look at updated patch with all comments addressed (attached).

Thanks,
Dmitrij

On Sat, Jul 11, 2020 at 09:52:40AM +0100, Richard Sandiford wrote:
...
> 
> For this point, I meant that we should remove the first loop too.  I.e.:
>
... 
> 
> is now redundant with the later:
>
... 
> 
> However, a more canonical way to write the condition above is:
> 
>   for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
>     {
>       HOST_WIDE_INT elt;
>       if (!elt.is_constant (&elt))
>         return false;
>       if (elt == (HOST_WIDE_INT) i)
>         continue;
> 

done

> Very minor, but the coding conventions don't put a space before “++”.
> So:
> 
> > +      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
> 
> …this should be “i++” too.

done

[-- Attachment #2: 0001-vector-creation-from-two-parts-of-two-vectors-produc.patch --]
[-- Type: text/x-diff, Size: 7490 bytes --]

From 9acc14f4cdd10091daa5311f495daacfebdcfc3d Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com>
Date: Tue, 14 Jul 2020 15:48:46 +0300
Subject: [PATCH] vector creation from two parts of two vectors produces TBL
 rather than ins (PR 93720)

The following patch enables vector permutations optimization by trying to use ins instruction instead of slow and generic tbl.

example:

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}

was compiled into:
...
	adrp    x0, .LC0
	ldr     q1, [x0, #:lo12:.LC0]
	tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
	ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions

gcc/ChangeLog:

2020-07-14     Andrew Pinski   <apinksi@marvell.com>

	PR gcc/93720

	* gcc/config/aarch64/aarch64.c (aarch64_evpc_ins): New function
	* gcc/config/aarch64/aarch64-simd.md (aarch64_simd_vec_copy_lane): changed name prefix

gcc/testsuite/ChangeLog:

2020-07-14      Andrew Pinski   <apinksi@marvell.com>

	PR gcc/93720

	* gcc/testsuite/gcc.target/aarch64/vins-1.c: New test
	* gcc/testsuite/gcc.target/aarch64/vins-2.c: New test
	* gcc/testsuite/gcc.target/aarch64/vins-3.c: New test

Co-Authored-By: Dmitrij Pochepko        <dmitrij.pochepko@bell-sw.com>
---
 gcc/config/aarch64/aarch64-simd.md        |  2 +-
 gcc/config/aarch64/aarch64.c              | 77 +++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vins-1.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-2.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-3.c | 23 +++++++++
 5 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-3.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd..11ebf5b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -958,7 +958,7 @@
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
-(define_insn "*aarch64_simd_vec_copy_lane<mode>"
+(define_insn "@aarch64_simd_vec_copy_lane<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_merge:VALL_F16
 	    (vec_duplicate:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e259d05..f1c5b5a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20594,6 +20594,81 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the INS instructions.  */
+static bool
+aarch64_evpc_ins (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+  rtx insv = d->op0;
+
+  HOST_WIDE_INT idx = -1;
+
+  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
+    {
+      HOST_WIDE_INT elt;
+      if (!d->perm[i].is_constant (&elt))
+	return false;
+      if (elt == (HOST_WIDE_INT) i)
+	continue;
+      if (idx != -1)
+	{
+	  idx = -1;
+	  break;
+	}
+      idx = i;
+    }
+
+  if (idx == -1)
+    {
+      insv = d->op1;
+      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
+	{
+	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
+	    continue;
+	  if (idx != -1)
+	    return false;
+	  idx = i;
+	}
+
+      if (idx == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (idx != -1);
+
+  unsigned extractindex = d->perm[idx].to_constant ();
+  rtx extractv = d->op0;
+  if (extractindex >= nelt)
+    {
+      extractv = d->op1;
+      extractindex -= nelt;
+    }
+  gcc_assert (extractindex < nelt);
+
+  emit_move_insn (d->target, insv);
+  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+  expand_operand ops[5];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], d->target, mode);
+  create_integer_operand (&ops[2], 1 << idx);
+  create_input_operand (&ops[3], extractv, mode);
+  create_integer_operand (&ops[4], extractindex);
+  expand_insn (icode, 5, ops);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -20628,6 +20703,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_sel (d))
 	return true;
+      else if (aarch64_evpc_ins (d))
+	return true;
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-1.c b/gcc/testsuite/gcc.target/aarch64/vins-1.c
new file mode 100644
index 0000000..eefb93c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 0, 2, 3});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 3});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 2, 0});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.s} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-2.c b/gcc/testsuite/gcc.target/aarch64/vins-2.c
new file mode 100644
index 0000000..1501b60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(8*sizeof(short))))
+
+vector short f0(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){6, 1, 2, 3, 4, 5, 6, 7});
+}
+vector short f2(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 1, 4, 5, 6, 7});
+}
+vector short f4(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 0, 5, 6, 7});
+}
+vector short f6(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 4, 5, 6, 1});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.h} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-3.c b/gcc/testsuite/gcc.target/aarch64/vins-3.c
new file mode 100644
index 0000000..3b08701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-3.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){0, 5, 6, 7});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 0, 6, 7});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 0, 7});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 6, 0});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*ins[ \t]+v[0-9]+\.s} 4 } } */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-07-14 12:52       ` Dmitrij Pochepko
@ 2020-07-17  9:25         ` Richard Sandiford
  2020-07-17 13:13           ` Dmitrij Pochepko
  0 siblings, 1 reply; 8+ messages in thread
From: Richard Sandiford @ 2020-07-17  9:25 UTC (permalink / raw)
  To: Dmitrij Pochepko; +Cc: gcc-patches

Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> writes:
> Hi,
>
> please take a look at updated patch with all comments addressed (attached).

Thanks, pushed to master with a slightly tweaked changelog.

Richard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720)
  2020-07-17  9:25         ` Richard Sandiford
@ 2020-07-17 13:13           ` Dmitrij Pochepko
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitrij Pochepko @ 2020-07-17 13:13 UTC (permalink / raw)
  To: gcc-patches, richard.sandiford

Thank you!

On 17.07.2020 12:25, Richard Sandiford wrote:
> Dmitrij Pochepko <dmitrij.pochepko@bell-sw.com> writes:
>> Hi,
>>
>> please take a look at updated patch with all comments addressed (attached).
> Thanks, pushed to master with a slightly tweaked changelog.
>
> Richard

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2020-07-17 13:13 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-17  9:09 [PATCH][RFC] vector creation from two parts of two vectors produces TBL rather than ins (PR93720) Dmitrij Pochepko
2020-06-23 15:55 ` Tamar Christina
2020-06-23 17:10 ` Richard Sandiford
2020-07-10 17:42   ` Dmitrij Pochepko
2020-07-11  8:52     ` Richard Sandiford
2020-07-14 12:52       ` Dmitrij Pochepko
2020-07-17  9:25         ` Richard Sandiford
2020-07-17 13:13           ` Dmitrij Pochepko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).