public inbox for gcc-patches@gcc.gnu.org
* [PATCH, ARM] Constant vector permute for the Neon vext insn
@ 2012-08-24  7:45 Christophe Lyon
  2012-08-24  8:40 ` Richard Earnshaw
  0 siblings, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-24  7:45 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 774 bytes --]

Hi,

The patch below enables GCC for ARM to implement relevant constant
vector permutations using the Neon vext instruction, by extending the
support currently in place for vrev, vzip, vuzp and vtrn.

For the cases where vext and vrev would lead to the same result, I
have chosen to keep using vrev, to avoid updating the testsuite: the
two are either equivalent (1 cycle each) or vrev is faster (1 cycle
when operating on Qn registers vs 2 cycles for vext).
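
As a minimal illustration (the function name and mask below are just an
example, not part of the attached patch), a constant __builtin_shuffle
whose mask selects a contiguous window of elements from the
concatenation of the two inputs should now map to a single vext:

#include <arm_neon.h>

uint8x8_t
pick_window (uint8x8_t a, uint8x8_t b)
{
  /* Elements 3..10 of the pair {a, b}; on little-endian this is
     expected to become a single vext.8 with offset #3.  */
  uint8x8_t mask = { 3, 4, 5, 6, 7, 8, 9, 10 };
  return __builtin_shuffle (a, b, mask);
}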

Tested with qemu-arm on arm-none-linux-gnueabi.

Christophe.

2012-08-23  Christophe Lyon  <christophe.lyon@linaro.org>

    gcc/
    * config/arm/arm.c (arm_evpc_neon_vext): New
    function.
    (arm_expand_vec_perm_const_1): Add call to
    arm_evpc_neon_vext.

    gcc/testsuite/
    * gcc.target/arm/neon-vext.c: New tests.

[-- Attachment #2: gcc-vec-permute-vext.patch --]
[-- Type: application/octet-stream, Size: 5598 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 190590)
+++ gcc/config/arm/arm.c	(working copy)
@@ -25940,6 +25940,64 @@ arm_evpc_neon_vtrn (struct expand_vec_pe
   return true;
 }
 
+/* Recognize patterns for the VEXT insns.  */
+
+static bool
+arm_evpc_neon_vext (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+  rtx offset;
+
+  unsigned int next = d->perm[0] + 1;
+
+  /* Check if the extracted indexes are increasing by one.  */
+  for (i = 1; i < nelt; next++, i++)
+    {
+      /* If we hit the most significant element of the 2nd vector in
+	 the previous iteration, no need to test further.  */
+      if (next == 2 * nelt)
+	return false;
+
+      /* If we are operating on only one vector: it could be a
+	 rotation.  If there are only two elements of size < 64, let
+	 arm_evpc_neon_vrev catch it.  */
+      if (d->one_vector_p && (next == nelt))
+	{
+	  if ((nelt == 2) && (d->vmode != V2DImode))
+	    return false;
+	  else
+	    next = 0;
+	}
+
+      if (d->perm[i] != next)
+	return false;
+    }
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vextv16qi; break;
+    case V8QImode: gen = gen_neon_vextv8qi; break;
+    case V4HImode: gen = gen_neon_vextv4hi; break;
+    case V8HImode: gen = gen_neon_vextv8hi; break;
+    case V2SImode: gen = gen_neon_vextv2si; break;
+    case V4SImode: gen = gen_neon_vextv4si; break;
+    case V2SFmode: gen = gen_neon_vextv2sf; break;
+    case V4SFmode: gen = gen_neon_vextv4sf; break;
+    case V2DImode: gen = gen_neon_vextv2di; break;
+    default:
+      return false;
+    }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  offset = gen_rtx_CONST_INT (VOIDmode, d->perm[0]);
+  emit_insn (gen (d->target, d->op0, d->op1, offset));
+  return true;
+}
+
 /* The NEON VTBL instruction is a fully variable permuation that's even
    stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
    is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
@@ -25979,6 +26037,12 @@ arm_evpc_neon_vtbl (struct expand_vec_pe
 static bool
 arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
+  /* Check if the input mask matches vext before reordering the
+     operands.  */
+  if (TARGET_NEON)
+    if (arm_evpc_neon_vext (d))
+      return true;
+
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
      from the second operand, we can swap the operands.  */
Index: gcc/testsuite/gcc.target/arm/neon-vext.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint8x8_t
+tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8_t __mask1 = { 2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x8_t
+tst_vext_u8_rotate (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = { 2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4_t __mask1 = { 2, 3, 4, 5};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16_rotate (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = { 2, 3, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x2_t
+tst_vext_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2_t __mask1 = { 1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+/* This one is mapped into vrev64.32.  */
+uint32x2_t
+tst_vext_u32_rotate (uint32x2_t __a)
+{
+  uint32x2_t __mask1 = { 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16_t __mask1 = { 4, 5, 6, 7, 8, 9, 10, 11,
+			 12, 13, 14, 15, 16, 17, 18, 19};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8_rotate (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = { 4, 5, 6, 7, 8, 9, 10, 11,
+			 12, 13, 14, 15, 0, 1, 2, 3};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8_t __mask1 = { 2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16_rotate (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = { 2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4_t __mask1 = { 1, 2, 3, 4};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32_rotate (uint32x4_t __a)
+{
+  uint32x4_t __mask1 = { 1, 2, 3, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  uint64x2_t __mask1 = { 1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64_rotate (uint64x2_t __a)
+{
+  uint64x2_t __mask1 = { 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+/* { dg-final {scan-assembler-times "vext\.8\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.16\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.32\\t" 3} }  */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 1} }  */
+/* { dg-final {scan-assembler-times "vext\.64\\t" 2} }  */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-24  7:45 [PATCH, ARM] Constant vector permute for the Neon vext insn Christophe Lyon
@ 2012-08-24  8:40 ` Richard Earnshaw
  2012-08-24  8:54   ` Christophe Lyon
  2012-08-27 15:03   ` Christophe Lyon
  0 siblings, 2 replies; 12+ messages in thread
From: Richard Earnshaw @ 2012-08-24  8:40 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On 24/08/12 08:45, Christophe Lyon wrote:
> Hi,
> 
> The patch below enables GCC for ARM to implement relevant constant
> vector permutations using the Neon vext instruction, by extending the
> support currently in place for vrev, vzip, vuzp and vtrn.
> 
> For the cases where vext and vrev would lead to the same result, I
> have chosen to keep using vrev, to avoid updating the testsuite: the
> two are either equivalent (1 cycle each) or vrev is faster (1 cycle
> when operating on Qn registers vs 2 cycles for vext).
> 
> Tested with qemu-arm on arm-none-linux-gnueabi.
> 
> Christophe.
> 
> 2012-08-23  Christophe Lyon  <christophe.lyon@linaro.org>
> 
>     gcc/
>     * config/arm/arm.c (arm_evpc_neon_vext): New
>     function.
>     (arm_expand_vec_perm_const_1): Add call to
>     arm_evpc_neon_vext.
> 
>     gcc/testsuite/
>     * gcc.target/arm/neon-vext.c: New tests.
> 

Has this been tested for big-endian?

R.




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-24  8:40 ` Richard Earnshaw
@ 2012-08-24  8:54   ` Christophe Lyon
  2012-08-31 13:23     ` Christophe Lyon
  2012-08-27 15:03   ` Christophe Lyon
  1 sibling, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-24  8:54 UTC (permalink / raw)
  To: Richard Earnshaw, gcc-patches

On 24 August 2012 10:40, Richard Earnshaw <rearnsha@arm.com> wrote:
>
> Has this been tested for big-endian?
>
> R.

No. I'll take a look at it and let you know.

Christophe.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-24  8:40 ` Richard Earnshaw
  2012-08-24  8:54   ` Christophe Lyon
@ 2012-08-27 15:03   ` Christophe Lyon
  2012-08-27 19:28     ` Janis Johnson
  1 sibling, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-27 15:03 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches

[ Richard, sorry for the duplicate message where I omitted the mailing-list]

On 24 August 2012 10:40, Richard Earnshaw <rearnsha@arm.com> wrote:
>
> Has this been tested for big-endian?
>

Hi,
While improving my tests and trying to turn them into execution tests,
I realized that vector initialization such as:
  uint16x4_t vec1 = {0x1234, 0x5678, 0x9abc, 0xdef0};
is endianness dependent.

However, I have noticed that other tests (such as neon-vrev.c,
neon-vset_lanes8.c, pr48252) do use such constructs, and the last
two fail at execution in big-endian mode (the first one is only
compiled).

I guess that the 'right' (portable) way of initializing a vector is to
load it from an array, right?
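
For reference, a minimal sketch of what I mean (vld1 fills lane 0 with
the first array element regardless of endianness, so the test data is
the same either way; this fragment would go inside a function with
arm_neon.h included):

  uint16_t init[4] = {0x1234, 0x5678, 0x9abc, 0xdef0};
  uint16x4_t vec1 = vld1_u16 (init);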

Thanks,

Christophe.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-27 15:03   ` Christophe Lyon
@ 2012-08-27 19:28     ` Janis Johnson
  2012-08-28 14:20       ` Christophe Lyon
  0 siblings, 1 reply; 12+ messages in thread
From: Janis Johnson @ 2012-08-27 19:28 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Richard Earnshaw, gcc-patches

On 08/27/2012 08:02 AM, Christophe Lyon wrote:
> [ Richard, sorry for the duplicate message where I omitted the mailing-list]
> 
> On 24 August 2012 10:40, Richard Earnshaw <rearnsha@arm.com> wrote:
>>
>> Has this been tested for big-endian?
>>
> 
> Hi,
> While improving my tests and trying to turn them into execution tests,
> I realized that vector initialization such as:
>   uint16x4_t vec1 = {0x1234, 0x5678, 0x9abc, 0xdef0};
> is endianness dependent.
> 
> However, I have noticed that other tests (such as neon-vrev.c,
> neon-vset_lanes8.c, pr48252) do use such constructs, and the last
> two fail at execution in big-endian mode (the first one is only
> compiled).
> 
> I guess that the 'right' (portable) way of initializing a vector is to
> load it from an array, right?

See http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01114.html for Richard
Earnshaw's suggestion on how to fix neon-vset_lanes8.c, and an alternate
suggestion for changing the compiler.

Janis

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-27 19:28     ` Janis Johnson
@ 2012-08-28 14:20       ` Christophe Lyon
  2012-08-28 15:19         ` Christophe Lyon
  0 siblings, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-28 14:20 UTC (permalink / raw)
  To: janisjo; +Cc: Richard Earnshaw, gcc-patches

On 27 August 2012 21:29, Janis Johnson <janis_johnson@mentor.com> wrote:
> On 08/27/2012 08:02 AM, Christophe Lyon wrote:
>> I guess that the 'right' (portable) way of initializing a vector is to
>> load it from an array, right?
>
> See http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01114.html for Richard
> Earnshaw's suggestion on how to fix neon-vset_lanes8.c, and an alternate
> suggestion for changing the compiler.
>

Thanks for the pointer, which confirms it is much more complex than I
anticipated.

However, I still need to initialize the mask vector with GCC's vector
notation, not with vld1; otherwise the compiler cannot detect that the
mask vector is constant (and therefore suitable for this optimization).

This makes writing exhaustive, portable (big and little endian),
executable tests a painful task.
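
To make the constraint concrete, here is a sketch of the two forms
(fragments only; mask_array is a hypothetical uint8_t[8]):

  /* Constant mask, visible to the compiler: can be turned into vext.  */
  uint8x8_t mask = {2, 3, 4, 5, 6, 7, 8, 9};
  return __builtin_shuffle (__a, __b, mask);

versus:

  /* Mask loaded at run time: the compiler has to treat the permutation
     as variable and falls back to a vtbl-based sequence.  */
  uint8x8_t mask = vld1_u8 (mask_array);
  return __builtin_shuffle (__a, __b, mask);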

Thanks

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-28 14:20       ` Christophe Lyon
@ 2012-08-28 15:19         ` Christophe Lyon
  0 siblings, 0 replies; 12+ messages in thread
From: Christophe Lyon @ 2012-08-28 15:19 UTC (permalink / raw)
  To: janisjo; +Cc: Richard Earnshaw, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 711 bytes --]

On 28 August 2012 16:20, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> This makes writing exhaustive, portable (big and little endian),
> executable tests a painful task.
>
For instance, considering the attached sample code, I obtain a
different result in big-endian vs little-endian, while the input
values are the same.
Indeed, in both cases, the program prints:
__a[0] = 0 ..... __a[7] = 7
__b[0] = 8 .... __b[7] = 15
__mask1[0] = 2 .... __mask1[7] = 9
but in little-endian, the result of __builtin_shuffle(__a, __b, __mask1) is
mem[0] = 2, mem[1] = 3 .... mem[7] = 9
while in big-endian it is:
mem[0] = 5, mem[1] = 4, .... mem[5] = 0, mem[6] = 15, mem[7] = 14

What am I missing?

Thanks,

Christophe.

[-- Attachment #2: neon-shuffle-endian.c --]
[-- Type: text/x-csrc, Size: 1320 bytes --]

/* { dg-do run } */
/* { dg-require-effective-target arm_neon_ok } */
/* { dg-options "-O2" } */
/* { dg-add-options arm_neon } */

#include <arm_neon.h>
#include <stdlib.h>
#include <stdio.h>

uint8x8_t
tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
{
#ifdef __ARMEL__
  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
#else
  uint8x8_t __mask1 = {9, 8, 7, 6, 5, 4, 3, 2};
#endif

  union {uint8x8_t v; uint8_t buf[8];} mem_u8x8;
  int i;

  vst1_u8(mem_u8x8.buf, __a);
  for(i=0; i<8; i++) {
    fprintf(stderr, "__a[%d]=%d\n", i, mem_u8x8.buf[i]);
  }
  vst1_u8(mem_u8x8.buf, __b);
  for(i=0; i<8; i++) {
    fprintf(stderr, "__b[%d]=%d\n", i, mem_u8x8.buf[i]);
  }
  vst1_u8(mem_u8x8.buf, __mask1);
  for(i=0; i<8; i++) {
    fprintf(stderr, "__mask1[%d]=%d\n", i, mem_u8x8.buf[i]);
  }
  return __builtin_shuffle ( __a, __b, __mask1) ;
}

int main(void)
{
  uint8_t arr_u8x8[] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint8_t arr2_u8x8[] = {8, 9, 10, 11, 12, 13, 14, 15};

  uint8x8_t vec_u8x8 = vld1_u8(arr_u8x8);
  uint8x8_t vec2_u8x8 = vld1_u8(arr2_u8x8);

  uint8x8_t result_u8x8;

  union {uint8x8_t v; uint8_t buf[8];} mem_u8x8;

  int i;

  result_u8x8 = tst_vext_u8 (vec_u8x8, vec2_u8x8);
  vst1_u8(mem_u8x8.buf, result_u8x8);

  for (i=0; i<8; i++) {
    printf("mem_u8x8[%d]=%d\n", i, mem_u8x8.buf[i]);
  }

  return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-24  8:54   ` Christophe Lyon
@ 2012-08-31 13:23     ` Christophe Lyon
  2012-08-31 14:25       ` Christophe Lyon
  0 siblings, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-31 13:23 UTC (permalink / raw)
  To: Richard Earnshaw, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 904 bytes --]

On 24 August 2012 10:54, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> On 24 August 2012 10:40, Richard Earnshaw <rearnsha@arm.com> wrote:
>>
>> Has this been tested for big-endian?
>>
>> R.
>
> No. I'll take a look at it and let you know.
>
> Christophe.

Here is an updated patch, which now does no optimization in the
big-endian case. Given the current status of big-endian + neon
support, I guess it is not a serious problem.

I have also added runtime tests.

I will later post an additional patch (on top of this one), which
enhances the tests so that they can be run in big-endian mode too.

Christophe.

2012-08-31  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm.c (arm_evpc_neon_vext): New
	function.
	(arm_expand_vec_perm_const_1): Add call to
	arm_evpc_neon_vext.


	gcc/testsuite/
	* gcc.target/arm/neon-vext.c
	gcc.target/arm/neon-vext-execute.c:
	New tests.

[-- Attachment #2: gcc-vec-permute-vext.changelog --]
[-- Type: application/octet-stream, Size: 284 bytes --]

2012-08-31  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm.c (arm_evpc_neon_vext): New
	function.
	(arm_expand_vec_perm_const_1): Add call to
	arm_evpc_neon_vext.


	gcc/testsuite/
	* gcc.target/arm/neon-vext.c
	gcc.target/arm/neon-vext-execute.c:
	New tests.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-31 13:23     ` Christophe Lyon
@ 2012-08-31 14:25       ` Christophe Lyon
  2012-08-31 15:59         ` Richard Henderson
  0 siblings, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-08-31 14:25 UTC (permalink / raw)
  To: Richard Earnshaw, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1142 bytes --]

This time with the actual patch attached.



On 31 August 2012 15:23, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> On 24 August 2012 10:54, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>> On 24 August 2012 10:40, Richard Earnshaw <rearnsha@arm.com> wrote:
>>>
>>> Has this been tested for big-endian?
>>>
>>> R.
>>
>> No. I'll take a look at it and let you know.
>>
>> Christophe.
>
> Here is an updated patch, which now does no optimization in the
> big-endian case. Given the current status of big-endian + neon
> support, I guess it is not a serious problem.
>
> I have also added runtime tests.
>
> I will later post an additional patch (on top of this one), which
> enhances the tests so that they can be run in big-endian mode too.
>
> Christophe.
>
> 2012-08-31  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         gcc/
>         * config/arm/arm.c (arm_evpc_neon_vext): New
>         function.
>         (arm_expand_vec_perm_const_1): Add call to
>         arm_evpc_neon_vext.
>
>
>         gcc/testsuite/
>         * gcc.target/arm/neon-vext.c
>         gcc.target/arm/neon-vext-execute.c:
>         New tests.

[-- Attachment #2: gcc-vec-permute-vext.patch --]
[-- Type: application/octet-stream, Size: 15562 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 190590)
+++ gcc/config/arm/arm.c	(working copy)
@@ -25940,6 +25940,72 @@ arm_evpc_neon_vtrn (struct expand_vec_pe
   return true;
 }
 
+/* Recognize patterns for the VEXT insns.  */
+
+static bool
+arm_evpc_neon_vext (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+  rtx offset;
+
+  unsigned int location;
+
+  unsigned int next  = d->perm[0] + 1;
+
+  /* TODO: Handle GCC's numbering of elements for big-endian.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  /* Check if the extracted indexes are increasing by one.  */
+  for (i = 1; i < nelt; next++, i++)
+    {
+      /* If we hit the most significant element of the 2nd vector in
+	 the previous iteration, no need to test further.  */
+      if (next == 2 * nelt)
+	return false;
+
+      /* If we are operating on only one vector: it could be a
+	 rotation.  If there are only two elements of size < 64, let
+	 arm_evpc_neon_vrev catch it.  */
+      if (d->one_vector_p && (next == nelt))
+	{
+	  if ((nelt == 2) && (d->vmode != V2DImode))
+	    return false;
+	  else
+	    next = 0;
+	}
+
+      if (d->perm[i] != next)
+	return false;
+    }
+
+  location = d->perm[0];
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vextv16qi; break;
+    case V8QImode: gen = gen_neon_vextv8qi; break;
+    case V4HImode: gen = gen_neon_vextv4hi; break;
+    case V8HImode: gen = gen_neon_vextv8hi; break;
+    case V2SImode: gen = gen_neon_vextv2si; break;
+    case V4SImode: gen = gen_neon_vextv4si; break;
+    case V2SFmode: gen = gen_neon_vextv2sf; break;
+    case V4SFmode: gen = gen_neon_vextv4sf; break;
+    case V2DImode: gen = gen_neon_vextv2di; break;
+    default:
+      return false;
+    }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  offset = gen_rtx_CONST_INT (VOIDmode, location);
+  emit_insn (gen (d->target, d->op0, d->op1, offset));
+  return true;
+}
+
 /* The NEON VTBL instruction is a fully variable permuation that's even
    stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
    is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
@@ -25979,6 +26045,12 @@ arm_evpc_neon_vtbl (struct expand_vec_pe
 static bool
 arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
+  /* Check if the input mask matches vext before reordering the
+     operands.  */
+  if (TARGET_NEON)
+    if (arm_evpc_neon_vext (d))
+      return true;
+
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
      from the second operand, we can swap the operands.  */
Index: gcc/testsuite/gcc.target/arm/neon-vext.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
@@ -0,0 +1,115 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm_little_endian } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint8x8_t
+tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x8_t
+tst_vext_u8_rotate (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4_t __mask1 = {2, 3, 4, 5};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16_rotate (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = {2, 3, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x2_t
+tst_vext_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+/* This one is mapped into vrev64.32.  */
+uint32x2_t
+tst_vext_u32_rotate (uint32x2_t __a)
+{
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 16, 17, 18, 19};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8_rotate (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 0, 1, 2, 3};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16_rotate (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 4};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32_rotate (uint32x4_t __a)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  uint64x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64_rotate (uint64x2_t __a)
+{
+  uint64x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+/* { dg-final {scan-assembler-times "vext\.8\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.16\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.32\\t" 3} }  */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 1} }  */
+/* { dg-final {scan-assembler-times "vext\.64\\t" 2} }  */
Index: gcc/testsuite/gcc.target/arm/neon-vext-execute.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vext-execute.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vext-execute.c	(revision 0)
@@ -0,0 +1,340 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm_little_endian } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+uint8x8_t
+tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x8_t
+tst_vext_u8_rotate (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4_t __mask1 = {2, 3, 4, 5};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16_rotate (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = {2, 3, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x2_t
+tst_vext_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+/* This one is mapped into vrev64.32.  */
+uint32x2_t
+tst_vext_u32_rotate (uint32x2_t __a)
+{
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 16, 17, 18, 19};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8_rotate (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 0, 1, 2, 3};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16_rotate (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 4};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32_rotate (uint32x4_t __a)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  uint64x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64_rotate (uint64x2_t __a)
+{
+  uint64x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+int main (void)
+{
+  uint8_t arr_u8x8[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  uint8_t arr2_u8x8[] = {8, 9, 10, 11, 12, 13, 14, 15};
+  uint16_t arr_u16x4[] = {0, 1, 2, 3};
+  uint16_t arr2_u16x4[] = {4, 5, 6, 7};
+  uint32_t arr_u32x2[] = {0, 1};
+  uint32_t arr2_u32x2[] = {2, 3};
+  uint8_t arr_u8x16[] = {0, 1, 2, 3, 4, 5, 6, 7,
+			 8, 9, 10, 11, 12, 13, 14, 15};
+  uint8_t arr2_u8x16[] = {16, 17, 18, 19, 20, 21, 22, 23,
+			  24, 25, 26, 27, 28, 29, 30, 31};
+  uint16_t arr_u16x8[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  uint16_t arr2_u16x8[] = {8, 9, 10, 11, 12, 13, 14, 15};
+  uint32_t arr_u32x4[] = {0, 1, 2, 3};
+  uint32_t arr2_u32x4[] = {4, 5, 6, 7};
+  uint64_t arr_u64x2[] = {0, 1};
+  uint64_t arr2_u64x2[] = {2, 3};
+
+  uint8_t expected_u8x8[] = {2, 3, 4, 5, 6, 7, 8, 9};
+  uint8_t expected_rot_u8x8[] = {2, 3, 4, 5, 6, 7, 0, 1};
+  uint16_t expected_u16x4[] = {2, 3, 4, 5};
+  uint16_t expected_rot_u16x4[] = {2, 3, 0, 1};
+  uint32_t expected_u32x2[] = {1, 2};
+  uint32_t expected_rot_u32x2[] = {1, 0};
+  uint8_t expected_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11,
+			      12, 13, 14, 15, 16, 17, 18, 19};
+  uint8_t expected_rot_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11,
+				  12, 13, 14, 15, 0, 1, 2, 3,};
+  uint16_t expected_u16x8[] = {2, 3, 4, 5, 6, 7, 8, 9};
+  uint16_t expected_rot_u16x8[] = {2, 3, 4, 5, 6, 7, 0, 1};
+  uint32_t expected_u32x4[] = {1, 2, 3, 4};
+  uint32_t expected_rot_u32x4[] = {1, 2, 3, 0};
+  uint64_t expected_u64x2[] = {1, 2};
+  uint64_t expected_rot_u64x2[] = {1, 0};
+
+  uint8x8_t vec_u8x8 = vld1_u8 (arr_u8x8);
+  uint8x8_t vec2_u8x8 = vld1_u8 (arr2_u8x8);
+  uint16x4_t vec_u16x4 = vld1_u16 (arr_u16x4);
+  uint16x4_t vec2_u16x4 = vld1_u16 (arr2_u16x4);
+  uint32x2_t vec_u32x2 = vld1_u32 (arr_u32x2);
+  uint32x2_t vec2_u32x2 = vld1_u32 (arr2_u32x2);
+  uint8x16_t vec_u8x16 = vld1q_u8 (arr_u8x16);
+  uint8x16_t vec2_u8x16 = vld1q_u8 (arr2_u8x16);
+  uint16x8_t vec_u16x8 = vld1q_u16 (arr_u16x8);
+  uint16x8_t vec2_u16x8 = vld1q_u16 (arr2_u16x8);
+  uint32x4_t vec_u32x4 = vld1q_u32 (arr_u32x4);
+  uint32x4_t vec2_u32x4 = vld1q_u32 (arr2_u32x4);
+  uint64x2_t vec_u64x2 = vld1q_u64 (arr_u64x2);
+  uint64x2_t vec2_u64x2 = vld1q_u64 (arr2_u64x2);
+
+  uint8x8_t result_u8x8;
+  uint16x4_t result_u16x4;
+  uint32x2_t result_u32x2;
+  uint8x16_t result_u8x16;
+  uint16x8_t result_u16x8;
+  uint32x4_t result_u32x4;
+  uint64x2_t result_u64x2;
+
+  union {uint8x8_t v; uint8_t buf[8];} mem_u8x8;
+  union {uint16x4_t v; uint16_t buf[4];} mem_u16x4;
+  union {uint32x2_t v; uint32_t buf[2];} mem_u32x2;
+  union {uint8x16_t v; uint8_t buf[16];} mem_u8x16;
+  union {uint16x8_t v; uint16_t buf[8];} mem_u16x8;
+  union {uint32x4_t v; uint32_t buf[4];} mem_u32x4;
+  union {uint64x2_t v; uint64_t buf[2];} mem_u64x2;
+
+  int i;
+
+  result_u8x8 = tst_vext_u8 (vec_u8x8, vec2_u8x8);
+  vst1_u8 (mem_u8x8.buf, result_u8x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u8x8.buf[i] != expected_u8x8[i])
+	{
+	  printf ("tst_vext_u8[%d]=%d expected %d\n",
+		  i, mem_u8x8.buf[i], expected_u8x8[i]);
+	  abort ();
+	}
+
+  result_u8x8 = tst_vext_u8_rotate (vec_u8x8);
+  vst1_u8 (mem_u8x8.buf, result_u8x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u8x8.buf[i] != expected_rot_u8x8[i])
+	{
+	  printf ("tst_vext_u8_rotate[%d]=%d expected %d\n",
+		  i, mem_u8x8.buf[i], expected_rot_u8x8[i]);
+	  abort ();
+	}
+
+
+  result_u16x4 = tst_vext_u16 (vec_u16x4, vec2_u16x4);
+  vst1_u16 (mem_u16x4.buf, result_u16x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u16x4.buf[i] != expected_u16x4[i])
+	{
+	  printf ("tst_vext_u16[%d]=%d expected %d\n",
+		  i, mem_u16x4.buf[i], expected_u16x4[i]);
+	  abort ();
+	}
+
+  result_u16x4 = tst_vext_u16_rotate (vec_u16x4);
+  vst1_u16 (mem_u16x4.buf, result_u16x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u16x4.buf[i] != expected_rot_u16x4[i])
+	{
+	  printf ("tst_vext_u16_rotate[%d]=%d expected %d\n",
+		  i, mem_u16x4.buf[i], expected_rot_u16x4[i]);
+	  abort ();
+	}
+
+
+  result_u32x2 = tst_vext_u32 (vec_u32x2, vec2_u32x2);
+  vst1_u32 (mem_u32x2.buf, result_u32x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u32x2.buf[i] != expected_u32x2[i])
+	{
+	  printf ("tst_vext_u32[%d]=%d expected %d\n",
+		  i, mem_u32x2.buf[i], expected_u32x2[i]);
+	  abort ();
+	}
+
+  result_u32x2 = tst_vext_u32_rotate (vec_u32x2);
+  vst1_u32 (mem_u32x2.buf, result_u32x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u32x2.buf[i] != expected_rot_u32x2[i])
+	{
+	  printf ("tst_vext_u32_rotate[%d]=%d expected %d\n",
+		  i, mem_u32x2.buf[i], expected_rot_u32x2[i]);
+	  abort ();
+	}
+
+
+  result_u8x16 = tst_vextq_u8 (vec_u8x16, vec2_u8x16);
+  vst1q_u8 (mem_u8x16.buf, result_u8x16);
+
+  for (i=0; i<16; i++)
+      if (mem_u8x16.buf[i] != expected_u8x16[i])
+	{
+	  printf ("tst_vextq_u8[%d]=%d expected %d\n",
+		  i, mem_u8x16.buf[i], expected_u8x16[i]);
+	  abort ();
+	}
+
+  result_u8x16 = tst_vextq_u8_rotate (vec_u8x16);
+  vst1q_u8 (mem_u8x16.buf, result_u8x16);
+
+  for (i=0; i<16; i++)
+      if (mem_u8x16.buf[i] != expected_rot_u8x16[i])
+	{
+	  printf ("tst_vextq_u8_rotate[%d]=%d expected %d\n",
+		  i, mem_u8x16.buf[i], expected_rot_u8x16[i]);
+	  abort ();
+	}
+
+  result_u16x8 = tst_vextq_u16 (vec_u16x8, vec2_u16x8);
+  vst1q_u16 (mem_u16x8.buf, result_u16x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u16x8.buf[i] != expected_u16x8[i])
+	{
+	  printf ("tst_vextq_u16[%d]=%d expected %d\n",
+		  i, mem_u16x8.buf[i], expected_u16x8[i]);
+	  abort ();
+	}
+
+  result_u16x8 = tst_vextq_u16_rotate (vec_u16x8);
+  vst1q_u16 (mem_u16x8.buf, result_u16x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u16x8.buf[i] != expected_rot_u16x8[i])
+	{
+	  printf ("tst_vextq_u16_rotate[%d]=%d expected %d\n",
+		  i, mem_u16x8.buf[i], expected_rot_u16x8[i]);
+	  abort ();
+	}
+
+  result_u32x4 = tst_vextq_u32 (vec_u32x4, vec2_u32x4);
+  vst1q_u32 (mem_u32x4.buf, result_u32x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u32x4.buf[i] != expected_u32x4[i])
+	{
+	  printf ("tst_vextq_u32[%d]=%d expected %d\n",
+		  i, mem_u32x4.buf[i], expected_u32x4[i]);
+	  abort ();
+	}
+
+  result_u32x4 = tst_vextq_u32_rotate (vec_u32x4);
+  vst1q_u32 (mem_u32x4.buf, result_u32x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u32x4.buf[i] != expected_rot_u32x4[i])
+	{
+	  printf ("tst_vextq_u32_rotate[%d]=%d expected %d\n",
+		  i, mem_u32x4.buf[i], expected_rot_u32x4[i]);
+	  abort ();
+	}
+
+  result_u64x2 = tst_vextq_u64 (vec_u64x2, vec2_u64x2);
+  vst1q_u64 (mem_u64x2.buf, result_u64x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u64x2.buf[i] != expected_u64x2[i])
+	{
+	  printf ("tst_vextq_u64[%d]=%lld expected %lld\n",
+		  i, mem_u64x2.buf[i], expected_u64x2[i]);
+	  abort ();
+	}
+
+  result_u64x2 = tst_vextq_u64_rotate (vec_u64x2);
+  vst1q_u64 (mem_u64x2.buf, result_u64x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u64x2.buf[i] != expected_rot_u64x2[i])
+	{
+	  printf ("tst_vextq_u64_rotate[%d]=%lld expected %lld\n",
+		  i, mem_u64x2.buf[i], expected_rot_u64x2[i]);
+	  abort ();
+	}
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-31 14:25       ` Christophe Lyon
@ 2012-08-31 15:59         ` Richard Henderson
  2012-09-03  9:00           ` Christophe Lyon
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2012-08-31 15:59 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Richard Earnshaw, gcc-patches

On 2012-08-31 07:25, Christophe Lyon wrote:
> +  offset = gen_rtx_CONST_INT (VOIDmode, location);

Never call gen_rtx_CONST_INT directly.  Use GEN_INT.
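
I.e. (GEN_INT is the standard shorthand for exactly this construction):

  offset = GEN_INT (location);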


r~

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-08-31 15:59         ` Richard Henderson
@ 2012-09-03  9:00           ` Christophe Lyon
  2012-09-03  9:59             ` Ramana Radhakrishnan
  0 siblings, 1 reply; 12+ messages in thread
From: Christophe Lyon @ 2012-09-03  9:00 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Richard Earnshaw, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 692 bytes --]

On 31 August 2012 17:59, Richard Henderson <rth@redhat.com> wrote:
> On 2012-08-31 07:25, Christophe Lyon wrote:
>> +  offset = gen_rtx_CONST_INT (VOIDmode, location);
>
> Never call gen_rtx_CONST_INT directly.  Use GEN_INT.
>
>
Here is an updated patch with that small change.
For the record, there are quite a few existing calls to
gen_rtx_CONST_INT, maybe a cleanup pass is needed?

Thanks,

Christophe.

2012-09-03  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm.c (arm_evpc_neon_vext): New
	function.
	(arm_expand_vec_perm_const_1): Add call to
	arm_evpc_neon_vext.


	gcc/testsuite/
	* gcc.target/arm/neon-vext.c
	gcc.target/arm/neon-vext-execute.c:
	New tests.

[-- Attachment #2: gcc-vec-permute-vext.patch --]
[-- Type: application/octet-stream, Size: 15542 bytes --]

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 190590)
+++ gcc/config/arm/arm.c	(working copy)
@@ -25940,6 +25940,72 @@ arm_evpc_neon_vtrn (struct expand_vec_pe
   return true;
 }
 
+/* Recognize patterns for the VEXT insns.  */
+
+static bool
+arm_evpc_neon_vext (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+  rtx offset;
+
+  unsigned int location;
+
+  unsigned int next  = d->perm[0] + 1;
+
+  /* TODO: Handle GCC's numbering of elements for big-endian.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  /* Check if the extracted indexes are increasing by one.  */
+  for (i = 1; i < nelt; next++, i++)
+    {
+      /* If we hit the most significant element of the 2nd vector in
+	 the previous iteration, no need to test further.  */
+      if (next == 2 * nelt)
+	return false;
+
+      /* If we are operating on only one vector: it could be a
+	 rotation.  If there are only two elements of size < 64, let
+	 arm_evpc_neon_vrev catch it.  */
+      if (d->one_vector_p && (next == nelt))
+	{
+	  if ((nelt == 2) && (d->vmode != V2DImode))
+	    return false;
+	  else
+	    next = 0;
+	}
+
+      if (d->perm[i] != next)
+	return false;
+    }
+
+  location = d->perm[0];
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vextv16qi; break;
+    case V8QImode: gen = gen_neon_vextv8qi; break;
+    case V4HImode: gen = gen_neon_vextv4hi; break;
+    case V8HImode: gen = gen_neon_vextv8hi; break;
+    case V2SImode: gen = gen_neon_vextv2si; break;
+    case V4SImode: gen = gen_neon_vextv4si; break;
+    case V2SFmode: gen = gen_neon_vextv2sf; break;
+    case V4SFmode: gen = gen_neon_vextv4sf; break;
+    case V2DImode: gen = gen_neon_vextv2di; break;
+    default:
+      return false;
+    }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  offset = GEN_INT (location);
+  emit_insn (gen (d->target, d->op0, d->op1, offset));
+  return true;
+}
+
 /* The NEON VTBL instruction is a fully variable permuation that's even
    stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
    is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
@@ -25979,6 +26045,12 @@ arm_evpc_neon_vtbl (struct expand_vec_pe
 static bool
 arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
+  /* Check if the input mask matches vext before reordering the
+     operands.  */
+  if (TARGET_NEON)
+    if (arm_evpc_neon_vext (d))
+      return true;
+
   /* The pattern matching functions above are written to look for a small
      number to begin the sequence (0, 1, N/2).  If we begin with an index
      from the second operand, we can swap the operands.  */
Index: gcc/testsuite/gcc.target/arm/neon-vext.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vext.c	(revision 0)
@@ -0,0 +1,115 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm_little_endian } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint8x8_t
+tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x8_t
+tst_vext_u8_rotate (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4_t __mask1 = {2, 3, 4, 5};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16_rotate (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = {2, 3, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x2_t
+tst_vext_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+/* This one is mapped into vrev64.32.  */
+uint32x2_t
+tst_vext_u32_rotate (uint32x2_t __a)
+{
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 16, 17, 18, 19};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8_rotate (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 0, 1, 2, 3};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16_rotate (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 4};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32_rotate (uint32x4_t __a)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  uint64x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64_rotate (uint64x2_t __a)
+{
+  uint64x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+/* { dg-final {scan-assembler-times "vext\.8\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.16\\t" 4} }  */
+/* { dg-final {scan-assembler-times "vext\.32\\t" 3} }  */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 1} }  */
+/* { dg-final {scan-assembler-times "vext\.64\\t" 2} }  */
Index: gcc/testsuite/gcc.target/arm/neon-vext-execute.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vext-execute.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vext-execute.c	(revision 0)
@@ -0,0 +1,340 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm_little_endian } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+uint8x8_t
+tst_vext_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x8_t
+tst_vext_u8_rotate (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  uint16x4_t __mask1 = {2, 3, 4, 5};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x4_t
+tst_vext_u16_rotate (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = {2, 3, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x2_t
+tst_vext_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+/* This one is mapped into vrev64.32.  */
+uint32x2_t
+tst_vext_u32_rotate (uint32x2_t __a)
+{
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 16, 17, 18, 19};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint8x16_t
+tst_vextq_u8_rotate (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11,
+			12, 13, 14, 15, 0, 1, 2, 3};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint16x8_t
+tst_vextq_u16_rotate (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 4};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint32x4_t
+tst_vextq_u32_rotate (uint32x4_t __a)
+{
+  uint32x4_t __mask1 = {1, 2, 3, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  uint64x2_t __mask1 = {1, 2};
+  return __builtin_shuffle ( __a, __b, __mask1) ;
+}
+
+uint64x2_t
+tst_vextq_u64_rotate (uint64x2_t __a)
+{
+  uint64x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+int main (void)
+{
+  uint8_t arr_u8x8[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  uint8_t arr2_u8x8[] = {8, 9, 10, 11, 12, 13, 14, 15};
+  uint16_t arr_u16x4[] = {0, 1, 2, 3};
+  uint16_t arr2_u16x4[] = {4, 5, 6, 7};
+  uint32_t arr_u32x2[] = {0, 1};
+  uint32_t arr2_u32x2[] = {2, 3};
+  uint8_t arr_u8x16[] = {0, 1, 2, 3, 4, 5, 6, 7,
+			 8, 9, 10, 11, 12, 13, 14, 15};
+  uint8_t arr2_u8x16[] = {16, 17, 18, 19, 20, 21, 22, 23,
+			  24, 25, 26, 27, 28, 29, 30, 31};
+  uint16_t arr_u16x8[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  uint16_t arr2_u16x8[] = {8, 9, 10, 11, 12, 13, 14, 15};
+  uint32_t arr_u32x4[] = {0, 1, 2, 3};
+  uint32_t arr2_u32x4[] = {4, 5, 6, 7};
+  uint64_t arr_u64x2[] = {0, 1};
+  uint64_t arr2_u64x2[] = {2, 3};
+
+  uint8_t expected_u8x8[] = {2, 3, 4, 5, 6, 7, 8, 9};
+  uint8_t expected_rot_u8x8[] = {2, 3, 4, 5, 6, 7, 0, 1};
+  uint16_t expected_u16x4[] = {2, 3, 4, 5};
+  uint16_t expected_rot_u16x4[] = {2, 3, 0, 1};
+  uint32_t expected_u32x2[] = {1, 2};
+  uint32_t expected_rot_u32x2[] = {1, 0};
+  uint8_t expected_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11,
+			      12, 13, 14, 15, 16, 17, 18, 19};
+  uint8_t expected_rot_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11,
+				  12, 13, 14, 15, 0, 1, 2, 3,};
+  uint16_t expected_u16x8[] = {2, 3, 4, 5, 6, 7, 8, 9};
+  uint16_t expected_rot_u16x8[] = {2, 3, 4, 5, 6, 7, 0, 1};
+  uint32_t expected_u32x4[] = {1, 2, 3, 4};
+  uint32_t expected_rot_u32x4[] = {1, 2, 3, 0};
+  uint64_t expected_u64x2[] = {1, 2};
+  uint64_t expected_rot_u64x2[] = {1, 0};
+
+  uint8x8_t vec_u8x8 = vld1_u8 (arr_u8x8);
+  uint8x8_t vec2_u8x8 = vld1_u8 (arr2_u8x8);
+  uint16x4_t vec_u16x4 = vld1_u16 (arr_u16x4);
+  uint16x4_t vec2_u16x4 = vld1_u16 (arr2_u16x4);
+  uint32x2_t vec_u32x2 = vld1_u32 (arr_u32x2);
+  uint32x2_t vec2_u32x2 = vld1_u32 (arr2_u32x2);
+  uint8x16_t vec_u8x16 = vld1q_u8 (arr_u8x16);
+  uint8x16_t vec2_u8x16 = vld1q_u8 (arr2_u8x16);
+  uint16x8_t vec_u16x8 = vld1q_u16 (arr_u16x8);
+  uint16x8_t vec2_u16x8 = vld1q_u16 (arr2_u16x8);
+  uint32x4_t vec_u32x4 = vld1q_u32 (arr_u32x4);
+  uint32x4_t vec2_u32x4 = vld1q_u32 (arr2_u32x4);
+  uint64x2_t vec_u64x2 = vld1q_u64 (arr_u64x2);
+  uint64x2_t vec2_u64x2 = vld1q_u64 (arr2_u64x2);
+
+  uint8x8_t result_u8x8;
+  uint16x4_t result_u16x4;
+  uint32x2_t result_u32x2;
+  uint8x16_t result_u8x16;
+  uint16x8_t result_u16x8;
+  uint32x4_t result_u32x4;
+  uint64x2_t result_u64x2;
+
+  union {uint8x8_t v; uint8_t buf[8];} mem_u8x8;
+  union {uint16x4_t v; uint16_t buf[4];} mem_u16x4;
+  union {uint32x2_t v; uint32_t buf[2];} mem_u32x2;
+  union {uint8x16_t v; uint8_t buf[16];} mem_u8x16;
+  union {uint16x8_t v; uint16_t buf[8];} mem_u16x8;
+  union {uint32x4_t v; uint32_t buf[4];} mem_u32x4;
+  union {uint64x2_t v; uint64_t buf[2];} mem_u64x2;
+
+  int i;
+
+  result_u8x8 = tst_vext_u8 (vec_u8x8, vec2_u8x8);
+  vst1_u8 (mem_u8x8.buf, result_u8x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u8x8.buf[i] != expected_u8x8[i])
+	{
+	  printf ("tst_vext_u8[%d]=%d expected %d\n",
+		  i, mem_u8x8.buf[i], expected_u8x8[i]);
+	  abort ();
+	}
+
+  result_u8x8 = tst_vext_u8_rotate (vec_u8x8);
+  vst1_u8 (mem_u8x8.buf, result_u8x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u8x8.buf[i] != expected_rot_u8x8[i])
+	{
+	  printf ("tst_vext_u8_rotate[%d]=%d expected %d\n",
+		  i, mem_u8x8.buf[i], expected_rot_u8x8[i]);
+	  abort ();
+	}
+
+
+  result_u16x4 = tst_vext_u16 (vec_u16x4, vec2_u16x4);
+  vst1_u16 (mem_u16x4.buf, result_u16x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u16x4.buf[i] != expected_u16x4[i])
+	{
+	  printf ("tst_vext_u16[%d]=%d expected %d\n",
+		  i, mem_u16x4.buf[i], expected_u16x4[i]);
+	  abort ();
+	}
+
+  result_u16x4 = tst_vext_u16_rotate (vec_u16x4);
+  vst1_u16 (mem_u16x4.buf, result_u16x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u16x4.buf[i] != expected_rot_u16x4[i])
+	{
+	  printf ("tst_vext_u16_rotate[%d]=%d expected %d\n",
+		  i, mem_u16x4.buf[i], expected_rot_u16x4[i]);
+	  abort ();
+	}
+
+
+  result_u32x2 = tst_vext_u32 (vec_u32x2, vec2_u32x2);
+  vst1_u32 (mem_u32x2.buf, result_u32x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u32x2.buf[i] != expected_u32x2[i])
+	{
+	  printf ("tst_vext_u32[%d]=%d expected %d\n",
+		  i, mem_u32x2.buf[i], expected_u32x2[i]);
+	  abort ();
+	}
+
+  result_u32x2 = tst_vext_u32_rotate (vec_u32x2);
+  vst1_u32 (mem_u32x2.buf, result_u32x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u32x2.buf[i] != expected_rot_u32x2[i])
+	{
+	  printf ("tst_vext_u32_rotate[%d]=%d expected %d\n",
+		  i, mem_u32x2.buf[i], expected_rot_u32x2[i]);
+	  abort ();
+	}
+
+
+  result_u8x16 = tst_vextq_u8 (vec_u8x16, vec2_u8x16);
+  vst1q_u8 (mem_u8x16.buf, result_u8x16);
+
+  for (i=0; i<16; i++)
+      if (mem_u8x16.buf[i] != expected_u8x16[i])
+	{
+	  printf ("tst_vextq_u8[%d]=%d expected %d\n",
+		  i, mem_u8x16.buf[i], expected_u8x16[i]);
+	  abort ();
+	}
+
+  result_u8x16 = tst_vextq_u8_rotate (vec_u8x16);
+  vst1q_u8 (mem_u8x16.buf, result_u8x16);
+
+  for (i=0; i<16; i++)
+      if (mem_u8x16.buf[i] != expected_rot_u8x16[i])
+	{
+	  printf ("tst_vextq_u8_rotate[%d]=%d expected %d\n",
+		  i, mem_u8x16.buf[i], expected_rot_u8x16[i]);
+	  abort ();
+	}
+
+  result_u16x8 = tst_vextq_u16 (vec_u16x8, vec2_u16x8);
+  vst1q_u16 (mem_u16x8.buf, result_u16x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u16x8.buf[i] != expected_u16x8[i])
+	{
+	  printf ("tst_vextq_u16[%d]=%d expected %d\n",
+		  i, mem_u16x8.buf[i], expected_u16x8[i]);
+	  abort ();
+	}
+
+  result_u16x8 = tst_vextq_u16_rotate (vec_u16x8);
+  vst1q_u16 (mem_u16x8.buf, result_u16x8);
+
+  for (i=0; i<8; i++)
+      if (mem_u16x8.buf[i] != expected_rot_u16x8[i])
+	{
+	  printf ("tst_vextq_u16_rotate[%d]=%d expected %d\n",
+		  i, mem_u16x8.buf[i], expected_rot_u16x8[i]);
+	  abort ();
+	}
+
+  result_u32x4 = tst_vextq_u32 (vec_u32x4, vec2_u32x4);
+  vst1q_u32 (mem_u32x4.buf, result_u32x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u32x4.buf[i] != expected_u32x4[i])
+	{
+	  printf ("tst_vextq_u32[%d]=%d expected %d\n",
+		  i, mem_u32x4.buf[i], expected_u32x4[i]);
+	  abort ();
+	}
+
+  result_u32x4 = tst_vextq_u32_rotate (vec_u32x4);
+  vst1q_u32 (mem_u32x4.buf, result_u32x4);
+
+  for (i=0; i<4; i++)
+      if (mem_u32x4.buf[i] != expected_rot_u32x4[i])
+	{
+	  printf ("tst_vextq_u32_rotate[%d]=%d expected %d\n",
+		  i, mem_u32x4.buf[i], expected_rot_u32x4[i]);
+	  abort ();
+	}
+
+  result_u64x2 = tst_vextq_u64 (vec_u64x2, vec2_u64x2);
+  vst1q_u64 (mem_u64x2.buf, result_u64x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u64x2.buf[i] != expected_u64x2[i])
+	{
+	  printf ("tst_vextq_u64[%d]=%lld expected %lld\n",
+		  i, mem_u64x2.buf[i], expected_u64x2[i]);
+	  abort ();
+	}
+
+  result_u64x2 = tst_vextq_u64_rotate (vec_u64x2);
+  vst1q_u64 (mem_u64x2.buf, result_u64x2);
+
+  for (i=0; i<2; i++)
+      if (mem_u64x2.buf[i] != expected_rot_u64x2[i])
+	{
+	  printf ("tst_vextq_u64_rotate[%d]=%lld expected %lld\n",
+		  i, mem_u64x2.buf[i], expected_rot_u64x2[i]);
+	  abort ();
+	}
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH, ARM] Constant vector permute for the Neon vext insn
  2012-09-03  9:00           ` Christophe Lyon
@ 2012-09-03  9:59             ` Ramana Radhakrishnan
  0 siblings, 0 replies; 12+ messages in thread
From: Ramana Radhakrishnan @ 2012-09-03  9:59 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Richard Henderson, Richard Earnshaw, gcc-patches

On 09/03/12 09:59, Christophe Lyon wrote:
> On 31 August 2012 17:59, Richard Henderson <rth@redhat.com> wrote:
>> On 2012-08-31 07:25, Christophe Lyon wrote:
>>> +  offset = gen_rtx_CONST_INT (VOIDmode, location);
>>
>> Never call gen_rtx_CONST_INT directly.  Use GEN_INT.
>>
>>
> Here is an updated patch with that small change.
> For the record, there are quite a few existing calls to
> gen_rtx_CONST_INT, maybe a cleanup pass is needed?

A set of cleanup patches are welcome.

This looks OK - thanks.

Ramana


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2012-09-03  9:59 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-24  7:45 [PATCH, ARM] Constant vector permute for the Neon vext insn Christophe Lyon
2012-08-24  8:40 ` Richard Earnshaw
2012-08-24  8:54   ` Christophe Lyon
2012-08-31 13:23     ` Christophe Lyon
2012-08-31 14:25       ` Christophe Lyon
2012-08-31 15:59         ` Richard Henderson
2012-09-03  9:00           ` Christophe Lyon
2012-09-03  9:59             ` Ramana Radhakrishnan
2012-08-27 15:03   ` Christophe Lyon
2012-08-27 19:28     ` Janis Johnson
2012-08-28 14:20       ` Christophe Lyon
2012-08-28 15:19         ` Christophe Lyon
